From ed3b277b2f498e3cab04c9416aaddf97eec8c3e2 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Thu, 2 Mar 2023 20:03:10 +0100
Subject: [PATCH] Backport of Prepare for pandas 2.0 (#2434) onto 1.9.x (#2435)

* Prepare for pandas 2.0 (#2434)

* is_categorical -> is_categorical_dtype

* cat.replace(to_remove, np.nan) -> cat.remove_categories(to_remove)

* df1.append(df2) -> pd.concat([df1, df2])

* Series.iteritems -> Series.items

* Fix indexing a pandas object with a set in score genes

* Release notes

(cherry picked from commit 0692ef9ea30335b95f7e7f9aab7be856469d9f35)

* Fix anndata-dev test

* anndata-dev compatibility

* Prep 1.9.3

* Fix pr reference
---
 .azure-pipelines.yml                           |  2 +-
 docs/release-notes/1.9.3.md                    |  7 +++++++
 scanpy/datasets/_datasets.py                   |  2 +-
 scanpy/external/exporting.py                   | 10 +++++-----
 scanpy/plotting/_anndata.py                    |  2 +-
 scanpy/plotting/_tools/scatterplots.py         |  2 +-
 scanpy/preprocessing/_highly_variable_genes.py |  2 +-
 scanpy/tests/test_get.py                       |  1 +
 scanpy/tests/test_ingest.py                    |  5 +++--
 scanpy/tests/test_preprocessing.py             | 17 +++++++++--------
 scanpy/tools/_score_genes.py                   |  2 +-
 11 files changed, 31 insertions(+), 21 deletions(-)
 create mode 100644 docs/release-notes/1.9.3.md

diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml
index 978dfd41e2..7a8450e58e 100644
--- a/.azure-pipelines.yml
+++ b/.azure-pipelines.yml
@@ -50,7 +50,7 @@ jobs:
     displayName: 'Install dependencies'
 
   - script: |
-      'pip install -v "anndata[dev,test] @ git+https://github.com/scverse/anndata"'
+      pip install -v "anndata[dev,test] @ git+https://github.com/scverse/anndata"
     displayName: 'Install development anndata'
     condition: eq(variables['ANNDATA_DEV'], 'yes')
 
diff --git a/docs/release-notes/1.9.3.md b/docs/release-notes/1.9.3.md
new file mode 100644
index 0000000000..da4d531268
--- /dev/null
+++ b/docs/release-notes/1.9.3.md
@@ -0,0 +1,7 @@
+### 1.9.3 {small}`2023-03-02`
+
+```{rubric} Bug fixes
+```
+
+* Variety of fixes against pandas 2.0.0rc0 {pr}`2434` {smaller}`I Virshup`
+* Compatibility with anndata 0.9.0rc {pr}`2435` {smaller}`I Virshup`
diff --git a/scanpy/datasets/_datasets.py b/scanpy/datasets/_datasets.py
index 2b0bbca990..b2e8de8a07 100644
--- a/scanpy/datasets/_datasets.py
+++ b/scanpy/datasets/_datasets.py
@@ -180,7 +180,7 @@ def paul15() -> ad.AnnData:
         clusters = f['cluster.id'][()].flatten().astype(int)
         infogenes_names = f['info.genes_strings'][()].astype(str)
     # each row has to correspond to a observation, therefore transpose
-    adata = ad.AnnData(X.transpose(), dtype=X.dtype)
+    adata = ad.AnnData(X.transpose(), dtype=np.float32)
     adata.var_names = gene_names
     adata.row_names = cell_names
     # names reflecting the cell type identifications from the paper
diff --git a/scanpy/external/exporting.py b/scanpy/external/exporting.py
index dcf55e2482..fc346bb0dd 100644
--- a/scanpy/external/exporting.py
+++ b/scanpy/external/exporting.py
@@ -11,7 +11,7 @@
 import h5py
 import matplotlib.pyplot as plt
 from anndata import AnnData
-from pandas.api.types import is_categorical
+from pandas.api.types import is_categorical_dtype
 
 from ..preprocessing._utils import _get_mean_var
 from .._utils import NeighborsView
@@ -148,7 +148,7 @@ def spring_project(
     continuous_extras = {}
     if cell_groupings is None:
        for obs_name in adata.obs:
-            if is_categorical(adata.obs[obs_name]):
+            if is_categorical_dtype(adata.obs[obs_name]):
                 categorical_extras[obs_name] = [str(x) for x in adata.obs[obs_name]]
     else:
         if isinstance(cell_groupings, str):
@@ -156,7 +156,7 @@ def spring_project(
         for obs_name in cell_groupings:
             if obs_name not in adata.obs:
                 logg.warning(f'Cell grouping {obs_name!r} is not in adata.obs')
-            elif is_categorical(adata.obs[obs_name]):
+            elif is_categorical_dtype(adata.obs[obs_name]):
                 categorical_extras[obs_name] = [str(x) for x in adata.obs[obs_name]]
             else:
                 logg.warning(
@@ -164,7 +164,7 @@ def spring_project(
                 )
     if custom_color_tracks is None:
         for obs_name in adata.obs:
-            if not is_categorical(adata.obs[obs_name]):
+            if not is_categorical_dtype(adata.obs[obs_name]):
                 continuous_extras[obs_name] = np.array(adata.obs[obs_name])
     else:
         if isinstance(custom_color_tracks, str):
@@ -172,7 +172,7 @@
         for obs_name in custom_color_tracks:
             if obs_name not in adata.obs:
                 logg.warning(f'Custom color track {obs_name!r} is not in adata.obs')
-            elif not is_categorical(adata.obs[obs_name]):
+            elif not is_categorical_dtype(adata.obs[obs_name]):
                 continuous_extras[obs_name] = np.array(adata.obs[obs_name])
             else:
                 logg.warning(
diff --git a/scanpy/plotting/_anndata.py b/scanpy/plotting/_anndata.py
index c20a7125b4..07e69b4291 100755
--- a/scanpy/plotting/_anndata.py
+++ b/scanpy/plotting/_anndata.py
@@ -2411,7 +2411,7 @@ def _plot_categories_as_colorblocks(
     labels = []
     label2code = {}  # dictionary of numerical values asigned to each label
     for code, (label, value) in enumerate(
-        obs_tidy.index.value_counts(sort=False).iteritems()
+        obs_tidy.index.value_counts(sort=False).items()
     ):
         ticks.append(value_sum + (value / 2))
         labels.append(label)
diff --git a/scanpy/plotting/_tools/scatterplots.py b/scanpy/plotting/_tools/scatterplots.py
index a5ef7bab7c..573da35c79 100644
--- a/scanpy/plotting/_tools/scatterplots.py
+++ b/scanpy/plotting/_tools/scatterplots.py
@@ -1168,7 +1168,7 @@ def _get_color_source_vector(
     else:
         values = adata.obs_vector(value_to_plot, layer=layer)
     if groups and is_categorical_dtype(values):
-        values = values.replace(values.categories.difference(groups), np.nan)
+        values = values.remove_categories(values.categories.difference(groups))
     return values
 
 
diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index c083c374a8..70c16b3ff2 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -482,7 +482,7 @@ def highly_variable_genes(
         missing_hvg['highly_variable'] = missing_hvg['highly_variable'].astype(bool)
         missing_hvg['gene'] = gene_list[~filt]
         hvg['gene'] = adata_subset.var_names.values
-        hvg = hvg.append(missing_hvg, ignore_index=True)
+        hvg = pd.concat([hvg, missing_hvg], ignore_index=True)
 
         # Order as before filtering
         idxs = np.concatenate((np.where(filt)[0], np.where(~filt)[0]))
diff --git a/scanpy/tests/test_get.py b/scanpy/tests/test_get.py
index 03550b2ffd..4fa787c933 100644
--- a/scanpy/tests/test_get.py
+++ b/scanpy/tests/test_get.py
@@ -158,6 +158,7 @@ def test_repeated_gene_symbols():
     adata = sc.AnnData(
         np.arange(3 * 4).reshape((3, 4)),
         var=pd.DataFrame({"gene_symbols": gene_symbols}, index=var_names),
+        dtype=np.float32,
     )
 
     with pytest.raises(KeyError, match="symbol_b"):
diff --git a/scanpy/tests/test_ingest.py b/scanpy/tests/test_ingest.py
index 8bd9c05be2..bd512945b3 100644
--- a/scanpy/tests/test_ingest.py
+++ b/scanpy/tests/test_ingest.py
@@ -17,10 +17,11 @@
         [7.0, 9.4, 6.8, 9.1, 8.0],
         [8.9, 8.6, 9.6, 1.0, 2.0],
         [6.5, 8.9, 2.2, 4.5, 8.9],
-    ]
+    ],
+    dtype=np.float32,
 )
 
-T = np.array([[2.0, 3.5, 4.0, 1.0, 4.7], [3.2, 2.0, 5.0, 5.0, 8.0]])
+T = np.array([[2.0, 3.5, 4.0, 1.0, 4.7], [3.2, 2.0, 5.0, 5.0, 8.0]], dtype=np.float32)
 
 
 @pytest.fixture
diff --git a/scanpy/tests/test_preprocessing.py b/scanpy/tests/test_preprocessing.py
index 1dc3d28680..6a4626018a 100644
--- a/scanpy/tests/test_preprocessing.py
+++ b/scanpy/tests/test_preprocessing.py
@@ -15,11 +15,11 @@
 
 
 def test_log1p(tmp_path):
-    A = np.random.rand(200, 10)
+    A = np.random.rand(200, 10).astype(np.float32)
     A_l = np.log1p(A)
-    ad = AnnData(A)
-    ad2 = AnnData(A)
-    ad3 = AnnData(A)
+    ad = AnnData(A.copy())
+    ad2 = AnnData(A.copy())
+    ad3 = AnnData(A.copy())
     ad3.filename = tmp_path / 'test.h5ad'
     sc.pp.log1p(ad)
     assert np.allclose(ad.X, A_l)
@@ -84,18 +84,19 @@ def test_mean_var_sparse():
 
 
 def test_normalize_per_cell():
-    adata = AnnData(np.array([[1, 0], [3, 0], [5, 6]]))
+    A = np.array([[1, 0], [3, 0], [5, 6]], dtype=np.float32)
+    adata = AnnData(A.copy())
     sc.pp.normalize_per_cell(adata, counts_per_cell_after=1, key_n_counts='n_counts2')
     assert adata.X.sum(axis=1).tolist() == [1.0, 1.0, 1.0]
     # now with copy option
-    adata = AnnData(np.array([[1, 0], [3, 0], [5, 6]]))
+    adata = AnnData(A.copy())
     # note that sc.pp.normalize_per_cell is also used in
     # pl.highest_expr_genes with parameter counts_per_cell_after=100
     adata_copy = sc.pp.normalize_per_cell(adata, counts_per_cell_after=1, copy=True)
     assert adata_copy.X.sum(axis=1).tolist() == [1.0, 1.0, 1.0]
     # now sparse
-    adata = AnnData(np.array([[1, 0], [3, 0], [5, 6]]))
-    adata_sparse = AnnData(sp.csr_matrix([[1, 0], [3, 0], [5, 6]]))
+    adata = AnnData(A.copy())
+    adata_sparse = AnnData(sp.csr_matrix(A.copy()))
     sc.pp.normalize_per_cell(adata)
     sc.pp.normalize_per_cell(adata_sparse)
     assert adata.X.sum(axis=1).tolist() == adata_sparse.X.sum(axis=1).A1.tolist()
diff --git a/scanpy/tools/_score_genes.py b/scanpy/tools/_score_genes.py
index 8c11148f4b..f0783f9827 100644
--- a/scanpy/tools/_score_genes.py
+++ b/scanpy/tools/_score_genes.py
@@ -148,7 +148,7 @@ def score_genes(
     control_genes = set()
 
     # now pick `ctrl_size` genes from every cut
-    for cut in np.unique(obs_cut.loc[gene_list]):
+    for cut in np.unique(obs_cut.loc[list(gene_list)]):
         r_genes = np.array(obs_cut[obs_cut == cut].index)
         np.random.shuffle(r_genes)
         # uses full r_genes if ctrl_size > len(r_genes)
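Not part of the patch: a minimal sketch of the pandas 2.0 API moves this backport applies (`is_categorical` -> `is_categorical_dtype`, `Series.iteritems` -> `Series.items`, `DataFrame.append` -> `pd.concat`, and replacing unwanted categories with NaN -> `Categorical.remove_categories`). The series and frames below are made-up illustration data; only the pandas calls mirror the changes above.

```python
import pandas as pd
from pandas.api.types import is_categorical_dtype

# Hypothetical example data, not taken from the scanpy code base.
s = pd.Series(pd.Categorical(["a", "b", "c", "a"]))
df1 = pd.DataFrame({"gene": ["g1", "g2"], "highly_variable": [True, False]})
df2 = pd.DataFrame({"gene": ["g3"], "highly_variable": [False]})

# pandas.api.types.is_categorical was removed in pandas 2.0;
# is_categorical_dtype is the drop-in replacement used in the patch.
assert is_categorical_dtype(s)

# Series.iteritems() was removed; Series.items() behaves the same.
for label, count in s.value_counts(sort=False).items():
    print(label, count)

# DataFrame.append() was removed; concatenate explicitly instead.
combined = pd.concat([df1, df2], ignore_index=True)

# Setting categories to NaN via Categorical.replace no longer works;
# dropping them with remove_categories turns those values into NaN.
groups = ["a", "b"]
cat = s.values  # the underlying Categorical
cat = cat.remove_categories(cat.categories.difference(groups))
```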