-
Notifications
You must be signed in to change notification settings - Fork 616
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Speed up categorical regressor with numba #3353
base: main
Are you sure you want to change the base?
Changes from all commits
086f70d
37244a9
b4ecb0a
36858d9
be1bccc
a1a59ae
7b41bc8
119a142
d77fa9c
236e356
bb9cde4
bbb5035
2a92193
c7b78c0
b001c0e
c3ce03e
c50226a
c6665f4
858e247
2421bd5
2e16c45
1b7d7e1
3b7fe6e
726a625
104a0f3
f9b13be
6eafd04
1dae8f4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
* Speed up for a categorical regressor in {func}`~scanpy.pp.regress_out` {smaller}`S Dicks` | ||
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -637,6 +637,22 @@ | |||||
DT = TypeVar("DT") | ||||||
|
||||||
|
||||||
@njit | ||||||
def _create_regressor_categorical( | ||||||
X: np.ndarray, number_categories: int, cat_array: np.ndarray | ||||||
) -> np.ndarray: | ||||||
# create regressor matrix for categorical variables | ||||||
regressors = np.zeros(X.shape, dtype=X.dtype) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. check dtype for behavior with integer dtype i.e., need to ensure this is a floating point matrix |
||||||
# iterate over categories | ||||||
for category in range(number_categories): | ||||||
# iterate over genes and calculate mean expression | ||||||
# for each gene per category | ||||||
mask = category == cat_array | ||||||
for ix in numba.prange(X.T.shape[0]): | ||||||
regressors[mask, ix] = X.T[ix, mask].mean() | ||||||
return regressors | ||||||
|
||||||
|
||||||
@njit | ||||||
def get_resid( | ||||||
data: np.ndarray, | ||||||
|
@@ -732,13 +748,16 @@ | |||||
) | ||||||
raise ValueError(msg) | ||||||
logg.debug("... regressing on per-gene means within categories") | ||||||
regressors = np.zeros(X.shape, dtype="float32") | ||||||
|
||||||
# set number of categories to the same dtype as the categories | ||||||
cat_array = adata.obs[keys[0]].cat.codes.to_numpy() | ||||||
number_categories = cat_array.dtype.type(len(adata.obs[keys[0]].cat.categories)) | ||||||
|
||||||
X = _to_dense(X, order="F") if issparse(X) else X | ||||||
ilan-gold marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
# TODO figure out if we should use a numba kernel for this | ||||||
for category in adata.obs[keys[0]].cat.categories: | ||||||
mask = (category == adata.obs[keys[0]]).values | ||||||
for ix, x in enumerate(X.T): | ||||||
regressors[mask, ix] = x[mask].mean() | ||||||
if np.issubdtype(X.dtype, np.integer): | ||||||
target_dtype = np.float32 if X.dtype.itemsize == 4 else np.float64 | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the point of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, if there is a reason, just to be more clean:
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lastly, maybe I'm misremembering, but did we not discuss putting this in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should also apply to #3461 |
||||||
X = X.astype(target_dtype) | ||||||
regressors = _create_regressor_categorical(X, number_categories, cat_array) | ||||||
variable_is_categorical = True | ||||||
# regress on one or several ordinal variables | ||||||
else: | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -464,14 +464,21 @@ def test_regress_out_constants(): | |
assert_equal(adata, adata_copy) | ||
|
||
|
||
def test_regress_out_reproducible(): | ||
adata = pbmc68k_reduced() | ||
@pytest.mark.parametrize( | ||
("keys", "test_file", "atol"), | ||
[ | ||
(["n_counts", "percent_mito"], "regress_test_small.npy", 0.0), | ||
(["bulk_labels"], "regress_test_small_cat.npy", 1e-6), | ||
], | ||
) | ||
def test_regress_out_reproducible(keys, test_file, atol): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't we add a test for integer + float as a param to this test? |
||
adata = sc.datasets.pbmc68k_reduced() | ||
adata = adata.raw.to_adata()[:200, :200].copy() | ||
sc.pp.regress_out(adata, keys=["n_counts", "percent_mito"]) | ||
sc.pp.regress_out(adata, keys=keys) | ||
# This file was generated from the original implementation in version 1.10.3 | ||
# Now we compare new implementation with the old one | ||
tester = np.load(DATA_PATH / "regress_test_small.npy") | ||
np.testing.assert_allclose(adata.X, tester) | ||
flying-sheep marked this conversation as resolved.
Show resolved
Hide resolved
|
||
tester = np.load(DATA_PATH / test_file) | ||
np.testing.assert_allclose(adata.X, tester, atol=atol) | ||
|
||
|
||
def test_regress_out_constants_equivalent(): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.