sgkit-dev · tomwhite · Sep 19, 2024 · Sep 3, 2024 · Sep 4, 2024 · Sep 16, 2024
diff --git a/.github/workflows/cubed.yml b/.github/workflows/cubed.yml
@@ -30,4 +30,4 @@ jobs:
 
     - name: Test with pytest
       run: |
-        pytest -v sgkit/tests/test_aggregation.py -k 'test_count_call_alleles or (test_count_variant_alleles and not test_count_variant_alleles__chunked[call_genotype]) or (test_variant_stats and not test_variant_stats__chunks[chunks2-False])' --use-cubed
+        pytest -v sgkit/tests/test_{aggregation,hwe}.py -k 'test_count_call_alleles or test_hwep or test_sample_stats or (test_count_variant_alleles and not test_count_variant_alleles__chunked[call_genotype]) or (test_variant_stats and not test_variant_stats__chunks[chunks2-False])' --use-cubed
diff --git a/sgkit/distarray.py b/sgkit/distarray.py
@@ -8,3 +8,9 @@
 else:
     # default to dask
     from dask.array import *  # noqa: F401, F403
+
+    # dask has no top-level astype, which the array API requires; supply one
+    def astype(x, dtype, /, *, copy=True):  # pragma: no cover
+        if not copy and dtype == x.dtype:  # no copy wanted, dtype already matches
+            return x
+        return x.astype(dtype=dtype, copy=copy)  # delegate to the array method
diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py
@@ -457,9 +457,8 @@ def genotype_coords(
     G = da.map_blocks(_index_as_genotype, X, K, new_axis=1, chunks=chunks)
     # allow enough room for all alleles and separators
     dtype = "|S{}".format(max_chars * ploidy + ploidy - 1)
-    S = da.map_blocks(
-        genotype_as_bytes, G, False, max_chars, drop_axis=1, dtype=dtype
-    ).astype("U")
+    S = da.map_blocks(genotype_as_bytes, G, False, max_chars, drop_axis=1, dtype=dtype)
+    S = da.astype(S, "U{}".format(max_chars * ploidy + ploidy - 1))
     new_ds = create_dataset({variables.genotype_id: ("genotypes", S)})
     ds = conditional_merge_datasets(ds, new_ds, merge)
     if assign_coords:
@@ -803,22 +802,23 @@ def sample_stats(
     mixed_ploidy = ds[call_genotype].attrs.get("mixed_ploidy", False)
     if mixed_ploidy:
         raise ValueError("Mixed-ploidy dataset")
-    G = da.asarray(ds[call_genotype].data)
+    GT = da.asarray(ds[call_genotype].transpose("samples", "variants", "ploidy").data)
     H = xr.DataArray(
         da.map_blocks(
-            count_hom,
-            G.transpose(1, 0, 2),
+            lambda *args: count_hom(*args)[:, np.newaxis, :],
+            GT,
             np.zeros(3, np.uint64),
-            drop_axis=(1, 2),
-            new_axis=1,
+            drop_axis=2,
+            new_axis=2,
             dtype=np.int64,
-            chunks=(G.chunks[1], 3),
+            chunks=(GT.chunks[0], 1, 3),
         ),
-        dims=["samples", "categories"],
+        dims=["samples", "variants", "categories"],
     )
-    n_variant, _, _ = G.shape
+    H = H.sum(axis=1)
+    _, n_variant, _ = GT.shape
     n_called = H.sum(axis=-1)
-    call_rate = n_called / n_variant
+    call_rate = n_called.astype(float) / float(n_variant)
     n_hom_ref = H[:, 0]
     n_hom_alt = H[:, 1]
     n_het = H[:, 2]

diff --git a/sgkit/stats/hwe.py b/sgkit/stats/hwe.py
@@ -1,9 +1,9 @@
 from typing import Hashable, Optional
 
-import dask.array as da
 import numpy as np
 from xarray import Dataset
 
+import sgkit.distarray as da
 from sgkit import variables
 from sgkit.accelerate import numba_jit
 from sgkit.stats.aggregation import count_variant_genotypes

diff --git a/sgkit/tests/test_aggregation.py b/sgkit/tests/test_aggregation.py
@@ -857,6 +857,17 @@ def test_sample_stats__raise_on_mixed_ploidy():
         sample_stats(ds)
 
 
+@pytest.mark.parametrize("chunks", [(-1, -1, -1), (100, -1, -1), (100, 10, -1)])
+def test_sample_stats__chunks(chunks):
+    ds = simulate_genotype_call_dataset(
+        n_variant=1000, n_sample=30, missing_pct=0.01, seed=0
+    )
+    expect = sample_stats(ds, merge=False).compute()  # baseline, before rechunking
+    ds["call_genotype"] = ds["call_genotype"].chunk(chunks)  # rechunk the input
+    actual = sample_stats(ds, merge=False).compute()
+    assert actual.equals(expect)  # chunking must not change the result
+
+
 def test_infer_call_ploidy():
     ds = get_dataset(
         [