Skip to content

Commit

Permalink
Add more checks to sequence metadata obtained via URL
Browse files Browse the repository at this point in the history
These additional checks do some basic asserts to verify
the schema of the metadata columns used by Cladetime
and to ensure completeness/uniqueness of the strain
column (which acts, essentially, as a primary key).
  • Loading branch information
bsweger committed Dec 6, 2024
1 parent 03f4415 commit fc1d971
Showing 1 changed file with 14 additions and 6 deletions.
20 changes: 14 additions & 6 deletions tests/unit/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,13 @@ def test_get_metadata_url(s3_setup, test_file_path):
# ensure lazyframe can be collected and check its shape and columns
metadata_df = metadata.collect()
assert metadata_df.shape == (99373, 58)
assert all(
col in metadata_df.columns for col in ["strain", "date", "country", "division", "location", "clade_nextstrain"]
)
# focus on a handful of columns that are integral to cladetime
metadata_df = metadata.collect().select("strain", "date", "country", "division", "location", "clade_nextstrain")
# strain column is required and should be unique
assert metadata_df.select("strain").n_unique() == len(metadata_df)
# all columns should have a string data type
for data_type in metadata_df.schema.to_python().values():
assert data_type is str

# Get metadata file from S3 using XZ compression. Here we can use a presigned S3 URL
# because for .xz files, get_metadata uses requests to download the file in chunks
Expand All @@ -85,9 +89,13 @@ def test_get_metadata_url(s3_setup, test_file_path):
# ensure lazyframe can be collected and check its shape and columns
metadata_df = metadata.collect()
assert metadata_df.shape == (99373, 58)
assert all(
col in metadata_df.columns for col in ["strain", "date", "country", "division", "location", "clade_nextstrain"]
)
# focus on a handful of columns that are integral to cladetime
metadata_df = metadata.collect().select("strain", "date", "country", "division", "location", "clade_nextstrain")
# strain column is required and should be unique
assert metadata_df.select("strain").n_unique() == len(metadata_df)
# all columns should have a string data type
for data_type in metadata_df.schema.to_python().values():
assert data_type is str


def test_filter_metadata():
Expand Down

0 comments on commit fc1d971

Please sign in to comment.