refactor: Clean up tools.datasets
- `Application.generate_typing` is now mostly populated by `DataPackage` methods (a sketch of that surface follows the diff)
- Docs are defined alongside expressions
- Factored out repetitive code into `spell_literal_alias` (sketched just below)
- `Metadata` examples table is now generated inside the doc (see the rendering sketch after the diff)
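The helper named above is not visible in this diff (it lands in `tools/schemapi/utils.py`, one of the other changed files). A minimal sketch of what it plausibly does, assuming it simply wraps the `utils.spell_literal` that the deleted lines below already use; the signature is reconstructed, not copied from the real module:

```python
# Hypothetical sketch -- the real helper lives in tools/schemapi/utils.py.
from collections.abc import Iterable


def spell_literal(it: Iterable[str], /) -> str:
    """Render an iterable of strings as a ``Literal[...]`` annotation."""
    return f"Literal[{', '.join(repr(s) for s in it)}]"


def spell_literal_alias(alias_name: str, it: Iterable[str], /) -> str:
    """Render a full ``TypeAlias`` statement.

    Collapses the ``f"{NAME}: TypeAlias = {utils.spell_literal(...)}"`` pattern
    repeated throughout ``generate_typing`` into a single call.
    """
    return f"{alias_name}: TypeAlias = {spell_literal(it)}"
```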
dangotbanned committed Jan 18, 2025
1 parent ad4c747 commit 63f4be0
Showing 4 changed files with 275 additions and 244 deletions.
179 changes: 29 additions & 150 deletions tools/datasets/__init__.py
@@ -22,28 +22,28 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal
 
-import polars as pl
-from polars import col
-
 from tools.codemod import ruff
 from tools.datasets.npm import Npm
+from tools.fs import REPO_ROOT
 from tools.schemapi import utils
 
 if TYPE_CHECKING:
     import sys
     from collections.abc import Mapping
 
+    import polars as pl
+
+    from tools.datasets import datapackage
+
     if sys.version_info >= (3, 10):
         from typing import TypeAlias
     else:
         from typing_extensions import TypeAlias
 
     _PathAlias: TypeAlias = Literal[
-        "typing",
-        "metadata-csv",
-        "metadata",
-        "schemas",
+        "typing", "metadata-csv", "metadata", "schemas", "datapackage"
     ]
+    PathMap: TypeAlias = Mapping[_PathAlias, Path]
 
 __all__ = ["app"]
@@ -63,34 +63,27 @@ class Application:
         Directories to store ``.parquet`` metadata files.
     out_fp_typing
         Path to write metadata-derived typing module.
-    kwds_npm
-        Arguments passed to corresponding constructor.
 
     See Also
     --------
     - tools.datasets.npm.Npm
     """
 
     def __init__(
-        self,
-        out_dir_tools: Path,
-        out_dir_altair: Path,
-        out_fp_typing: Path,
-        *,
-        kwds_npm: Mapping[str, Any] | None = None,
+        self, out_dir_tools: Path, out_dir_altair: Path, out_fp_typing: Path
     ) -> None:
         out_dir_tools.mkdir(exist_ok=True)
-        kwds_npm = kwds_npm or {}
-        self._npm: Npm = Npm(out_dir_tools, **kwds_npm)
         METADATA = "metadata"
         self.paths = types.MappingProxyType["_PathAlias", Path](
             {
                 "typing": out_fp_typing,
                 "metadata-csv": out_dir_altair / f"{METADATA}.csv.gz",
                 "metadata": out_dir_altair / f"{METADATA}.parquet",
                 "schemas": out_dir_altair / "schemas.json.gz",
+                "datapackage": out_dir_tools / "datapackage.json",
             }
         )
+        self._npm: Npm = Npm(self.paths)
 
     @property
     def npm(self) -> Npm:
@@ -118,20 +111,15 @@ def refresh(
         https://github.com/vega/vega-datasets/issues/654
         """
         print("Syncing datasets ...")
-        package = self.npm.datapackage(tag=tag, frozen=frozen)
-        self.write_parquet(package["features"], self.paths["metadata"])
-        self.write_json_gzip(package["schemas"], self.paths["schemas"])
-        metadata_min = (
-            package["features"]
-            .lazy()
-            .filter(col("suffix") != ".arrow")
-            .sort("dataset_name")
-        )
-        self.write_csv_gzip(metadata_min, self.paths["metadata-csv"])
+        dpkg = self.npm.datapackage(tag=tag, frozen=frozen)
+        self.write_parquet(dpkg.core, self.paths["metadata"])
+        self.write_json_gzip(dpkg.schemas(), self.paths["schemas"])
+        self.write_csv_gzip(dpkg.metadata_csv(), self.paths["metadata-csv"])
         print("Finished updating datasets.")
 
         if include_typing:
-            self.generate_typing()
-        return package["features"]
+            self.generate_typing(dpkg)
+        return dpkg.core.collect()
 
     def reset(self) -> None:
         """Remove all metadata files."""
@@ -140,10 +128,14 @@ def reset(self) -> None:
 
     def read(self, name: _PathAlias, /) -> pl.DataFrame:
         """Read existing metadata from file."""
+        import polars as pl
+
         return pl.read_parquet(self.paths[name])
 
     def scan(self, name: _PathAlias, /) -> pl.LazyFrame:
         """Scan existing metadata from file."""
+        import polars as pl
+
         return pl.scan_parquet(self.paths[name])
 
     def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None:
@@ -190,114 +182,16 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None
         df = frame.lazy().collect()
         df.write_parquet(fp, compression="zstd", compression_level=17)
 
-    def generate_typing(self) -> None:
-        from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT
-
-        dpkg = self.scan("metadata")
-        metadata_schema = dpkg.collect_schema().to_python()
-
-        DATASET_NAME = "dataset_name"
-        names = (
-            dpkg.unique(DATASET_NAME)
-            .select(DATASET_NAME)
-            .sort(DATASET_NAME)
-            .collect()
-            .to_series()
-        )
+    def generate_typing(self, dpkg: datapackage.DataPackage) -> None:
         indent = " " * 4
         NAME = "Dataset"
         EXT = "Extension"
-        EXT_TYPES = tuple(
-            dpkg.filter(is_image=False)
-            .select(col("suffix").unique().sort())
-            .collect()
-            .to_series()
-            .to_list()
-        )
+        EXT_TYPES = dpkg.extensions()
         EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES"
         EXTENSION_TYPE_TP = (
             f"tuple[{', '.join(f'Literal[{el!r}]' for el in EXT_TYPES)}]"
         )
         EXTENSION_GUARD = "is_ext_read"
-        METADATA_TD = "Metadata"
-        DESCRIPTION_DEFAULT = "_description_"
-        NOTE_SEP = f"\n\n{indent * 2}.. note::\n{indent * 3}"
-
-        sha = (
-            f"Unique hash for the dataset.{NOTE_SEP}"
-            f"E.g. if the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}"
-            f"then this value would remain stable."
-        )
-        links = (
-            f".. _Path.stem:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem\n"
-            f".. _Path.name:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name\n"
-            f".. _Path.suffix:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix\n"
-            f".. _GeoJSON:\n{indent * 2}https://en.wikipedia.org/wiki/GeoJSON\n"
-            f".. _TopoJSON:\n{indent * 2}https://en.wikipedia.org/wiki/GeoJSON#TopoJSON\n"
-        )
-        import textwrap
-
-        # NOTE: Uses `pl.Config(fmt_str_lengths=25, tbl_cols=5, tbl_width_chars=80)`
-        examples = f"""\
-        Examples
-        --------
-        ``{METADATA_TD}`` keywords form constraints to filter a table like the below sample:
-        ```
-        shape: (73, 13)
-        ┌────────────────┬────────┬────────────────┬───┬───────────────┬───────────────┐
-        │ dataset_name   ┆ suffix ┆ file_name      ┆ … ┆ sha           ┆ url           │
-        │ ---            ┆ ---    ┆ ---            ┆   ┆ ---           ┆ ---           │
-        │ str            ┆ str    ┆ str            ┆   ┆ str           ┆ str           │
-        ╞════════════════╪════════╪════════════════╪═══╪═══════════════╪═══════════════╡
-        │ 7zip           ┆ .png   ┆ 7zip.png       ┆ … ┆ 6586d6c00887c ┆ https://cdn.j │
-        │                ┆        ┆                ┆   ┆ d48850099c17… ┆ sdelivr.net/… │
-        │ airports       ┆ .csv   ┆ airports.csv   ┆ … ┆ 608ba6d51fa70 ┆ https://cdn.j │
-        │                ┆        ┆                ┆   ┆ 584c3fa1d31e… ┆ sdelivr.net/… │
-        │ annual-precip  ┆ .json  ┆ annual-precip. ┆ … ┆ 719e73406cfc0 ┆ https://cdn.j │
-        │                ┆        ┆ json           ┆   ┆ 8f16dda65151… ┆ sdelivr.net/… │
-        │ anscombe       ┆ .json  ┆ anscombe.json  ┆ … ┆ 11ae97090b626 ┆ https://cdn.j │
-        │                ┆        ┆                ┆   ┆ 3bdf0c866115… ┆ sdelivr.net/… │
-        │ barley         ┆ .json  ┆ barley.json    ┆ … ┆ 8dc50de2509b6 ┆ https://cdn.j │
-        │                ┆        ┆                ┆   ┆ e197ce95c24c… ┆ sdelivr.net/… │
-        │ …              ┆ …      ┆ …              ┆ … ┆ …             ┆ …             │
-        │ weekly-weather ┆ .json  ┆ weekly-weather ┆ … ┆ bd42a3e2403e7 ┆ https://cdn.j │
-        │                ┆        ┆ .json          ┆   ┆ ccd6baaa89f9… ┆ sdelivr.net/… │
-        │ wheat          ┆ .json  ┆ wheat.json     ┆ … ┆ cde46b43fc82f ┆ https://cdn.j │
-        │                ┆        ┆                ┆   ┆ 4c3c2a37ddcf… ┆ sdelivr.net/… │
-        │ windvectors    ┆ .csv   ┆ windvectors.cs ┆ … ┆ ed686b0ba613a ┆ https://cdn.j │
-        │                ┆        ┆ v              ┆   ┆ bd59d09fcd94… ┆ sdelivr.net/… │
-        │ world-110m     ┆ .json  ┆ world-110m.jso ┆ … ┆ a1ce852de6f27 ┆ https://cdn.j │
-        │                ┆        ┆ n              ┆   ┆ 13c94c0c2840… ┆ sdelivr.net/… │
-        │ zipcodes       ┆ .csv   ┆ zipcodes.csv   ┆ … ┆ d3df33e12be0d ┆ https://cdn.j │
-        │                ┆        ┆                ┆   ┆ 0544c95f1bd4… ┆ sdelivr.net/… │
-        └────────────────┴────────┴────────────────┴───┴───────────────┴───────────────┘
-        ```
-        """
-
-        descriptions: dict[str, str] = {
-            "dataset_name": "Name of the dataset/`Path.stem`_.",
-            "suffix": "File extension/`Path.suffix`_.",
-            "file_name": "Equivalent to `Path.name`_.",
-            "bytes": "File size in *bytes*.",
-            "is_tabular": "Can be read as tabular data.",
-            "is_image": "Only accessible via url.",
-            "is_geo": "`GeoJSON`_ format.",
-            "is_topo": "`TopoJSON`_ format.",
-            "is_spatial": "Any geospatial format. Only natively supported by ``polars``.",
-            "is_json": "Not supported natively by ``pyarrow``.",
-            "has_schema": "Data types available for improved ``pandas`` parsing.",
-            "sha": sha,
-            "url": "Remote url used to access dataset.",
-        }
-        metadata_doc = (
-            f"\n{indent}".join(
-                f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}"
-                for param in metadata_schema
-            )
-            + f"\n\n{links}\n\n"
-            f"{textwrap.indent(textwrap.dedent(examples), indent)}"
-        )
 
         FIELD = "FlFieldStr"
         FIELD_TYPES = (
@@ -322,23 +216,14 @@ def generate_typing(self) -> None:
             utils.import_typing_extensions((3, 13), "TypeIs"),
             utils.import_typing_extensions((3, 10), "TypeAlias"),
             "\n",
-            f"__all__ = {[NAME, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n\n"
-            f"{NAME}: TypeAlias = {utils.spell_literal(names)}",
-            f"{EXT}: TypeAlias = {utils.spell_literal(EXT_TYPES)}",
+            f"__all__ = {[NAME, EXT, dpkg._NAME_TYPED_DICT, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n",
+            utils.spell_literal_alias(NAME, dpkg.dataset_names()),
+            utils.spell_literal_alias(EXT, EXT_TYPES),
             f"{EXTENSION_SUFFIXES}: {EXTENSION_TYPE_TP} = {EXT_TYPES!r}",
             f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n"
             f"{indent}return suffix in set({EXT_TYPES!r})\n",
-            UNIVERSAL_TYPED_DICT.format(
-                name=METADATA_TD,
-                metaclass_kwds=", total=False",
-                td_args=f"\n{indent}".join(
-                    f"{param}: {tp.__name__}" for param, tp in metadata_schema.items()
-                ),
-                summary="Full schema for ``metadata.parquet``.",
-                doc=metadata_doc,
-                comment="",
-            ),
-            f"{FIELD}: TypeAlias = {utils.spell_literal(FIELD_TYPES)}\n"
+            dpkg.typed_dict(),
+            utils.spell_literal_alias(FIELD, FIELD_TYPES),
             '"""\n'
             "String representation of `frictionless`_ `Field Types`_.\n\n"
             f".. _frictionless:\n{indent}https://github.com/frictionlessdata/frictionless-py\n"
@@ -348,15 +233,9 @@
         ruff.write_lint_format(self.paths["typing"], contents)
 
 
-_alt_datasets = Path(__file__).parent.parent.parent / "altair" / "datasets"
+_alt_datasets = REPO_ROOT / "altair" / "datasets"
 app = Application(
     Path(__file__).parent / "_metadata",
     _alt_datasets / "_metadata",
     _alt_datasets / "_typing.py",
 )
-
-
-# This is the tag in http://github.com/vega/vega-datasets from
-# which the datasets in this repository are sourced.
-_OLD_SOURCE_TAG = "v1.29.0"  # 5 years ago
-_CURRENT_SOURCE_TAG = "v2.9.0"
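For context, `Application` now delegates to the `DataPackage` class defined in `tools/datasets/datapackage.py` (one of the four changed files, not expanded on this page). A rough sketch of the parts this diff exercises, reconstructed from the inline expressions deleted above; treat every method body here as an assumption rather than the real implementation:

```python
# Hypothetical sketch -- only the surface used by ``Application`` above; the
# real class also renders the ``Metadata`` TypedDict and its docs
# (``typed_dict()``) and the JSON schemas mapping (``schemas()``).
from __future__ import annotations

import polars as pl
from polars import col


class DataPackage:
    _NAME_TYPED_DICT = "Metadata"

    def __init__(self, core: pl.LazyFrame) -> None:
        self.core = core  # full ``metadata.parquet`` table, evaluated lazily

    def metadata_csv(self) -> pl.LazyFrame:
        """Subset written to ``metadata.csv.gz``, mirroring the deleted ``metadata_min``."""
        return self.core.filter(col("suffix") != ".arrow").sort("dataset_name")

    def extensions(self) -> tuple[str, ...]:
        """Distinct, sorted suffixes, excluding image-only datasets."""
        frame = self.core.filter(~col("is_image")).select(
            col("suffix").unique().sort()
        )
        return tuple(frame.collect().to_series().to_list())

    def dataset_names(self) -> list[str]:
        """Sorted, unique names backing the generated ``Dataset`` alias."""
        frame = self.core.select(col("dataset_name").unique().sort())
        return frame.collect().to_series().to_list()
```

With that shape, `refresh` reduces to plain delegation, and `generate_typing` no longer re-derives names and suffixes from the parquet file on every run.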
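The last commit-message bullet refers to the hand-pasted table removed above (see the `# NOTE:` comment that recorded its `pl.Config` display options). Presumably `DataPackage` now regenerates that block from live data when building the doc; a sketch under that assumption, reusing the recorded options (`render_examples` is an invented name):

```python
# Hypothetical sketch: re-render the ``Metadata`` docstring examples table
# with the display options noted in the deleted ``# NOTE:`` comment.
import polars as pl


def render_examples(df: pl.DataFrame) -> str:
    """Return ``df`` formatted like the table previously pasted by hand."""
    with pl.Config(fmt_str_lengths=25, tbl_cols=5, tbl_width_chars=80):
        return repr(df)
```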