Skip to content

Commit

Permalink
refactor: Make caching **opt-out**, use $XDG_CACHE_HOME
Browse files Browse the repository at this point in the history
Caching is the more sensible default when considering a notebook environment.
The cache now also uses a standardised path (`$XDG_CACHE_HOME`), see https://specifications.freedesktop.org/basedir-spec/latest/#variables
  • Loading branch information
dangotbanned committed Jan 14, 2025
1 parent 0df79b0 commit ee0d381
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 11 deletions.
24 changes: 17 additions & 7 deletions altair/datasets/_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,12 @@ def schema_cast(self, name: _Dataset, /) -> Iterator[nw.Expr]:


class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]):
"""Optional caching of remote dataset requests."""
"""Opt-out caching of remote dataset requests."""

_ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR"
_XDG_CACHE: ClassVar[Path] = (
Path(os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache")) / "altair"
).resolve()

def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None:
self._rd: _Reader[IntoDataFrameT, IntoFrameT] = reader
Expand Down Expand Up @@ -273,9 +276,13 @@ def path(self) -> Path:
"""
Returns path to datasets cache.
By default, this can be configured using the environment variable:
Defaults to (`XDG_CACHE_HOME`_):
"ALTAIR_DATASETS_DIR"
"$XDG_CACHE_HOME/altair/"
But can be configured using the environment variable:
"$ALTAIR_DATASETS_DIR"
You can set this for the current session via:
Expand All @@ -289,18 +296,21 @@ def path(self) -> Path:
You can *later* disable caching via:
>>> load.cache.path = None
.. _XDG_CACHE_HOME:
https://specifications.freedesktop.org/basedir-spec/latest/#variables
"""
self._ensure_active()
fp = Path(os.environ[self._ENV_VAR])
fp.mkdir(exist_ok=True)
fp = Path(usr) if (usr := os.environ.get(self._ENV_VAR)) else self._XDG_CACHE
fp.mkdir(parents=True, exist_ok=True)
return fp

@path.setter
def path(self, source: StrPath | None, /) -> None:
if source is not None:
os.environ[self._ENV_VAR] = str(Path(source).resolve())
else:
os.environ.pop(self._ENV_VAR, None)
os.environ[self._ENV_VAR] = ""

def __iter__(self) -> Iterator[Path]:
yield from self.path.iterdir()
Expand All @@ -316,7 +326,7 @@ def is_active(self) -> bool:
return not self.is_not_active()

def is_not_active(self) -> bool:
return os.environ.get(self._ENV_VAR) is None
return os.environ.get(self._ENV_VAR) == ""

def is_empty(self) -> bool:
"""Cache is active, but no files are stored in ``self.path``."""
Expand Down
12 changes: 8 additions & 4 deletions altair/datasets/_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

class Loader(Generic[IntoDataFrameT, IntoFrameT]):
"""
Load examples **remotely** from `vega-datasets`_, with *optional* caching.
Load examples **remotely** from `vega-datasets`_, with caching.
A new ``Loader`` must be initialized by specifying a backend:
Expand Down Expand Up @@ -280,11 +280,11 @@ def url(
@property
def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]:
"""
Optional caching of remote dataset requests.
Caching of remote dataset requests.
Enable caching:
Configure cache path:
self.cache.path = ...
self.cache.path = "..."
Download the latest datasets *ahead-of-time*:
Expand All @@ -293,6 +293,10 @@ def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]:
Remove all downloaded datasets:
self.cache.clear()
Disable caching:
self.cache.path = None
"""
return self._reader.cache

Expand Down

0 comments on commit ee0d381

Please sign in to comment.