Add insta-science-util download. (#14)

Also add `insta-science-util cache {prune,purge}`.
a-scie · Dec 31, 2024 · dad6806 · dad6806
1 parent abfec30
commit dad6806
Show file tree

Hide file tree

Showing 18 changed files with 601 additions and 88 deletions.
diff --git a/python/CHANGES.md b/python/CHANGES.md
@@ -1,5 +1,11 @@
 # insta-science
 
+## 0.4.0
+
+Flesh out the `insta-science-util` script, adding support for downloading `science` executables for
+offline serving in firewalled environments, as well as support for managing the `insta-science`
+cache.
+
 ## 0.3.1
 
 Fix the semi-automated release process.

diff --git a/python/insta_science/__init__.py b/python/insta_science/__init__.py
@@ -8,7 +8,9 @@
     InputError,
     Platform,
     Science,
+    ScienceExe,
     ScienceNotFound,
+    Url,
     ensure_installed,
 )
 from .version import __version__
@@ -20,7 +22,9 @@
     "InputError",
     "Platform",
     "Science",
+    "ScienceExe",
     "ScienceNotFound",
+    "Url",
     "__version__",
     "ensure_installed",
 )
diff --git a/python/insta_science/_colors.py b/python/insta_science/_colors.py
@@ -20,13 +20,19 @@ def red(self, text) -> str:
     def yellow(self, text) -> str:
         return self.color(text, fg="yellow")
 
-    def color(self, text, fg: str):
+    def green(self, text) -> str:
+        """Return `text` with a green foreground, as rendered by `self.color`."""
+        return self.color(text, fg="green")
+
+    def gray(self, text) -> str:
+        """Return `text` with a gray foreground, as rendered by `self.color`."""
+        return self.color(text, fg="gray")
+
+    def color(self, text, fg: str | None = None, style: str | None = None):
         if not self.use_color:
             return text
 
         import colors
 
-        return colors.color(text, fg=fg)
+        return colors.color(text, fg=fg, style=style)
 
 
 @contextmanager

diff --git a/python/insta_science/_internal/__init__.py b/python/insta_science/_internal/__init__.py
@@ -3,7 +3,7 @@
 
 from .errors import InputError, ScienceNotFound
 from .hashing import Fingerprint
-from .model import Digest, Science
+from .model import Digest, Science, ScienceExe, Url
 from .platform import CURRENT_PLATFORM, Platform
 from .science import ensure_installed
 
@@ -14,6 +14,8 @@
     "InputError",
     "Platform",
     "Science",
+    "ScienceExe",
     "ScienceNotFound",
+    "Url",
     "ensure_installed",
 )
diff --git a/python/insta_science/_internal/a_scie.py b/python/insta_science/_internal/a_scie.py
@@ -6,14 +6,18 @@
 from dataclasses import dataclass
 from datetime import timedelta
 from pathlib import PurePath
+from typing import Iterator
 
 from packaging.version import Version
 
+from .cache import DOWNLOAD_CACHE
 from .fetcher import fetch_and_verify
 from .hashing import Digest, Fingerprint
-from .model import Science, Url
+from .model import Science, ScienceExe, Url
 from .platform import CURRENT_PLATFORM, Platform
 
+_DOWNLOAD_NAMESPACE = "url-exes"
+
 
 @dataclass(frozen=True)
 class _LoadResult:
@@ -22,14 +26,13 @@ class _LoadResult:
 
 
 def _load_project_release(
-    project_name: str,
+    base_url: Url,
     binary_name: str,
     version: Version | None = None,
     fingerprint: Digest | Fingerprint | None = None,
     platform: Platform = CURRENT_PLATFORM,
 ) -> _LoadResult:
     qualified_binary_name = platform.qualified_binary_name(binary_name)
-    base_url = f"https://github.com/a-scie/{project_name}/releases"
     if version:
         version_path = f"download/v{version}"
         ttl = None
@@ -38,20 +41,26 @@ def _load_project_release(
         ttl = timedelta(days=5)
     path = fetch_and_verify(
         url=Url(f"{base_url}/{version_path}/{qualified_binary_name}"),
+        namespace=_DOWNLOAD_NAMESPACE,
         fingerprint=fingerprint,
         executable=True,
         ttl=ttl,
     )
     return _LoadResult(path=path, binary_name=qualified_binary_name)
 
 
-def science(spec: Science | None = None, platform: Platform = CURRENT_PLATFORM) -> PurePath:
-    version = spec.version if spec else None
-    fingerprint = spec.digest if spec and spec.digest else None
-    return _load_project_release(
-        project_name="lift",
-        binary_name="science-fat",
-        version=version,
-        fingerprint=fingerprint,
-        platform=platform,
-    ).path
+def science(spec: Science = Science(), platform: Platform = CURRENT_PLATFORM) -> ScienceExe:
+    """Load the `science-fat` binary described by `spec` and wrap it via `spec.exe`.
+
+    The binary is obtained with `_load_project_release` from `spec.base_url`, passing along
+    `spec.version`, `spec.digest` and the requested `platform`.
+    """
+    release = _load_project_release(
+        base_url=spec.base_url,
+        binary_name="science-fat",
+        version=spec.version,
+        fingerprint=spec.digest,
+        platform=platform,
+    )
+    return spec.exe(release.path)
+
+
+def iter_science_exes() -> Iterator[ScienceExe]:
+    """Yield a `ScienceExe` for every download cache entry in `_DOWNLOAD_NAMESPACE`."""
+    cached_paths = DOWNLOAD_CACHE.iter_entries(namespace=_DOWNLOAD_NAMESPACE)
+    yield from (ScienceExe(cached_path) for cached_path in cached_paths)
diff --git a/python/insta_science/_internal/bytes.py b/python/insta_science/_internal/bytes.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Science project contributors.
+# Licensed under the Apache License, Version 2.0 (see LICENSE).
+
+from __future__ import annotations
+
+import math
+from dataclasses import dataclass
+from enum import Enum
+from typing import Callable
+
+
+@dataclass(frozen=True)
+class Unit:
+    """A display unit: a name, a numeric multiple and an optional singular form of the name."""
+
+    _name: str
+    multiple: float
+    _singular: str | None = None
+
+    def render(self, total_bytes: int | float) -> str:
+        """Return the label to print next to the given amount.
+
+        The singular form is used only when one is defined and the amount rounds to exactly 1;
+        in every other case the regular name is returned.
+        """
+        if self._singular and round(total_bytes) == 1:
+            return self._singular
+        return self._name
+
+
+class ByteUnits(Unit, Enum):
+    """The decimal byte units, each 1000 times the size of the previous one."""
+
+    # Each value is the `Unit` field tuple: (name, multiple in bytes[, singular name]).
+    # The multiples are spelled out as exact float literals: 1000.0 ** n for n = 0..5.
+    BYTES = "bytes", 1.0, "byte"
+    KB = "kB", 1e3
+    MB = "MB", 1e6
+    GB = "GB", 1e9
+    TB = "TB", 1e12
+    PB = "PB", 1e15
+
+
+@dataclass(frozen=True)
+class ByteAmount(object):
+    """A number of bytes paired with the unit it is rendered in by `str()`."""
+
+    total_bytes: int
+    unit: Unit
+
+    @classmethod
+    def bytes(cls, total_bytes: int) -> ByteAmount:
+        """Create an amount of `total_bytes` bytes that renders in bytes."""
+        return cls(total_bytes, ByteUnits.BYTES)
+
+    @classmethod
+    def kilobytes(cls, total_bytes: int) -> ByteAmount:
+        """Create an amount of `total_bytes` bytes that renders in kB."""
+        return cls(total_bytes, ByteUnits.KB)
+
+    @classmethod
+    def megabytes(cls, total_bytes: int) -> ByteAmount:
+        """Create an amount of `total_bytes` bytes that renders in MB."""
+        return cls(total_bytes, ByteUnits.MB)
+
+    @classmethod
+    def gigabytes(cls, total_bytes: int) -> ByteAmount:
+        """Create an amount of `total_bytes` bytes that renders in GB."""
+        return cls(total_bytes, ByteUnits.GB)
+
+    @classmethod
+    def terabytes(cls, total_bytes: int) -> ByteAmount:
+        """Create an amount of `total_bytes` bytes that renders in TB."""
+        return cls(total_bytes, ByteUnits.TB)
+
+    @classmethod
+    def petabytes(cls, total_bytes: int) -> ByteAmount:
+        """Create an amount of `total_bytes` bytes that renders in PB."""
+        return cls(total_bytes, ByteUnits.PB)
+
+    @classmethod
+    def human_readable(cls, total_bytes: int) -> ByteAmount:
+        """Create an amount using the first unit in which the value is below 1000.
+
+        Amounts too large for every unit fall back to PB.
+        """
+        fitting_units = (unit for unit in ByteUnits if total_bytes < (1000 * unit.multiple))
+        return cls(total_bytes, next(fitting_units, ByteUnits.PB))
+
+    @classmethod
+    def for_unit(cls, unit: Unit) -> Callable[[int], ByteAmount]:
+        """Return the constructor that creates amounts rendered in `unit`.
+
+        Raises `ValueError` when `unit` is not one of the `ByteUnits` members.
+        """
+        constructors = (
+            (ByteUnits.BYTES, cls.bytes),
+            (ByteUnits.KB, cls.kilobytes),
+            (ByteUnits.MB, cls.megabytes),
+            (ByteUnits.GB, cls.gigabytes),
+            (ByteUnits.TB, cls.terabytes),
+            (ByteUnits.PB, cls.petabytes),
+        )
+        for known_unit, constructor in constructors:
+            # Units are matched by identity, not by equality.
+            if known_unit is unit:
+                return constructor
+        raise ValueError(
+            "The unit {unit} has no known corresponding byte amount function".format(unit=unit)
+        )
+
+    def __str__(self) -> str:
+        """Render the amount in its unit, showing roughly three significant digits.
+
+        Byte amounts and amounts of 100 or more are rounded to an integer; smaller amounts get
+        one, two or three decimal places as the integer part shrinks.
+        """
+        amount = self.total_bytes / self.unit.multiple
+        whole = math.trunc(amount)
+        label = self.unit.render(amount)
+        if self.unit is ByteUnits.BYTES or whole >= 100:
+            return "{amount} {unit}".format(amount=round(amount), unit=label)
+
+        if whole >= 10:
+            template = "{amount:.1f} {unit}"
+        elif whole >= 1:
+            template = "{amount:.2f} {unit}"
+        else:
+            template = "{amount:.3f} {unit}"
+        return template.format(amount=amount, unit=label)
diff --git a/python/insta_science/_internal/cache.py b/python/insta_science/_internal/cache.py
@@ -16,6 +16,8 @@
 from filelock import FileLock
 from typing_extensions import TypeAlias
 
+from insta_science._internal.du import DiskUsage
+
 
 @dataclass(frozen=True)
 class Complete:
@@ -31,30 +33,41 @@ class Missing:
 CacheResult: TypeAlias = Union[Complete, Missing]
 
 
-_TTL_EXPIRY_FORMAT = "%m/%d/%y %H:%M:%S"
-
-
 @dataclass(frozen=True)
 class DownloadCache:
+    _TTL_EXPIRY_FORMAT = "%m/%d/%y %H:%M:%S"
+    _CACHED_EXT = ".cached"
+
+    # Bump this when changing cache layout.
+    _CACHE_VERSION = 1
+
     base_dir: Path
 
+    @property
+    def _base(self) -> Path:
+        """The directory for the current cache layout: `base_dir / str(_CACHE_VERSION)`."""
+        return self.base_dir / str(self._CACHE_VERSION)
+
     @contextmanager
-    def get_or_create(self, url: str, ttl: timedelta | None = None) -> Iterator[CacheResult]:
+    def get_or_create(
+        self, url: str, *, namespace: str, ttl: timedelta | None = None
+    ) -> Iterator[CacheResult]:
         """A context manager that yields a `cache result.
 
         If the cache result is `Missing`, the block yielded to should materialize the given url
         to the `Missing.work` path. Upon successful exit from this context manager, the given url's
         content will exist at the cache result path.
         """
-        cached_file = self.base_dir / hashlib.sha256(url.encode()).hexdigest()
+        cached_file = (
+            self._base / namespace / f"{hashlib.sha256(url.encode()).hexdigest()}{self._CACHED_EXT}"
+        )
 
         ttl_file = cached_file.with_suffix(".ttl") if ttl else None
         if ttl_file and not ttl_file.exists():
             cached_file.unlink(missing_ok=True)
         elif ttl_file:
             try:
                 datetime_object = datetime.strptime(
-                    ttl_file.read_text().strip(), _TTL_EXPIRY_FORMAT
+                    ttl_file.read_text().strip(), self._TTL_EXPIRY_FORMAT
                 )
                 if datetime.now() > datetime_object:
                     cached_file.unlink(missing_ok=True)
@@ -79,7 +92,18 @@ def get_or_create(self, url: str, ttl: timedelta | None = None) -> Iterator[Cach
                 return
             work.rename(cached_file)
             if ttl_file and ttl:
-                ttl_file.write_text((datetime.now() + ttl).strftime(_TTL_EXPIRY_FORMAT))
+                ttl_file.write_text((datetime.now() + ttl).strftime(self._TTL_EXPIRY_FORMAT))
+
+    def iter_entries(self, *, namespace: str | None = None) -> Iterator[Path]:
+        """Iterate over the paths of cached files.
+
+        Cached files are stored one level below the cache base, inside a per-namespace directory,
+        and are recognized by the `_CACHED_EXT` suffix.
+
+        If a `namespace` is given, only that namespace's directory is searched. Otherwise the cache
+        base and every namespace directory directly beneath it are searched, so that the entries
+        of all namespaces are found.
+
+        Nothing is yielded for a cache base or namespace directory that does not exist.
+        """
+        if namespace:
+            search_dirs = [self._base / namespace]
+        else:
+            search_dirs = [self._base]
+            try:
+                search_dirs.extend(path for path in self._base.iterdir() if path.is_dir())
+            except FileNotFoundError:
+                # The cache has not been created yet; there is nothing to iterate.
+                return
+
+        for search_dir in search_dirs:
+            try:
+                for path in search_dir.iterdir():
+                    if path.suffix == self._CACHED_EXT:
+                        yield path
+            except FileNotFoundError:
+                # The directory is missing (never created or removed meanwhile); skip it.
+                continue
+
+    def usage(self) -> DiskUsage:
+        """Return the result of `DiskUsage.collect` for this cache's versioned base directory."""
+        return DiskUsage.collect(str(self._base))
 
 
 DOWNLOAD_CACHE = DownloadCache(

diff --git a/python/insta_science/_internal/du.py b/python/insta_science/_internal/du.py
@@ -0,0 +1,56 @@
+# Copyright 2024 Science project contributors.
+# Licensed under the Apache License, Version 2.0 (see LICENSE).
+
+from __future__ import annotations
+
+import itertools
+import os
+import stat
+from dataclasses import dataclass
+from pathlib import PurePath
+from typing import Iterable
+
+
+@dataclass(frozen=True)
+class DiskUsage(object):
+    """A summary of the directories, files and bytes found beneath a path."""
+
+    @classmethod
+    def aggregate(cls, path: str, usages: Iterable[DiskUsage]) -> DiskUsage:
+        """Sum the counts and sizes of `usages` into one usage reported against `path`.
+
+        An empty `usages` produces a usage with all counts at zero.
+        """
+        subdirs = 0
+        files = 0
+        size = 0
+        for disk_usage in usages:
+            subdirs += disk_usage.subdirs
+            files += disk_usage.files
+            size += disk_usage.size
+        return cls(path=PurePath(path), subdirs=subdirs, files=files, size=size)
+
+    @classmethod
+    def collect(cls, path: str) -> DiskUsage:
+        """Collects data with the same default semantics as `du`.
+
+        Does not count directory inode sizes.
+        Only counts hard linked file sizes once.
+        Counts symlink size as the size of the target path string not including the null terminator.
+        """
+        subdir_count = 0
+        file_count = 0
+        size = 0
+        # An inode number is only unique within one device, so a hard-linked file is identified
+        # by its (device, inode) pair.
+        seen: set[tuple[int, int]] = set()
+        for root, dirs, files in os.walk(path):
+            for f in itertools.chain(dirs, files):
+                # `lstat` so that a symlink is measured itself instead of being followed.
+                stat_info = os.lstat(os.path.join(root, f))
+                if stat.S_ISDIR(stat_info.st_mode):
+                    subdir_count += 1
+                    continue
+
+                # Only a file with more than one link can turn up twice. Skipping the check for
+                # all other files also avoids dropping files when the platform reports the same
+                # (e.g. zero) inode number for unrelated files.
+                if stat_info.st_nlink > 1:
+                    file_id = (stat_info.st_dev, stat_info.st_ino)
+                    if file_id in seen:
+                        continue
+                    seen.add(file_id)
+                file_count += 1
+                size += stat_info.st_size
+
+        return cls(path=PurePath(path), subdirs=subdir_count, files=file_count, size=size)
+
+    # The path the usage was collected from or is reported against.
+    path: PurePath
+    # The number of directories found beneath `path`.
+    subdirs: int
+    # The number of non-directory entries found, counting each hard-linked file once.
+    files: int
+    # The summed `st_size` of the counted non-directory entries.
+    size: int