Skip to content

Commit

Permalink
Add insta-science-util download. (#14)
Browse files Browse the repository at this point in the history
Also add `insta-science-util cache {prune,purge}`.
  • Loading branch information
jsirois authored Dec 31, 2024
1 parent abfec30 commit dad6806
Show file tree
Hide file tree
Showing 18 changed files with 601 additions and 88 deletions.
6 changes: 6 additions & 0 deletions python/CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# insta-science

## 0.4.0

Flesh out the `insta-science-util` script, adding support for downloading `science` executables
for offline serving in firewalled environments as well as support for managing the
`insta-science` cache.

## 0.3.1

Fix the semi-automated release process.
Expand Down
4 changes: 4 additions & 0 deletions python/insta_science/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
InputError,
Platform,
Science,
ScienceExe,
ScienceNotFound,
Url,
ensure_installed,
)
from .version import __version__
Expand All @@ -20,7 +22,9 @@
"InputError",
"Platform",
"Science",
"ScienceExe",
"ScienceNotFound",
"Url",
"__version__",
"ensure_installed",
)
10 changes: 8 additions & 2 deletions python/insta_science/_colors.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,19 @@ def red(self, text) -> str:
def yellow(self, text) -> str:
    """Return `text` passed through `self.color` with a yellow foreground."""
    return self.color(text, fg="yellow")

def color(self, text, fg: str):
def green(self, text) -> str:
    """Return `text` passed through `self.color` with a green foreground."""
    return self.color(text, fg="green")

def gray(self, text) -> str:
    """Return `text` passed through `self.color` with a gray foreground."""
    return self.color(text, fg="gray")

def color(self, text, fg: str | None = None, style: str | None = None):
if not self.use_color:
return text

import colors

return colors.color(text, fg=fg)
return colors.color(text, fg=fg, style=style)


@contextmanager
Expand Down
4 changes: 3 additions & 1 deletion python/insta_science/_internal/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from .errors import InputError, ScienceNotFound
from .hashing import Fingerprint
from .model import Digest, Science
from .model import Digest, Science, ScienceExe, Url
from .platform import CURRENT_PLATFORM, Platform
from .science import ensure_installed

Expand All @@ -14,6 +14,8 @@
"InputError",
"Platform",
"Science",
"ScienceExe",
"ScienceNotFound",
"Url",
"ensure_installed",
)
35 changes: 22 additions & 13 deletions python/insta_science/_internal/a_scie.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,18 @@
from dataclasses import dataclass
from datetime import timedelta
from pathlib import PurePath
from typing import Iterator

from packaging.version import Version

from .cache import DOWNLOAD_CACHE
from .fetcher import fetch_and_verify
from .hashing import Digest, Fingerprint
from .model import Science, Url
from .model import Science, ScienceExe, Url
from .platform import CURRENT_PLATFORM, Platform

_DOWNLOAD_NAMESPACE = "url-exes"


@dataclass(frozen=True)
class _LoadResult:
Expand All @@ -22,14 +26,13 @@ class _LoadResult:


def _load_project_release(
project_name: str,
base_url: Url,
binary_name: str,
version: Version | None = None,
fingerprint: Digest | Fingerprint | None = None,
platform: Platform = CURRENT_PLATFORM,
) -> _LoadResult:
qualified_binary_name = platform.qualified_binary_name(binary_name)
base_url = f"https://github.com/a-scie/{project_name}/releases"
if version:
version_path = f"download/v{version}"
ttl = None
Expand All @@ -38,20 +41,26 @@ def _load_project_release(
ttl = timedelta(days=5)
path = fetch_and_verify(
url=Url(f"{base_url}/{version_path}/{qualified_binary_name}"),
namespace=_DOWNLOAD_NAMESPACE,
fingerprint=fingerprint,
executable=True,
ttl=ttl,
)
return _LoadResult(path=path, binary_name=qualified_binary_name)


def science(spec: Science | None = None, platform: Platform = CURRENT_PLATFORM) -> PurePath:
version = spec.version if spec else None
fingerprint = spec.digest if spec and spec.digest else None
return _load_project_release(
project_name="lift",
binary_name="science-fat",
version=version,
fingerprint=fingerprint,
platform=platform,
).path
def science(spec: Science = Science(), platform: Platform = CURRENT_PLATFORM) -> ScienceExe:
return spec.exe(
_load_project_release(
base_url=spec.base_url,
binary_name="science-fat",
version=spec.version,
fingerprint=spec.digest,
platform=platform,
).path
)


def iter_science_exes() -> Iterator[ScienceExe]:
    """Lazily yield a `ScienceExe` for each download cache entry in the `url-exes` namespace."""
    cached_paths = DOWNLOAD_CACHE.iter_entries(namespace=_DOWNLOAD_NAMESPACE)
    yield from map(ScienceExe, cached_paths)
98 changes: 98 additions & 0 deletions python/insta_science/_internal/bytes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Copyright 2024 Science project contributors.
# Licensed under the Apache License, Version 2.0 (see LICENSE).

from __future__ import annotations

import math
from dataclasses import dataclass
from enum import Enum
from typing import Callable


@dataclass(frozen=True)
class Unit:
    """A named unit of measure together with its multiple of the base quantity."""

    # The label rendered for amounts of this unit (the plural form when a singular exists).
    _name: str
    # How many base quantities make up one of this unit.
    multiple: float
    # Optional label used instead of `_name` when the amount rounds to exactly 1.
    _singular: str | None = None

    def render(self, total_bytes: int | float) -> str:
        """Return the label to display next to `total_bytes`.

        The singular label is only chosen when one is configured and the amount rounds to 1;
        every other amount gets the regular name.
        """
        if self._singular and round(total_bytes) == 1:
            return self._singular
        return self._name


class ByteUnits(Unit, Enum):
    """Decimal byte units, declared from smallest to largest.

    Each member is declared as the tuple of `Unit` fields: name, multiple (in bytes) and, for
    `BYTES` only, a singular name. Every unit is 1000 times the one declared before it.
    """

    BYTES = "bytes", 1.0, "byte"
    KB = "kB", 1e3
    MB = "MB", 1e6
    GB = "GB", 1e9
    TB = "TB", 1e12
    PB = "PB", 1e15


@dataclass(frozen=True)
class ByteAmount:
    """A count of bytes paired with the unit it should be displayed in."""

    # The amount, always expressed in bytes regardless of the display unit.
    total_bytes: int
    # The unit `total_bytes` is scaled to when rendered by `__str__`.
    unit: Unit

    @classmethod
    def bytes(cls, total_bytes: int) -> ByteAmount:
        """Create an amount of `total_bytes` bytes that renders in bytes."""
        return cls(total_bytes=total_bytes, unit=ByteUnits.BYTES)

    @classmethod
    def kilobytes(cls, total_bytes: int) -> ByteAmount:
        """Create an amount of `total_bytes` bytes that renders in kB."""
        return cls(total_bytes=total_bytes, unit=ByteUnits.KB)

    @classmethod
    def megabytes(cls, total_bytes: int) -> ByteAmount:
        """Create an amount of `total_bytes` bytes that renders in MB."""
        return cls(total_bytes=total_bytes, unit=ByteUnits.MB)

    @classmethod
    def gigabytes(cls, total_bytes: int) -> ByteAmount:
        """Create an amount of `total_bytes` bytes that renders in GB."""
        return cls(total_bytes=total_bytes, unit=ByteUnits.GB)

    @classmethod
    def terabytes(cls, total_bytes: int) -> ByteAmount:
        """Create an amount of `total_bytes` bytes that renders in TB."""
        return cls(total_bytes=total_bytes, unit=ByteUnits.TB)

    @classmethod
    def petabytes(cls, total_bytes: int) -> ByteAmount:
        """Create an amount of `total_bytes` bytes that renders in PB."""
        return cls(total_bytes=total_bytes, unit=ByteUnits.PB)

    @classmethod
    def human_readable(cls, total_bytes: int) -> ByteAmount:
        """Create an amount that renders in the smallest unit keeping the value below 1000.

        Amounts too large for every unit fall back to PB.
        """
        best_unit = next(
            (unit for unit in ByteUnits if total_bytes < 1000 * unit.multiple),
            ByteUnits.PB,
        )
        return cls(total_bytes=total_bytes, unit=best_unit)

    @classmethod
    def for_unit(cls, unit: Unit) -> Callable[[int], ByteAmount]:
        """Return the factory classmethod that creates amounts rendered in `unit`.

        :raises ValueError: If `unit` is not one of the `ByteUnits` members.
        """
        factories = (
            (ByteUnits.BYTES, cls.bytes),
            (ByteUnits.KB, cls.kilobytes),
            (ByteUnits.MB, cls.megabytes),
            (ByteUnits.GB, cls.gigabytes),
            (ByteUnits.TB, cls.terabytes),
            (ByteUnits.PB, cls.petabytes),
        )
        # Units are matched by identity, so only the actual `ByteUnits` members are accepted.
        for known_unit, factory in factories:
            if known_unit is unit:
                return factory
        raise ValueError(f"The unit {unit} has no known corresponding byte amount function")

    def __str__(self) -> str:
        """Render the amount scaled to its unit, showing roughly three significant digits."""
        amount = self.total_bytes / self.unit.multiple
        whole = math.trunc(amount)
        label = self.unit.render(amount)

        # Plain bytes and three-digit amounts are shown as rounded integers.
        if self.unit is ByteUnits.BYTES or whole >= 100:
            return f"{round(amount)} {label}"

        # Otherwise, fewer integer digits earn more decimal places.
        if whole >= 10:
            precision = 1
        elif whole > 0:
            precision = 2
        else:
            precision = 3
        return f"{amount:.{precision}f} {label}"
38 changes: 31 additions & 7 deletions python/insta_science/_internal/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
from filelock import FileLock
from typing_extensions import TypeAlias

from insta_science._internal.du import DiskUsage


@dataclass(frozen=True)
class Complete:
Expand All @@ -31,30 +33,41 @@ class Missing:
CacheResult: TypeAlias = Union[Complete, Missing]


_TTL_EXPIRY_FORMAT = "%m/%d/%y %H:%M:%S"


@dataclass(frozen=True)
class DownloadCache:
_TTL_EXPIRY_FORMAT = "%m/%d/%y %H:%M:%S"
_CACHED_EXT = ".cached"

# Bump this when changing cache layout.
_CACHE_VERSION = 1

base_dir: Path

@property
def _base(self) -> Path:
return self.base_dir / str(self._CACHE_VERSION)

@contextmanager
def get_or_create(self, url: str, ttl: timedelta | None = None) -> Iterator[CacheResult]:
def get_or_create(
self, url: str, *, namespace: str, ttl: timedelta | None = None
) -> Iterator[CacheResult]:
"""A context manager that yields a `CacheResult`.
If the cache result is `Missing`, the block yielded to should materialize the given url
to the `Missing.work` path. Upon successful exit from this context manager, the given url's
content will exist at the cache result path.
"""
cached_file = self.base_dir / hashlib.sha256(url.encode()).hexdigest()
cached_file = (
self._base / namespace / f"{hashlib.sha256(url.encode()).hexdigest()}{self._CACHED_EXT}"
)

ttl_file = cached_file.with_suffix(".ttl") if ttl else None
if ttl_file and not ttl_file.exists():
cached_file.unlink(missing_ok=True)
elif ttl_file:
try:
datetime_object = datetime.strptime(
ttl_file.read_text().strip(), _TTL_EXPIRY_FORMAT
ttl_file.read_text().strip(), self._TTL_EXPIRY_FORMAT
)
if datetime.now() > datetime_object:
cached_file.unlink(missing_ok=True)
Expand All @@ -79,7 +92,18 @@ def get_or_create(self, url: str, ttl: timedelta | None = None) -> Iterator[Cach
return
work.rename(cached_file)
if ttl_file and ttl:
ttl_file.write_text((datetime.now() + ttl).strftime(_TTL_EXPIRY_FORMAT))
ttl_file.write_text((datetime.now() + ttl).strftime(self._TTL_EXPIRY_FORMAT))

def iter_entries(self, *, namespace: str | None = None) -> Iterator[Path]:
    """Yield the paths of cached entries found directly inside one cache directory.

    The directory searched is the `namespace` subdirectory of the versioned cache base when a
    namespace is given, else the versioned cache base itself. Only direct children carrying
    the cached-file extension are yielded. A directory that does not exist yields nothing.
    """
    # NOTE(review): `get_or_create` stores entries under `<base>/<namespace>/`, so calling this
    # without a namespace looks like it will never find any entries -- confirm this is intended.
    entries_dir = self._base / namespace if namespace else self._base
    try:
        for entry in entries_dir.iterdir():
            if entry.suffix == self._CACHED_EXT:
                yield entry
    except FileNotFoundError:
        # Nothing has been cached in this directory yet.
        return

def usage(self) -> DiskUsage:
    """Return the `DiskUsage` collected for this cache's versioned base directory."""
    return DiskUsage.collect(str(self._base))


DOWNLOAD_CACHE = DownloadCache(
Expand Down
56 changes: 56 additions & 0 deletions python/insta_science/_internal/du.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Copyright 2024 Science project contributors.
# Licensed under the Apache License, Version 2.0 (see LICENSE).

from __future__ import annotations

import itertools
import os
import stat
from dataclasses import dataclass
from pathlib import PurePath
from typing import Iterable


@dataclass(frozen=True)
class DiskUsage(object):
    """An immutable summary of the directories, files and bytes found beneath a path."""

    @classmethod
    def aggregate(cls, path: str, usages: Iterable[DiskUsage]) -> DiskUsage:
        """Sum the given usages into a single usage attributed to `path`.

        The `usages` are consumed in a single pass, so a one-shot iterator is fine. An empty
        `usages` produces a usage with all counts at zero.
        """
        subdirs = 0
        files = 0
        size = 0
        for disk_usage in usages:
            subdirs += disk_usage.subdirs
            files += disk_usage.files
            size += disk_usage.size
        return cls(path=PurePath(path), subdirs=subdirs, files=files, size=size)

    @classmethod
    def collect(cls, path: str) -> DiskUsage:
        """Collects data with the same default semantics as `du`.

        Does not count directory inode sizes.
        Only counts hard linked file sizes once.
        Counts symlink size as the size of the target path string not including the null
        terminator.

        The `path` directory itself is not included in the subdirectory count.
        """
        subdir_count = 0
        file_count = 0
        size = 0
        seen: set[tuple[int, int]] = set()
        for root, dirs, files in os.walk(path):
            for name in itertools.chain(dirs, files):
                # Use lstat so symlinks are measured themselves instead of being followed.
                stat_info = os.lstat(os.path.join(root, name))

                # An inode number only identifies a file in combination with its device, and a
                # zero inode number carries no identity at all; so hard links are only detected
                # via non-zero (device, inode) pairs. Keying on the inode alone would drop
                # unrelated files that share an inode number across filesystems, or every file
                # but the first when inode numbers are unavailable.
                if stat_info.st_ino:
                    file_id = (stat_info.st_dev, stat_info.st_ino)
                    if file_id in seen:
                        continue
                    seen.add(file_id)

                if stat.S_ISDIR(stat_info.st_mode):
                    subdir_count += 1
                else:
                    file_count += 1
                    size += stat_info.st_size

        return cls(path=PurePath(path), subdirs=subdir_count, files=file_count, size=size)

    # The path the usage is attributed to.
    path: PurePath
    # The number of directories counted.
    subdirs: int
    # The number of non-directory entries counted.
    files: int
    # The total size, in bytes, of the non-directory entries counted.
    size: int
Loading

0 comments on commit dad6806

Please sign in to comment.