Skip to content

Commit

Permalink
Properly handle fetch errors and add retries. (#20)
Browse files Browse the repository at this point in the history
  • Loading branch information
jsirois authored Jan 6, 2025
1 parent 639e9ce commit dc54ea0
Show file tree
Hide file tree
Showing 8 changed files with 100 additions and 47 deletions.
4 changes: 4 additions & 0 deletions python/CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# insta-science

## 0.4.3

Implement proper HTTP error handling and add retry support in for failed `science` fetches.

## 0.4.2

Actually implement support for `[tool.insta-science] cache` configuration as claimed in the README.
Expand Down
File renamed without changes.
123 changes: 80 additions & 43 deletions python/insta_science/_internal/fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,26 @@
import hashlib
import logging
import os
import sys
from datetime import timedelta
from netrc import NetrcParseError
from pathlib import PurePath
from typing import Mapping

import httpx
from httpx import HTTPStatusError, TimeoutException
from tenacity import (
before_sleep_log,
retry,
retry_if_exception,
stop_after_attempt,
wait_exponential_jitter,
)
from tqdm import tqdm

from . import hashing
from .cache import DownloadCache, Missing
from .colors import color_support
from .errors import InputError
from .hashing import Digest, ExpectedDigest, Fingerprint
from .model import Url
Expand Down Expand Up @@ -127,6 +137,31 @@ def _expected_digest(
)


@retry(
# Raise the final exception in a retry chain if all retries fail.
reraise=True,
retry=retry_if_exception(
lambda ex: (
isinstance(ex, TimeoutException)
or (
# See: https://tools.ietf.org/html/rfc2616#page-39
isinstance(ex, HTTPStatusError)
and ex.response.status_code
in (
408, # Request Time-out
500, # Internal Server Error
502, # Bad Gateway
503, # Service Unavailable
504, # Gateway Time-out
)
)
)
),
stop=stop_after_attempt(3),
wait=wait_exponential_jitter(initial=0.5, exp_base=2, jitter=0.5),
# This logs the retries since there is a sleep before each (see wait above).
before_sleep=before_sleep_log(logger, logging.WARNING),
)
def fetch_and_verify(
url: Url,
cache: DownloadCache,
Expand All @@ -140,59 +175,61 @@ def fetch_and_verify(
verified_fingerprint = False
with cache.get_or_create(url, namespace=namespace, ttl=ttl) as cache_result:
if isinstance(cache_result, Missing):
# TODO(John Sirois): XXX: Log or invoke callback for logging.
# click.secho(f"Downloading {url} ...", fg="green")
work = cache_result.work
with _configured_client(url, headers) as client:
expected_digest = _expected_digest(
url, headers, fingerprint, algorithm=digest_algorithm
)
digest = hashlib.new(digest_algorithm)
total_bytes = 0
with client.stream("GET", url) as response, work.open("wb") as cache_fp:
total = (
int(content_length)
if (content_length := response.headers.get("Content-Length"))
else None
with color_support() as colors:
print(colors.color(f"Downloading {url} ...", fg="green"), file=sys.stderr)
work = cache_result.work
with _configured_client(url, headers) as client:
expected_digest = _expected_digest(
url, headers, fingerprint, algorithm=digest_algorithm
)
if expected_digest.is_too_big(total):
raise InputError(
f"The content at {url} is expected to be {expected_digest.size} "
f"bytes, but advertises a Content-Length of {total} bytes."
digest = hashlib.new(digest_algorithm)
total_bytes = 0
with client.stream("GET", url) as response, work.open("wb") as cache_fp:
response.raise_for_status()
total = (
int(content_length)
if (content_length := response.headers.get("Content-Length"))
else None
)
with tqdm(
total=total, unit_scale=True, unit_divisor=1024, unit="B"
) as progress:
num_bytes_downloaded = response.num_bytes_downloaded
for data in response.iter_bytes():
total_bytes += len(data)
if expected_digest.is_too_big(total_bytes):
raise InputError(
f"The download from {url} was expected to be "
f"{expected_digest.size} bytes, but downloaded "
f"{total_bytes} so far."
)
digest.update(data)
cache_fp.write(data)
progress.update(response.num_bytes_downloaded - num_bytes_downloaded)
if expected_digest.is_too_big(total):
raise InputError(
f"The content at {url} is expected to be {expected_digest.size} "
f"bytes, but advertises a Content-Length of {total} bytes."
)
with tqdm(
total=total, unit_scale=True, unit_divisor=1024, unit="B"
) as progress:
num_bytes_downloaded = response.num_bytes_downloaded
expected_digest.check(
subject=f"download from {url}",
actual_fingerprint=Fingerprint(digest.hexdigest()),
actual_size=total_bytes,
)
verified_fingerprint = True
if executable:
work.chmod(0o755)
for data in response.iter_bytes():
total_bytes += len(data)
if expected_digest.is_too_big(total_bytes):
raise InputError(
f"The download from {url} was expected to be "
f"{expected_digest.size} bytes, but downloaded "
f"{total_bytes} so far."
)
digest.update(data)
cache_fp.write(data)
progress.update(
response.num_bytes_downloaded - num_bytes_downloaded
)
num_bytes_downloaded = response.num_bytes_downloaded
expected_digest.check(
subject=f"download from {url}",
actual_fingerprint=Fingerprint(digest.hexdigest()),
actual_size=total_bytes,
)
verified_fingerprint = True
if executable:
work.chmod(0o755)

if not verified_fingerprint:
expected_cached_digest = _maybe_expected_digest(
fingerprint, headers=headers, algorithm=digest_algorithm
)
if expected_cached_digest:
expected_cached_digest.check_path(
subject=f"cached download from {url}",
path=cache_result.path,
subject=f"cached download from {url}", path=cache_result.path
)

return cache_result.path
2 changes: 1 addition & 1 deletion python/insta_science/shim.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from typing import NoReturn

from . import CURRENT_PLATFORM, InputError, ScienceNotFound, ensure_installed
from ._colors import color_support
from ._internal.colors import color_support


def main() -> NoReturn:
Expand Down
2 changes: 1 addition & 1 deletion python/insta_science/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
from packaging.version import Version

from . import CURRENT_PLATFORM, InputError, Platform, Science
from ._colors import Colors, color_support
from ._internal import a_scie, parser, project, science
from ._internal.bytes import ByteAmount
from ._internal.cache import DownloadCache, download_cache
from ._internal.colors import Colors, color_support
from ._internal.du import DiskUsage
from ._internal.model import Configuration
from .version import __version__
Expand Down
2 changes: 1 addition & 1 deletion python/insta_science/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2024 Science project contributors.
# Licensed under the Apache License, Version 2.0 (see LICENSE).

__version__ = "0.4.2"
__version__ = "0.4.3"
1 change: 1 addition & 0 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ dependencies = [
"filelock",
"httpx",
"packaging",
"tenacity",
"tomli; python_version < '3.11'",
"tqdm",
"typing-extensions",
Expand Down
13 changes: 12 additions & 1 deletion python/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit dc54ea0

Please sign in to comment.