Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bulk scan error handling #1126

Merged
merged 3 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
---
version: "3"
networks:
app:
services:
Expand Down
16 changes: 10 additions & 6 deletions scanner/assets.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@ def extract_assets(soup: BeautifulSoup, site_url: str) -> List[Asset]:
)
)

response = fetch_asset(script.attrs['src'], site_url)
asset_text = fetch_asset(script.attrs['src'], site_url)
# assets in content from external js
for text in extract_strings(response.text):
for text in extract_strings(asset_text):
for url in extract_urls(text):
assets.append(Asset(resource=url, kind='script-resource', initiator=script.attrs['src']))
# js embedded in <script> tags
Expand All @@ -102,9 +102,9 @@ def extract_assets(soup: BeautifulSoup, site_url: str) -> List[Asset]:
stylesheet_links = soup.find_all('link', rel='stylesheet')
for link in stylesheet_links:
if link.attrs.get('href'):
response = fetch_asset(link.attrs['href'], site_url)
asset_text = fetch_asset(link.attrs['href'], site_url)
# assets in content from stylesheet link
for url in urls_from_css(response.text):
for url in urls_from_css(asset_text):
assets.append(Asset(resource=url, kind='style-resource', initiator=link.attrs['href']))

# stylesheet link
Expand Down Expand Up @@ -170,7 +170,7 @@ def descendants(nodes):
return children


def fetch_asset(asset_url: str, site_url: str) -> requests.models.Response:
def fetch_asset(asset_url: str, site_url: str) -> str:
site_url = urllib.parse.urlparse(site_url)
asset_url = urllib.parse.urlparse(asset_url)

Expand All @@ -181,7 +181,11 @@ def fetch_asset(asset_url: str, site_url: str) -> requests.models.Response:

# Note: headers include User-Agent which is required for correct
# scanning.
return requests.get(asset_url.geturl(), headers=HEADERS, timeout=5)
try:
response = requests.get(asset_url.geturl(), headers=HEADERS, timeout=5)
except requests.RequestException:
return ''
return response.text


def parse_srcset(srcset: str) -> List[str]:
Expand Down
5 changes: 4 additions & 1 deletion scanner/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,10 @@ def bulk_scan(securedrops: 'DirectoryEntryQuerySet') -> None:
tldextract.extract(d).registered_domain
for d in entry.permitted_domains_for_assets
]
current_result = perform_scan(entry.landing_page_url, permitted_domains)
try:
current_result = perform_scan(entry.landing_page_url, permitted_domains)
except Exception:
continue
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am slightly worried about this failing too silently. Should we maybe log a warning with the error, in case there is some real failure that we need to debug later?


# This is usually handled by Result.save, but since we're doing a
# bulk save, we need to do it here
Expand Down
Loading