Skip to content

Commit

Permalink
Store found URL for domain regardless of HTML's <base> tag
Browse files (browse the repository at this point in the history)
  • Loading branch information
ra1nb0rn committed Jul 21, 2020
1 parent 66ca4be commit 9b16eb0
Showing 1 changed file with 14 additions and 3 deletions.
17 changes: 14 additions & 3 deletions modules/web/crawler/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ def process_response(self, response):
self.extract_cookies(response.headers.getlist("Set-Cookie"), response.url)

# use scrapy's lxml linkextractor to extract links / URLs
scrapy_urls = set()
try:
# extract <base> URL's domain if a <base> tag exists
base_domain = ""
Expand All @@ -308,10 +309,21 @@ def process_response(self, response):
allowed_domains = [self.domain, "%s:%s" % (self.domain, self.port)]
if base_domain:
allowed_domains.append(base_domain)
scrapy_links = LinkExtractor(allow_domains=allowed_domains,
raw_scrapy_links = LinkExtractor(allow_domains=allowed_domains,
tags=("a", "area", "script", "link", "source", "img"),
attrs=("src", "href"),
deny_extensions=set()).extract_links(response)
raw_scrapy_urls = [link.url for link in raw_scrapy_links]

# copy discovered URLs; if the <base> tag points to a different domain, additionally store each URL with the response's original network location
scrapy_urls = raw_scrapy_urls.copy()
if base_domain and base_domain != allowed_domains[0] and base_domain != allowed_domains[1]:
orig_netloc = urllib.parse.urlparse(response.url).netloc
for scrapy_url in raw_scrapy_urls:
parsed_scrapy_url = list(urllib.parse.urlsplit(scrapy_url))
parsed_scrapy_url[1] = orig_netloc
scrapy_urls.append(urllib.parse.urlunsplit(parsed_scrapy_url))
scrapy_urls = set(scrapy_urls)
except (AttributeError, scrapy.exceptions.NotSupported) as e:
if str(e) == "Response content isn't text":
# stop processing and return no new URLs
Expand Down Expand Up @@ -339,8 +351,7 @@ def process_response(self, response):

# unite discovered URLs
urls = set()
for link in scrapy_links:
urls.add(link.url)
urls |= scrapy_urls
urls |= linkfinder_urls
urls |= dynamic_urls
urls |= form_urls
Expand Down

0 comments on commit 9b16eb0

Please sign in to comment.