fix tests: httpbun.org → .com #455

Merged: 4 commits, Dec 13, 2023
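This pull request swaps every httpbun.org test endpoint for its httpbun.com counterpart across the CLI, download, feed, sitemap and spider test modules. As a quick post-merge sanity check (a sketch only, not part of the PR; it assumes the tests/ layout visible in the diffs below and the repository root as working directory), one could verify that no stale .org references remain:

    # Hypothetical check, not part of this PR: scan the test modules touched
    # here and fail if any of them still references the old httpbun.org domain.
    from pathlib import Path

    stale = [
        str(path)
        for path in Path("tests").glob("*_tests.py")  # assumes repo root as CWD
        if "httpbun.org" in path.read_text(encoding="utf-8")
    ]
    assert not stale, f"old httpbun.org URLs remain in: {stale}"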
tests/cli_tests.py (16 changes: 8 additions & 8 deletions)
@@ -218,7 +218,7 @@ def test_download():
#teststring = fetch_url(url)
#assert teststring is not None
#assert cli.examine(teststring, args, url) is None
-url = 'https://httpbun.org/html'
+url = 'https://httpbun.com/html'
teststring = fetch_url(url)
assert teststring is not None
assert cli.examine(teststring, args, url) is not None
@@ -408,27 +408,27 @@ def test_crawling():
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)

-testargs = ['', '--crawl', 'https://httpbun.org/html']
+testargs = ['', '--crawl', 'https://httpbun.com/html']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
-assert f.getvalue() == 'https://httpbun.org/html\n'
+assert f.getvalue() == 'https://httpbun.com/html\n'

spider.URL_STORE = UrlStore(compressed=False, strict=False)
# links permitted
-testargs = ['', '--crawl', 'https://httpbun.org/links/1/1', '--list', '--parallel', '1']
+testargs = ['', '--crawl', 'https://httpbun.com/links/1/1', '--list', '--parallel', '1']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
# possibly a bug on Github actions, should be 2 URLs
-assert f.getvalue() in ('https://httpbun.org/links/1/1\nhttps://httpbun.org/links/1/0\n', 'https://httpbun.org/links/1/1\n')
+assert f.getvalue() in ('https://httpbun.com/links/1/1\nhttps://httpbun.com/links/1/0\n', 'https://httpbun.com/links/1/1\n')
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# 0 links permitted
-args.crawl = 'https://httpbun.org/links/4/4'
+args.crawl = 'https://httpbun.com/links/4/4'
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
@@ -437,13 +437,13 @@ def test_crawling():
spider.URL_STORE = UrlStore(compressed=False, strict=False)

# Exploration (Sitemap + Crawl)
-testargs = ['', '--explore', 'https://httpbun.org/html', '--list']
+testargs = ['', '--explore', 'https://httpbun.com/html', '--list']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
-assert f.getvalue().strip() == 'https://httpbun.org/html'
+assert f.getvalue().strip() == 'https://httpbun.com/html'


def test_probing():
tests/downloads_tests.py (18 changes: 9 additions & 9 deletions)
@@ -53,27 +53,27 @@ def test_fetch():
assert _send_request('', True, DEFAULT_CONFIG) is None

# is_live general tests
-assert _urllib3_is_live_page('https://httpbun.org/status/301') is True
-assert _urllib3_is_live_page('https://httpbun.org/status/404') is False
-assert is_live_page('https://httpbun.org/status/403') is False
+assert _urllib3_is_live_page('https://httpbun.com/status/301') is True
+assert _urllib3_is_live_page('https://httpbun.com/status/404') is False
+assert is_live_page('https://httpbun.com/status/403') is False
# is_live pycurl tests
if pycurl is not None:
-assert _pycurl_is_live_page('https://httpbun.org/status/301') is True
+assert _pycurl_is_live_page('https://httpbun.com/status/301') is True

# fetch_url
assert fetch_url('#@1234') is None
-assert fetch_url('https://httpbun.org/status/404') is None
+assert fetch_url('https://httpbun.com/status/404') is None
# test if the functions default to no_ssl
# doesn't work?
# assert _send_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
if pycurl is not None:
assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
# no SSL, no decoding
-url = 'https://httpbun.org/status/200'
-response = _send_request('https://httpbun.org/status/200', True, DEFAULT_CONFIG)
+url = 'https://httpbun.com/status/200'
+response = _send_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
assert response.data == b''
if pycurl is not None:
-response1 = _send_pycurl_request('https://httpbun.org/status/200', True, DEFAULT_CONFIG)
+response1 = _send_pycurl_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
assert _handle_response(url, response1, False, DEFAULT_CONFIG) == _handle_response(url, response, False, DEFAULT_CONFIG)
assert _handle_response(url, response1, True, DEFAULT_CONFIG) == _handle_response(url, response, True, DEFAULT_CONFIG)
# response object
@@ -155,7 +155,7 @@ def test_queue():
testargs = ['', '-v']
with patch.object(sys, 'argv', testargs):
args = parse_args(testargs)
-inputurls = ['https://httpbun.org/status/301', 'https://httpbun.org/status/304', 'https://httpbun.org/status/200', 'https://httpbun.org/status/300', 'https://httpbun.org/status/400', 'https://httpbun.org/status/505']
+inputurls = ['https://httpbun.com/status/301', 'https://httpbun.com/status/304', 'https://httpbun.com/status/200', 'https://httpbun.com/status/300', 'https://httpbun.com/status/400', 'https://httpbun.com/status/505']
url_store = add_to_compressed_dict(inputurls)
args.archived = True
args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
tests/feeds_tests.py (4 changes: 2 additions & 2 deletions)
@@ -256,7 +256,7 @@ def test_feeds_helpers():
) == ["https://example.org/rss"]
# feed discovery
assert not find_feed_urls("http://")
assert not find_feed_urls("https://httpbun.org/status/404")
assert not find_feed_urls("https://httpbun.com/status/404")
# Feedburner/Google links
assert handle_link_list(["https://feedproxy.google.com/ABCD"], params) == [
"https://feedproxy.google.com/ABCD"
@@ -271,7 +271,7 @@

def test_cli_behavior():
"""Test command-line interface with respect to feeds"""
testargs = ["", "--list", "--feed", "https://httpbun.org/xml"]
testargs = ["", "--list", "--feed", "https://httpbun.com/xml"]
with patch.object(sys, "argv", testargs):
assert main() is None

tests/sitemaps_tests.py (2 changes: 1 addition & 1 deletion)
@@ -155,7 +155,7 @@ def test_extraction():
def test_robotstxt():
'''Check if sitemaps can be found over robots.txt'''
assert not sitemaps.find_robots_sitemaps('https://http.org')
-baseurl = 'https://httpbun.org'
+baseurl = 'https://httpbun.com'
assert not sitemaps.find_robots_sitemaps(baseurl)
assert not sitemaps.extract_robots_sitemaps('# test', baseurl)
assert not sitemaps.extract_robots_sitemaps('# test'*10000, baseurl)
tests/spider_tests.py (46 changes: 23 additions & 23 deletions)
@@ -27,40 +27,40 @@ def test_redirections():
"Test redirection detection."
_, _, baseurl = spider.probe_alternative_homepage('xyz')
assert baseurl is None
-_, _, baseurl = spider.probe_alternative_homepage('https://httpbun.org/redirect-to?url=https://example.org')
+_, _, baseurl = spider.probe_alternative_homepage('https://httpbun.com/redirect-to?url=https://example.org')
assert baseurl == 'https://example.org'
#_, _, baseurl = spider.probe_alternative_homepage('https://httpbin.org/redirect-to?url=https%3A%2F%2Fhttpbin.org%2Fhtml&status_code=302')


def test_meta_redirections():
"Test redirection detection using meta tag."
# empty
htmlstring, homepage = '"refresh"', 'https://httpbun.org/'
htmlstring, homepage = '"refresh"', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage
-htmlstring, homepage = '<html></html>', 'https://httpbun.org/'
+htmlstring, homepage = '<html></html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage

# unusable
-htmlstring, homepage = '<html>REDIRECT!</html>', 'https://httpbun.org/'
+htmlstring, homepage = '<html>REDIRECT!</html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage

# malformed
htmlstring, homepage = '<html><meta http-equiv="refresh" content="3600\n&lt;meta http-equiv=" content-type=""></html>', 'https://httpbun.org/'
htmlstring, homepage = '<html><meta http-equiv="refresh" content="3600\n&lt;meta http-equiv=" content-type=""></html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage

# wrong URL
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=1234"/></html>', 'https://httpbun.org/'
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=1234"/></html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 is None and homepage2 is None

# normal
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=https://httpbun.org/html"/></html>', 'http://test.org/'
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=https://httpbun.com/html"/></html>', 'http://test.org/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
-assert htmlstring2 is not None and homepage2 == 'https://httpbun.org/html'
+assert htmlstring2 is not None and homepage2 == 'https://httpbun.com/html'


def test_process_links():
@@ -103,7 +103,7 @@ def test_process_links():

def test_crawl_logic():
"Test functions related to crawling sequence and consistency."
-url = 'https://httpbun.org/html'
+url = 'https://httpbun.com/html'
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# erroneous webpage
with pytest.raises(ValueError):
@@ -118,31 +118,31 @@ def test_crawl_logic():
base_url, i, known_num, rules, is_on = spider.init_crawl(url, None, None)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
-assert todo == [] and known_links == [url,] and base_url == 'https://httpbun.org' and i == 1
+assert todo == [] and known_links == [url,] and base_url == 'https://httpbun.com' and i == 1
# delay between requests
-assert spider.URL_STORE.get_crawl_delay('https://httpbun.org') == 5
-assert spider.URL_STORE.get_crawl_delay('https://httpbun.org', default=2.0) == 2.0
+assert spider.URL_STORE.get_crawl_delay('https://httpbun.com') == 5
+assert spider.URL_STORE.get_crawl_delay('https://httpbun.com', default=2.0) == 2.0
# existing todo
spider.URL_STORE = UrlStore(compressed=False, strict=False)
base_url, i, known_num, rules, is_on = spider.init_crawl(url, [url,], None)
-assert base_url == 'https://httpbun.org' and i == 0
+assert base_url == 'https://httpbun.com' and i == 0


def test_crawl_page():
"Test page-by-page processing."
-base_url = 'https://httpbun.org'
+base_url = 'https://httpbun.com'
spider.URL_STORE = UrlStore(compressed=False, strict=False)
-spider.URL_STORE.add_urls(['https://httpbun.org/links/2/2'])
-is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.org')
+spider.URL_STORE.add_urls(['https://httpbun.com/links/2/2'])
+is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.com')
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
-assert sorted(todo) == ['https://httpbun.org/links/2/0', 'https://httpbun.org/links/2/1']
+assert sorted(todo) == ['https://httpbun.com/links/2/0', 'https://httpbun.com/links/2/1']
assert len(known_links) == 3 and visited_num == 1
# initial page
spider.URL_STORE = UrlStore(compressed=False, strict=False)
-spider.URL_STORE.add_urls(['https://httpbun.org/html'])
+spider.URL_STORE.add_urls(['https://httpbun.com/html'])
# if LANGID_FLAG is True:
-is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.org', initial=True, lang='de')
+is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.com', initial=True, lang='de')
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert len(todo) == 0 and len(known_links) == 1 and visited_num == 1
@@ -152,10 +152,10 @@ def test_crawl_page():
def test_focused_crawler():
"Test the whole focused crawler mechanism."
spider.URL_STORE = UrlStore()
todo, known_links = spider.focused_crawler("https://httpbun.org/links/1/1", max_seen_urls=1)
## TODO: check this on Github actions:
# assert sorted(known_links) == ['https://httpbun.org/links/1/0', 'https://httpbun.org/links/1/1']
# assert sorted(todo) == ['https://httpbun.org/links/1/0']
todo, known_links = spider.focused_crawler("https://httpbun.com/links/1/1", max_seen_urls=1)
## fails on Github Actions
## assert sorted(known_links) == ['https://httpbun.com/links/1/0', 'https://httpbun.com/links/1/1']
## assert sorted(todo) == ['https://httpbun.com/links/1/0']


if __name__ == '__main__':