diff --git a/docs/api.rst b/docs/api.rst index 01f5755d4..c91699de0 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -937,18 +937,18 @@ Example contents of this file:: [rules] ; optional, default ".*" - whitelist= + allowlist= .*mywebsite\.com.* - ; optional, default is no blacklist - blacklist= + ; optional, default is no denylist + denylist= .*\.js.* .*\.css.* .*\.png -whitelist and blacklist are newline-separated lists of regexes. -If URL matches one of whitelist patterns and matches none of blacklist -patterns, proxy specified in ``[proxy]`` section is used; +``allowlist`` and ``denylist`` are newline-separated lists of regexes. +If a URL matches one of the allowlist patterns and matches none of the denylist +patterns, the proxy specified in the ``[proxy]`` section is used; no proxy is used otherwise. Then, to apply proxy rules according to this profile, diff --git a/docs/internals/js-python-lua.rst b/docs/internals/js-python-lua.rst index c2a166c8e..1240b151d 100644 --- a/docs/internals/js-python-lua.rst +++ b/docs/internals/js-python-lua.rst @@ -39,7 +39,7 @@ Conversion rules: * If there is a need to expose a custom Python object to Lua then a subclass of :class:`splash.qtrender_lua.BaseExposedObject` is used; it is wrapped to a Lua table using utilities from wraputils.lua. - Lua table exposes whitelisted attributes and methods of the object + Lua table exposes allowlisted attributes and methods of the object using metatable, and disallows access to all other attributes. * Other than that, there is no automatic conversion. If something is not diff --git a/docs/scripting-libs.rst b/docs/scripting-libs.rst index 48d160091..44f751fef 100644 --- a/docs/scripting-libs.rst +++ b/docs/scripting-libs.rst @@ -347,7 +347,7 @@ Example:: When you use a :ref:`Lua sandbox ` (default) Lua ``require`` function is restricted when used in scripts: it only allows to load -modules from a whitelist. This whitelist is empty by default, i.e. by default +modules from an allowlist.
This allowlist is empty by default, i.e. by default you can require nothing. To make your modules available for scripts start Splash with ``--lua-sandbox-allowed-modules`` option. It should contain a semicolon-separated list of Lua module names allowed in a sandbox:: diff --git a/splash/proxy.py b/splash/proxy.py index 97b8b666c..657e074ae 100644 --- a/splash/proxy.py +++ b/splash/proxy.py @@ -5,10 +5,11 @@ which proxies to use for a given request. QNetworkManager calls a proxy factory for each outgoing request. """ -import re +import configparser import os +import re +import warnings from urllib.parse import urlparse -import configparser from PyQt5.QtNetwork import QNetworkProxy @@ -21,15 +22,15 @@ def _raise_proxy_error(description, **kwargs): RenderOptions.raise_error("proxy", description, **kwargs) -class _BlackWhiteSplashProxyFactory(object): +class _AllowDenySplashProxyFactory(object): """ Proxy factory that enables non-default proxy list when - requested URL is matched by one of whitelist patterns - while not being matched by one of the blacklist patterns. + requested URL is matched by one of the allowlist patterns + while not being matched by one of the denylist patterns. 
""" - def __init__(self, blacklist=None, whitelist=None, proxy_list=None): - self.blacklist = blacklist or [] - self.whitelist = whitelist or [] + def __init__(self, allowlist=None, denylist=None, proxy_list=None): + self.allowlist = allowlist or [] + self.denylist = denylist or [] self.proxy_list = proxy_list or [] def queryProxy(self, query=None, *args, **kwargs): @@ -47,13 +48,13 @@ def should_use_proxy_list(self, protocol, url): # don't try to proxy unknown protocols return False - if any(re.match(p, url) for p in self.blacklist): + if any(re.match(p, url) for p in self.denylist): return False - if any(re.match(p, url) for p in self.whitelist): + if any(re.match(p, url) for p in self.allowlist): return True - return not bool(self.whitelist) + return not bool(self.allowlist) def _get_default_proxy_list(self): return [QNetworkProxy(QNetworkProxy.DefaultProxy)] @@ -65,9 +66,9 @@ def _get_custom_proxy_list(self): ] -class ProfilesSplashProxyFactory(_BlackWhiteSplashProxyFactory): +class ProfilesSplashProxyFactory(_AllowDenySplashProxyFactory): r""" - This proxy factory reads BlackWhiteQNetworkProxyFactory + This proxy factory reads _AllowDenySplashProxyFactory parameters from ini file; name of the profile can be set per-request using GET parameter. 
@@ -82,10 +83,10 @@ class ProfilesSplashProxyFactory(_BlackWhiteSplashProxyFactory): type=HTTP [rules] - whitelist= + allowlist= .*mywebsite\.com.* - blacklist= + denylist= .*\.js.* .*\.css.* .*\.png @@ -99,12 +100,16 @@ class ProfilesSplashProxyFactory(_BlackWhiteSplashProxyFactory): def __init__(self, proxy_profiles_path, profile_name): self.proxy_profiles_path = proxy_profiles_path - blacklist, whitelist, proxy_list = self._get_filter_params(profile_name) - super(ProfilesSplashProxyFactory, self).__init__(blacklist, whitelist, proxy_list) + allowlist, denylist, proxy_list = self._get_filter_params(profile_name) + super(ProfilesSplashProxyFactory, self).__init__( + allowlist=allowlist, + denylist=denylist, + proxy_list=proxy_list, + ) def _get_filter_params(self, profile_name=None): """ - Return (blacklist, whitelist, proxy_list) tuple + Return an (allowlist, denylist, proxy_list) tuple loaded from profile ``profile_name``. """ if profile_name is None: @@ -132,8 +137,20 @@ def _parse_ini(self, ini_path): if not parser.read(ini_path): _raise_proxy_error(self.NO_PROXY_PROFILE_MSG) - blacklist = _get_lines(parser, 'rules', 'blacklist', []) - whitelist = _get_lines(parser, 'rules', 'whitelist', []) + allowlist = _get_lines(parser, 'rules', 'whitelist', []) + if allowlist: + warnings.warn('{}: ‘whitelist’ is deprecated, use ‘allowlist’ ' + 'instead'.format(ini_path), DeprecationWarning) + else: + allowlist = _get_lines(parser, 'rules', 'allowlist', []) + + denylist = _get_lines(parser, 'rules', 'blacklist', []) + if denylist: + warnings.warn('{}: ‘blacklist’ is deprecated, use ‘denylist’ ' + 'instead'.format(ini_path), DeprecationWarning) + else: + denylist = _get_lines(parser, 'rules', 'denylist', []) + try: proxy = dict(parser.items('proxy')) except configparser.NoSectionError: @@ -160,7 +177,7 @@ def _parse_ini(self, ini_path): proxy_list = [(host, port, proxy.get('username'), proxy.get('password'), proxy.get('type'))] - return blacklist, whitelist, proxy_list +
return allowlist, denylist, proxy_list class DirectSplashProxyFactory(object): diff --git a/splash/tests/proxy_profiles/test.ini b/splash/tests/proxy_profiles/test.ini index c9a8ede7e..cb7346fcf 100644 --- a/splash/tests/proxy_profiles/test.ini +++ b/splash/tests/proxy_profiles/test.ini @@ -4,9 +4,9 @@ port = 8990 [rules] -whitelist = +allowlist = .* -blacklist = +denylist = .*\.js$ .*1\.html$ diff --git a/splash/tests/proxy_profiles/test_deprecated.ini b/splash/tests/proxy_profiles/test_deprecated.ini new file mode 100644 index 000000000..c9a8ede7e --- /dev/null +++ b/splash/tests/proxy_profiles/test_deprecated.ini @@ -0,0 +1,12 @@ +[proxy] +host = 0.0.0.0 +port = 8990 + +[rules] + +whitelist = + .* + +blacklist = + .*\.js$ + .*1\.html$ diff --git a/splash/tests/test_proxy.py b/splash/tests/test_proxy.py index d86b41618..719f7a8eb 100644 --- a/splash/tests/test_proxy.py +++ b/splash/tests/test_proxy.py @@ -2,12 +2,13 @@ import os import shutil import unittest +import warnings import pytest import requests from splash.proxy import ( - _BlackWhiteSplashProxyFactory, + _AllowDenySplashProxyFactory, ProfilesSplashProxyFactory, DirectSplashProxyFactory ) @@ -17,38 +18,37 @@ from splash.tests.utils import MockServers -class BlackWhiteProxyFactoryTest(unittest.TestCase): +class AllowDenyProxyFactoryTest(unittest.TestCase): def _factory(self, **kwargs): params = { "proxy_list": [("proxy.crawlera.com", 8010, "username", "password")], - "whitelist": [ + 'allowlist': [ r".*scrapinghub\.com.*", ], - "blacklist": [ + 'denylist': [ r".*\.js", r".*\.css", ] } params.update(kwargs) - return _BlackWhiteSplashProxyFactory(**params) + return _AllowDenySplashProxyFactory(**params) def test_noproxy(self): - f = _BlackWhiteSplashProxyFactory() + f = _AllowDenySplashProxyFactory() self.assertFalse(f.should_use_proxy_list('http', 'crawlera.com')) - def test_whitelist(self): + def test_allowlist(self): self.assertUsesCustom('http://www.scrapinghub.com') 
self.assertUsesDefault('http://www.google-analytics.com/ga.js') self.assertUsesDefault('http://crawlera.com') - def test_blacklist(self): + def test_denylist(self): self.assertUsesDefault('http://www.scrapinghub.com/static/styles/screen.css') - def test_no_whitelist(self): - self.assertUsesCustom('http://crawlera.com', whitelist=[]) - self.assertUsesDefault('http://www.google-analytics.com/ga.js', whitelist=[]) - + def test_no_allowlist(self): + self.assertUsesCustom('http://crawlera.com', allowlist=[]) + self.assertUsesDefault('http://www.google-analytics.com/ga.js', allowlist=[]) def assertUsesDefault(self, url, protocol='http', **kwargs): f = self._factory(**kwargs) @@ -95,30 +95,7 @@ def assertNotProxied(self, html): assert 'PROXY_USED' not in html -class HtmlProxyRenderTest(BaseHtmlProxyTest): - - def test_proxy_works(self): - r1 = self.request({'url': self.mockurl('jsrender')}) - self.assertNotProxied(r1.text) - - r2 = self.request({'url': self.mockurl('jsrender'), 'proxy': 'test'}) - self.assertProxied(r2.text) - - def test_blacklist(self): - params = {'url': self.mockurl('iframes'), - 'proxy': 'test', 'html': 1, 'iframes': 1} - r = self.request(params, endpoint='render.json') - data = r.json() - - # only 1.html is blacklisted in test.ini - self.assertProxied(data['html']) - assert any('1.html' in f['requestedUrl'] for f in data['childFrames']) - - for frame in data['childFrames']: - if '1.html' in frame['requestedUrl']: - self.assertNotProxied(frame['html']) - else: - self.assertProxied(frame['html']) +class HtmlProxyBadRenderTest(BaseHtmlProxyTest): def test_insecure(self): r = self.request({'url': self.mockurl('jsrender'), @@ -144,6 +121,47 @@ def test_no_proxy_settings(self): self.assertJsonError(r, 400, 'BadOption') +class HtmlProxyRenderTest(BaseHtmlProxyTest): + profile = 'test' + + def test_proxy_works(self): + r1 = self.request({'url': self.mockurl('jsrender')}) + self.assertNotProxied(r1.text) + + r2 = self.request({'url': self.mockurl('jsrender'), 
+ 'proxy': self.profile}) + self.assertProxied(r2.text) + + def test_denylist(self): + params = {'url': self.mockurl('iframes'), + 'proxy': self.profile, 'html': 1, 'iframes': 1} + r = self.request(params, endpoint='render.json') + data = r.json() + + # only 1.html is denylisted in the profile under test (test.ini or test_deprecated.ini) + self.assertProxied(data['html']) + assert any('1.html' in f['requestedUrl'] for f in data['childFrames']) + + for frame in data['childFrames']: + if '1.html' in frame['requestedUrl']: + self.assertNotProxied(frame['html']) + else: + self.assertProxied(frame['html']) + + +class HtmlProxyRenderDeprecatedProfileTest(HtmlProxyRenderTest): + profile = 'test_deprecated' + + def test_deprecated(self): + warnings.simplefilter('always') + profiles_path = os.path.abspath(os.path.join(os.path.dirname(__file__), + 'proxy_profiles')) + with pytest.warns(DeprecationWarning, match='whitelist'): + ProfilesSplashProxyFactory(profiles_path, self.profile) + with pytest.warns(DeprecationWarning, match='blacklist'): + ProfilesSplashProxyFactory(profiles_path, self.profile) + + class HtmlProxyDefaultProfileTest(BaseHtmlProxyTest): def ts2_request(self, ts2, query, endpoint='render.html'): diff --git a/splash/tests/utils.py b/splash/tests/utils.py index 80f1930e0..3e0979990 100644 --- a/splash/tests/utils.py +++ b/splash/tests/utils.py @@ -166,12 +166,14 @@ def _copy_test_folder(self, src, dst=None): return dst_path def _fix_testproxy_port(self): - filename = os.path.join(self.proxy_profiles_path, u'test.ini') - with open(filename, 'rb') as f: - data = f.read().decode('utf-8') - data = data.replace(u'8990', str(self.mock_proxy_port)) - with open(filename, 'wb') as f: - f.write(data.encode('utf-8')) + for profile in (u'test', u'test_deprecated'): + filename = os.path.join(self.proxy_profiles_path, + profile + u'.ini') + with open(filename, 'rb') as f: + data = f.read().decode('utf-8') + data = data.replace(u'8990', str(self.mock_proxy_port)) + with open(filename, 'wb') as f: +
f.write(data.encode('utf-8')) def __enter__(self): self.mockserver = MockServer(