Skip to content

Commit

Permalink
Apply allowlist/denylist wording (#1110)
Browse files Browse the repository at this point in the history
Applied mainly to proxy profiles.

Also included a change to allow overriding the tests to execute with ‘docker run’ parameters.
  • Loading branch information
Gallaecio authored Feb 21, 2021
1 parent 3329bd7 commit 802d839
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 73 deletions.
12 changes: 6 additions & 6 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -937,18 +937,18 @@ Example contents of this file::

[rules]
; optional, default ".*"
whitelist=
allowlist=
.*mywebsite\.com.*

; optional, default is no blacklist
blacklist=
; optional, default is no denylist
denylist=
.*\.js.*
.*\.css.*
.*\.png

whitelist and blacklist are newline-separated lists of regexes.
If URL matches one of whitelist patterns and matches none of blacklist
patterns, proxy specified in ``[proxy]`` section is used;
``allowlist`` and ``denylist`` are newline-separated lists of regexes.
If URL matches one of the allowlist patterns and matches none of the denylist
patterns, the proxy specified in the ``[proxy]`` section is used;
no proxy is used otherwise.

Then, to apply proxy rules according to this profile,
Expand Down
2 changes: 1 addition & 1 deletion docs/internals/js-python-lua.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Conversion rules:
* If there is a need to expose a custom Python object to Lua then
a subclass of :class:`splash.qtrender_lua.BaseExposedObject` is used; it is
wrapped to a Lua table using utilities from wraputils.lua.
Lua table exposes whitelisted attributes and methods of the object
Lua table exposes allowlisted attributes and methods of the object
using metatable, and disallows access to all other attributes.

* Other than that, there is no automatic conversion. If something is not
Expand Down
2 changes: 1 addition & 1 deletion docs/scripting-libs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ Example::

When you use a :ref:`Lua sandbox <lua-sandbox>` (default) Lua ``require``
function is restricted when used in scripts: it only allows loading
modules from a whitelist. This whitelist is empty by default, i.e. by default
modules from an allowlist. This allowlist is empty by default, i.e. by default
you can require nothing. To make your modules available for scripts start
Splash with ``--lua-sandbox-allowed-modules`` option. It should contain a
semicolon-separated list of Lua module names allowed in a sandbox::
Expand Down
59 changes: 38 additions & 21 deletions splash/proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
which proxies to use for a given request. QNetworkManager calls
a proxy factory for each outgoing request.
"""
import re
import configparser
import os
import re
import warnings
from urllib.parse import urlparse
import configparser

from PyQt5.QtNetwork import QNetworkProxy

Expand All @@ -21,15 +22,15 @@ def _raise_proxy_error(description, **kwargs):
RenderOptions.raise_error("proxy", description, **kwargs)


class _BlackWhiteSplashProxyFactory(object):
class _AllowDenySplashProxyFactory(object):
"""
Proxy factory that enables non-default proxy list when
requested URL is matched by one of whitelist patterns
while not being matched by one of the blacklist patterns.
requested URL is matched by one of the allowlist patterns
while not being matched by one of the denylist patterns.
"""
def __init__(self, blacklist=None, whitelist=None, proxy_list=None):
self.blacklist = blacklist or []
self.whitelist = whitelist or []
def __init__(self, allowlist=None, denylist=None, proxy_list=None):
self.allowlist = allowlist or []
self.denylist = denylist or []
self.proxy_list = proxy_list or []

def queryProxy(self, query=None, *args, **kwargs):
Expand All @@ -47,13 +48,13 @@ def should_use_proxy_list(self, protocol, url):
# don't try to proxy unknown protocols
return False

if any(re.match(p, url) for p in self.blacklist):
if any(re.match(p, url) for p in self.denylist):
return False

if any(re.match(p, url) for p in self.whitelist):
if any(re.match(p, url) for p in self.allowlist):
return True

return not bool(self.whitelist)
return not bool(self.allowlist)

def _get_default_proxy_list(self):
return [QNetworkProxy(QNetworkProxy.DefaultProxy)]
Expand All @@ -65,9 +66,9 @@ def _get_custom_proxy_list(self):
]


class ProfilesSplashProxyFactory(_BlackWhiteSplashProxyFactory):
class ProfilesSplashProxyFactory(_AllowDenySplashProxyFactory):
r"""
This proxy factory reads BlackWhiteQNetworkProxyFactory
This proxy factory reads _AllowDenySplashProxyFactory
parameters from an ini file; the name of the profile can be set per-request
using a GET parameter.
Expand All @@ -82,10 +83,10 @@ class ProfilesSplashProxyFactory(_BlackWhiteSplashProxyFactory):
type=HTTP
[rules]
whitelist=
allowlist=
.*mywebsite\.com.*
blacklist=
denylist=
.*\.js.*
.*\.css.*
.*\.png
Expand All @@ -99,12 +100,16 @@ class ProfilesSplashProxyFactory(_BlackWhiteSplashProxyFactory):

def __init__(self, proxy_profiles_path, profile_name):
self.proxy_profiles_path = proxy_profiles_path
blacklist, whitelist, proxy_list = self._get_filter_params(profile_name)
super(ProfilesSplashProxyFactory, self).__init__(blacklist, whitelist, proxy_list)
allowlist, denylist, proxy_list = self._get_filter_params(profile_name)
super(ProfilesSplashProxyFactory, self).__init__(
allowlist=allowlist,
denylist=denylist,
proxy_list=proxy_list,
)

def _get_filter_params(self, profile_name=None):
"""
Return (blacklist, whitelist, proxy_list) tuple
Return an (allowlist, denylist, proxy_list) tuple
loaded from profile ``profile_name``.
"""
if profile_name is None:
Expand Down Expand Up @@ -132,8 +137,20 @@ def _parse_ini(self, ini_path):
if not parser.read(ini_path):
_raise_proxy_error(self.NO_PROXY_PROFILE_MSG)

blacklist = _get_lines(parser, 'rules', 'blacklist', [])
whitelist = _get_lines(parser, 'rules', 'whitelist', [])
allowlist = _get_lines(parser, 'rules', 'whitelist', [])
if allowlist:
warnings.warn('{}: ‘whitelist’ is deprecated, use ‘allowlist’ '
'instead'.format(ini_path), DeprecationWarning)
else:
allowlist = _get_lines(parser, 'rules', 'allowlist', [])

denylist = _get_lines(parser, 'rules', 'blacklist', [])
if denylist:
warnings.warn('{}: ‘blacklist’ is deprecated, use ‘denylist’ '
'instead'.format(ini_path), DeprecationWarning)
else:
denylist = _get_lines(parser, 'rules', 'denylist', [])

try:
proxy = dict(parser.items('proxy'))
except configparser.NoSectionError:
Expand All @@ -160,7 +177,7 @@ def _parse_ini(self, ini_path):
proxy_list = [(host, port,
proxy.get('username'), proxy.get('password'),
proxy.get('type'))]
return blacklist, whitelist, proxy_list
return allowlist, denylist, proxy_list


class DirectSplashProxyFactory(object):
Expand Down
4 changes: 2 additions & 2 deletions splash/tests/proxy_profiles/test.ini
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ port = 8990

[rules]

whitelist =
allowlist =
.*

blacklist =
denylist =
.*\.js$
.*1\.html$
12 changes: 12 additions & 0 deletions splash/tests/proxy_profiles/test_deprecated.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[proxy]
host = 0.0.0.0
port = 8990

[rules]

whitelist =
.*

blacklist =
.*\.js$
.*1\.html$
90 changes: 54 additions & 36 deletions splash/tests/test_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@
import os
import shutil
import unittest
import warnings

import pytest
import requests

from splash.proxy import (
_BlackWhiteSplashProxyFactory,
_AllowDenySplashProxyFactory,
ProfilesSplashProxyFactory,
DirectSplashProxyFactory
)
Expand All @@ -17,38 +18,37 @@
from splash.tests.utils import MockServers


class BlackWhiteProxyFactoryTest(unittest.TestCase):
class AllowDenyProxyFactoryTest(unittest.TestCase):

def _factory(self, **kwargs):
params = {
"proxy_list": [("proxy.crawlera.com", 8010, "username", "password")],
"whitelist": [
'allowlist': [
r".*scrapinghub\.com.*",
],
"blacklist": [
'denylist': [
r".*\.js",
r".*\.css",
]
}
params.update(kwargs)
return _BlackWhiteSplashProxyFactory(**params)
return _AllowDenySplashProxyFactory(**params)

def test_noproxy(self):
f = _BlackWhiteSplashProxyFactory()
f = _AllowDenySplashProxyFactory()
self.assertFalse(f.should_use_proxy_list('http', 'crawlera.com'))

def test_whitelist(self):
def test_allowlist(self):
self.assertUsesCustom('http://www.scrapinghub.com')
self.assertUsesDefault('http://www.google-analytics.com/ga.js')
self.assertUsesDefault('http://crawlera.com')

def test_blacklist(self):
def test_denylist(self):
self.assertUsesDefault('http://www.scrapinghub.com/static/styles/screen.css')

def test_no_whitelist(self):
self.assertUsesCustom('http://crawlera.com', whitelist=[])
self.assertUsesDefault('http://www.google-analytics.com/ga.js', whitelist=[])

def test_no_allowlist(self):
self.assertUsesCustom('http://crawlera.com', allowlist=[])
self.assertUsesDefault('http://www.google-analytics.com/ga.js', allowlist=[])

def assertUsesDefault(self, url, protocol='http', **kwargs):
f = self._factory(**kwargs)
Expand Down Expand Up @@ -95,30 +95,7 @@ def assertNotProxied(self, html):
assert 'PROXY_USED' not in html


class HtmlProxyRenderTest(BaseHtmlProxyTest):

def test_proxy_works(self):
r1 = self.request({'url': self.mockurl('jsrender')})
self.assertNotProxied(r1.text)

r2 = self.request({'url': self.mockurl('jsrender'), 'proxy': 'test'})
self.assertProxied(r2.text)

def test_blacklist(self):
params = {'url': self.mockurl('iframes'),
'proxy': 'test', 'html': 1, 'iframes': 1}
r = self.request(params, endpoint='render.json')
data = r.json()

# only 1.html is blacklisted in test.ini
self.assertProxied(data['html'])
assert any('1.html' in f['requestedUrl'] for f in data['childFrames'])

for frame in data['childFrames']:
if '1.html' in frame['requestedUrl']:
self.assertNotProxied(frame['html'])
else:
self.assertProxied(frame['html'])
class HtmlProxyBadRenderTest(BaseHtmlProxyTest):

def test_insecure(self):
r = self.request({'url': self.mockurl('jsrender'),
Expand All @@ -144,6 +121,47 @@ def test_no_proxy_settings(self):
self.assertJsonError(r, 400, 'BadOption')


class HtmlProxyRenderTest(BaseHtmlProxyTest):
    """Rendering tests that run against the proxy profile named by ``profile``.

    Subclasses may override ``profile`` to repeat the same checks with a
    different profile file.
    """
    profile = 'test'

    def test_proxy_works(self):
        """A request is proxied only when a proxy profile is requested."""
        plain = self.request({'url': self.mockurl('jsrender')})
        self.assertNotProxied(plain.text)

        query = {'url': self.mockurl('jsrender'), 'proxy': self.profile}
        proxied = self.request(query)
        self.assertProxied(proxied.text)

    def test_denylist(self):
        """Frames whose URL matches the profile denylist bypass the proxy."""
        query = {
            'url': self.mockurl('iframes'),
            'proxy': self.profile,
            'html': 1,
            'iframes': 1,
        }
        response = self.request(query, endpoint='render.json')
        data = response.json()

        # Among the rendered frames only the one ending in 1.html matches
        # a denylist rule of the profile; the main page itself is proxied.
        self.assertProxied(data['html'])
        assert any('1.html' in f['requestedUrl'] for f in data['childFrames'])

        for child in data['childFrames']:
            denied = '1.html' in child['requestedUrl']
            check = self.assertNotProxied if denied else self.assertProxied
            check(child['html'])


class HtmlProxyRenderDeprecatedProfileTest(HtmlProxyRenderTest):
    """Repeat the proxy rendering tests with a profile that still uses the
    deprecated ``whitelist``/``blacklist`` option names.
    """
    profile = 'test_deprecated'

    def test_deprecated(self):
        """Loading the profile warns about each deprecated option name."""
        profiles_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), 'proxy_profiles'))
        # pytest.warns() records every warning raised inside its block and
        # restores the previous filters on exit, so there is no need to call
        # warnings.simplefilter('always') here; doing so would change the
        # process-wide filters for all tests that run afterwards.
        for option in ('whitelist', 'blacklist'):
            with pytest.warns(DeprecationWarning, match=option):
                ProfilesSplashProxyFactory(profiles_path, self.profile)


class HtmlProxyDefaultProfileTest(BaseHtmlProxyTest):

def ts2_request(self, ts2, query, endpoint='render.html'):
Expand Down
14 changes: 8 additions & 6 deletions splash/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,14 @@ def _copy_test_folder(self, src, dst=None):
return dst_path

def _fix_testproxy_port(self):
filename = os.path.join(self.proxy_profiles_path, u'test.ini')
with open(filename, 'rb') as f:
data = f.read().decode('utf-8')
data = data.replace(u'8990', str(self.mock_proxy_port))
with open(filename, 'wb') as f:
f.write(data.encode('utf-8'))
for profile in (u'test', u'test_deprecated'):
filename = os.path.join(self.proxy_profiles_path,
profile + u'.ini')
with open(filename, 'rb') as f:
data = f.read().decode('utf-8')
data = data.replace(u'8990', str(self.mock_proxy_port))
with open(filename, 'wb') as f:
f.write(data.encode('utf-8'))

def __enter__(self):
self.mockserver = MockServer(
Expand Down

0 comments on commit 802d839

Please sign in to comment.