From ac63622915c54cadf8e12eeb02520e308c8b2991 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 5 Jul 2024 15:05:47 +1000 Subject: [PATCH 01/10] use duplicate_url_discarder_rules.RULE_PATHS by default if DUD_LOAD_RULE_PATHS isn't set --- duplicate_url_discarder/_fingerprinter.py | 16 +++++++++++++++- pyproject.toml | 3 +++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/duplicate_url_discarder/_fingerprinter.py b/duplicate_url_discarder/_fingerprinter.py index ae02ac5..75712e6 100644 --- a/duplicate_url_discarder/_fingerprinter.py +++ b/duplicate_url_discarder/_fingerprinter.py @@ -20,6 +20,12 @@ logger = logging.getLogger(__name__) +try: + from duplicate_url_discarder_rules import RULE_PATHS as default_rule_paths + from importlib.metadata import version +except ImportError: + default_rule_paths = None + class Fingerprinter: def __init__(self, crawler: Crawler): @@ -28,7 +34,15 @@ def __init__(self, crawler: Crawler): "DUD_LOAD_RULE_PATHS" ) if not rule_paths: - logger.warning("DUD_LOAD_RULE_PATHS is not set or is empty.") + msg = "DUD_LOAD_RULE_PATHS is not set or is empty." + if default_rule_paths: + rule_paths = default_rule_paths + v = version("duplicate-url-discarder-rules") + msg += ( + f" Using RULE_PATHS from duplicate-url-discarder-rules=={v} instead." 
+ ) + logger.warning(msg) + self._fallback_request_fingerprinter: RequestFingerprinterProtocol = ( create_instance( load_object( diff --git a/pyproject.toml b/pyproject.toml index a717b95..5ca56f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,9 @@ dynamic = ["version"] [project.urls] Source = "https://github.com/zytedata/duplicate-url-discarder" +[project.optional-dependencies] +rules = ["duplicate-url-discarder-rules"] + [tool.setuptools.dynamic] version = {attr = "duplicate_url_discarder.__version__"} From a457b498dd5e2bc606388e01a9a3910aa6ebf6b3 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 5 Jul 2024 15:19:03 +1000 Subject: [PATCH 02/10] update README regarding rule installation --- README.rst | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index c75f0d2..b4ab8eb 100644 --- a/README.rst +++ b/README.rst @@ -35,6 +35,16 @@ Installation pip install duplicate-url-discarder +Alternatively, you can also include in the installation the predefined rules in +`duplicate-url-discarder-rules`_ via: + +.. code-block:: + + pip install duplicate-url-discarder[rules] + +If such rules are installed, they will automatically be used if the +``DUD_LOAD_RULE_PATHS`` setting is left empty (see `configuration`_). + Requires **Python 3.8+**. Using @@ -131,6 +141,8 @@ All non-universal rules (ones that have non-empty include pattern) that match a request URL are applied according to their order field. If there are no non-universal rules that match the URL, the universal ones are applied. +.. _configuration: + Configuration ============= @@ -145,6 +157,9 @@ Configuration "/home/user/project/custom_rules1.json", ] -The default value of this setting is empty. +The default value of this setting is empty. However, if the package +`duplicate-url-discarder-rules`_ is installed and ``DUD_LOAD_RULE_PATHS`` +has been left empty, the rules in said package are automatically used. .. 
_scrapy-zyte-api: https://github.com/scrapy-plugins/scrapy-zyte-api +.. _duplicate-url-discarder-rules: https://github.com/zytedata/duplicate-url-discarder-rules From 2a92afe2cd11be327b72f3200e62444f5c2af82c Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 5 Jul 2024 15:45:32 +1000 Subject: [PATCH 03/10] add tests for presence of rules --- .github/workflows/test.yml | 2 ++ duplicate_url_discarder/_fingerprinter.py | 7 +++---- tests/test_fingerprinter.py | 15 +++++++++++++++ tox.ini | 6 ++++++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6c12df8..b2e1cfe 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,6 +5,7 @@ name: tox on: pull_request: + branches: [ main ] push: branches: [ main ] @@ -17,6 +18,7 @@ jobs: matrix: include: - python-version: "3.8" + toxenv: rules - python-version: "3.9" - python-version: "3.10" - python-version: "3.11" diff --git a/duplicate_url_discarder/_fingerprinter.py b/duplicate_url_discarder/_fingerprinter.py index 75712e6..c641dee 100644 --- a/duplicate_url_discarder/_fingerprinter.py +++ b/duplicate_url_discarder/_fingerprinter.py @@ -21,8 +21,9 @@ logger = logging.getLogger(__name__) try: - from duplicate_url_discarder_rules import RULE_PATHS as default_rule_paths from importlib.metadata import version + + from duplicate_url_discarder_rules import RULE_PATHS as default_rule_paths except ImportError: default_rule_paths = None @@ -38,9 +39,7 @@ def __init__(self, crawler: Crawler): if default_rule_paths: rule_paths = default_rule_paths v = version("duplicate-url-discarder-rules") - msg += ( - f" Using RULE_PATHS from duplicate-url-discarder-rules=={v} instead." - ) + msg += f" Using RULE_PATHS from duplicate-url-discarder-rules=={v} instead." 
logger.warning(msg) self._fallback_request_fingerprinter: RequestFingerprinterProtocol = ( diff --git a/tests/test_fingerprinter.py b/tests/test_fingerprinter.py index c8dab98..025b019 100644 --- a/tests/test_fingerprinter.py +++ b/tests/test_fingerprinter.py @@ -2,6 +2,7 @@ from pathlib import Path from typing import Any, Dict +import pytest from scrapy import Request, Spider from scrapy.dupefilters import BaseDupeFilter, RFPDupeFilter from scrapy.utils.test import get_crawler @@ -95,3 +96,17 @@ def get_stat(stat: str) -> Any: ) ) assert get_stat("url_modified") == 3 + + +try: + from duplicate_url_discarder_rules import RULE_PATHS as default_rule_paths +except ImportError: + default_rule_paths = None + + +def test_default_rules(): + fingerprinter = get_fingerprinter({}) + if default_rule_paths: + assert len(fingerprinter.url_canonicalizer.processors) > 0 + else: + assert len(fingerprinter.url_canonicalizer.processors) == 0 diff --git a/tox.ini b/tox.ini index 920352e..9716195 100644 --- a/tox.ini +++ b/tox.ini @@ -20,6 +20,12 @@ deps = url-matcher==0.5.0 w3lib==1.22.0 +[testenv:rules] +basepython = python3.8 +deps = + {[pinned]deps} + duplicate-url-discarder-rules>=0.1.0 + [testenv:pre-commit] deps = pre-commit From b933eacb4f78a3aa9aeec70e6835f1f19d5da1ef Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 5 Jul 2024 20:36:12 +1000 Subject: [PATCH 04/10] fix pre-commit and mypy --- duplicate_url_discarder/_fingerprinter.py | 4 ++-- tests/test_fingerprinter.py | 1 - tox.ini | 9 +++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/duplicate_url_discarder/_fingerprinter.py b/duplicate_url_discarder/_fingerprinter.py index c641dee..254416e 100644 --- a/duplicate_url_discarder/_fingerprinter.py +++ b/duplicate_url_discarder/_fingerprinter.py @@ -2,7 +2,7 @@ import logging import os -from typing import TYPE_CHECKING, List, Union +from typing import TYPE_CHECKING, Sequence, Union from scrapy import Request from scrapy.crawler import Crawler 
@@ -31,7 +31,7 @@ class Fingerprinter: def __init__(self, crawler: Crawler): self.crawler: Crawler = crawler - rule_paths: List[Union[str, os.PathLike]] = self.crawler.settings.getlist( + rule_paths: Sequence[Union[str, os.PathLike]] = self.crawler.settings.getlist( "DUD_LOAD_RULE_PATHS" ) if not rule_paths: diff --git a/tests/test_fingerprinter.py b/tests/test_fingerprinter.py index 025b019..be515fe 100644 --- a/tests/test_fingerprinter.py +++ b/tests/test_fingerprinter.py @@ -2,7 +2,6 @@ from pathlib import Path from typing import Any, Dict -import pytest from scrapy import Request, Spider from scrapy.dupefilters import BaseDupeFilter, RFPDupeFilter from scrapy.utils.test import get_crawler diff --git a/tox.ini b/tox.ini index 9716195..1dc6d6b 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py,pre-commit,mypy,docs,twinecheck +envlist = py,pre-commit,mypy,docs,twinecheck,rules [testenv] deps = @@ -22,9 +22,9 @@ deps = [testenv:rules] basepython = python3.8 -deps = - {[pinned]deps} - duplicate-url-discarder-rules>=0.1.0 +deps = + {[testenv:pinned]deps} + duplicate-url-discarder-rules [testenv:pre-commit] deps = @@ -33,6 +33,7 @@ commands = pre-commit run --all-files --show-diff-on-failure [testenv:mypy] deps = + duplicate-url-discarder-rules mypy==1.9.0 pytest commands = mypy {posargs:duplicate_url_discarder tests} From 5dc55ebd64b93a606a2799f1fb6753aec59743d3 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 5 Jul 2024 20:48:07 +1000 Subject: [PATCH 05/10] fix tox deps --- tox.ini | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tox.ini b/tox.ini index 1dc6d6b..d7cda71 100644 --- a/tox.ini +++ b/tox.ini @@ -16,9 +16,9 @@ commands = basepython = python3.8 deps = {[testenv]deps} - Scrapy==2.7.0 - url-matcher==0.5.0 - w3lib==1.22.0 + Scrapy>=2.7.0 + url-matcher>=0.5.0 + w3lib>=1.22.0 [testenv:rules] basepython = python3.8 From 3035fb2c11de9f4f9c974b8d8d0fd4657bb35db5 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd 
Bernal Date: Fri, 5 Jul 2024 20:51:16 +1000 Subject: [PATCH 06/10] bump deps to min latest version --- pyproject.toml | 4 ++-- tox.ini | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5ca56f8..af86b35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,9 +24,9 @@ classifiers = [ ] requires-python = ">=3.8" dependencies = [ - "Scrapy >= 2.7.0", + "Scrapy >= 2.11.0", "url-matcher >= 0.5.0", - "w3lib >= 1.22.0", + "w3lib >= 2.2.0", ] dynamic = ["version"] diff --git a/tox.ini b/tox.ini index d7cda71..77054ef 100644 --- a/tox.ini +++ b/tox.ini @@ -16,9 +16,9 @@ commands = basepython = python3.8 deps = {[testenv]deps} - Scrapy>=2.7.0 + Scrapy>=2.11.0 url-matcher>=0.5.0 - w3lib>=1.22.0 + w3lib>=2.2.0 [testenv:rules] basepython = python3.8 From ec98ac60c3ce3958ff64482b4a0daa5eb835d5ae Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 5 Jul 2024 20:51:35 +1000 Subject: [PATCH 07/10] add CHANGES.rst --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 4129e74..8357f95 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,7 @@ Changes ======= -0.1.0 (YYYY-MM-DD) +0.1.0 (2024-07-05) ------------------ * Initial version. 
From b48b089fc2cb73d388be123460a4211419f19c3f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 5 Jul 2024 21:40:13 +1000 Subject: [PATCH 08/10] update min dep versions --- pyproject.toml | 2 +- tox.ini | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index af86b35..f70407c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ requires-python = ">=3.8" dependencies = [ "Scrapy >= 2.11.0", "url-matcher >= 0.5.0", - "w3lib >= 2.2.0", + "w3lib >= 2.0.1", ] dynamic = ["version"] diff --git a/tox.ini b/tox.ini index 77054ef..1e4f420 100644 --- a/tox.ini +++ b/tox.ini @@ -16,9 +16,9 @@ commands = basepython = python3.8 deps = {[testenv]deps} - Scrapy>=2.11.0 - url-matcher>=0.5.0 - w3lib>=2.2.0 + Scrapy==2.11.0 + url-matcher==0.5.0 + w3lib==2.0.1 [testenv:rules] basepython = python3.8 From 9f6eb73f19646609d15feb0f64e8c2a96002affb Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 8 Jul 2024 18:23:24 +1000 Subject: [PATCH 09/10] update final release date for 0.1.0 --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 8357f95..1761d7a 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,7 @@ Changes ======= -0.1.0 (2024-07-05) +0.1.0 (2024-07-08) ------------------ * Initial version. 
From 9b895a9825e65ffc576e54de2f8832c894abaff3 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 8 Jul 2024 18:29:45 +1000 Subject: [PATCH 10/10] add test for overriding DUD_LOAD_RULE_PATHS --- tests/test_fingerprinter.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/test_fingerprinter.py b/tests/test_fingerprinter.py index be515fe..ad5ce59 100644 --- a/tests/test_fingerprinter.py +++ b/tests/test_fingerprinter.py @@ -103,9 +103,28 @@ def get_stat(stat: str) -> Any: default_rule_paths = None -def test_default_rules(): +def test_default_rules(tmp_path): fingerprinter = get_fingerprinter({}) if default_rule_paths: assert len(fingerprinter.url_canonicalizer.processors) > 0 else: assert len(fingerprinter.url_canonicalizer.processors) == 0 + + # Regardless of the presence of the ``duplicate_url_discarder_rules`` package, + # as long as the ``DUD_LOAD_RULE_PATHS`` setting is set, the rules from that setting will be used. + + rules_path = Path(tmp_path) / "single_rule.json" + rules_path.write_text( + json.dumps( + [ + { + "args": ["PHPSESSIONID"], + "order": 1, + "processor": "queryRemoval", + "urlPattern": {"include": []}, + }, + ] + ) + ) + fingerprinter = get_fingerprinter({"DUD_LOAD_RULE_PATHS": [str(rules_path)]}) + assert len(fingerprinter.url_canonicalizer.processors) == 1