diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 6c12df8..b2e1cfe 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -5,6 +5,7 @@ name: tox
 
 on:
   pull_request:
+    branches: [ main ]
   push:
     branches: [ main ]
 
@@ -17,6 +18,7 @@ jobs:
       matrix:
         include:
           - python-version: "3.8"
+            toxenv: rules
           - python-version: "3.9"
          - python-version: "3.10"
           - python-version: "3.11"
diff --git a/CHANGES.rst b/CHANGES.rst
index 4129e74..1761d7a 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -1,7 +1,7 @@
 Changes
 =======
 
-0.1.0 (YYYY-MM-DD)
+0.1.0 (2024-07-08)
 ------------------
 
 * Initial version.
diff --git a/README.rst b/README.rst
index c75f0d2..b4ab8eb 100644
--- a/README.rst
+++ b/README.rst
@@ -35,6 +35,16 @@ Installation
 
     pip install duplicate-url-discarder
 
+Alternatively, you can install the package together with the predefined rules
+from `duplicate-url-discarder-rules`_ via:
+
+.. code-block::
+
+    pip install duplicate-url-discarder[rules]
+
+If these rules are installed, they are automatically used when the
+``DUD_LOAD_RULE_PATHS`` setting is left empty (see `configuration`_).
+
 Requires **Python 3.8+**.
 
 Using
@@ -131,6 +141,8 @@ All non-universal rules (ones that have non-empty include pattern) that match a
 request URL are applied according to their order field. If there are no
 non-universal rules that match the URL, the universal ones are applied.
 
+.. _configuration:
+
 Configuration
 =============
 
@@ -145,6 +157,9 @@ Configuration
         "/home/user/project/custom_rules1.json",
     ]
 
-The default value of this setting is empty.
+The default value of this setting is empty. However, if the
+`duplicate-url-discarder-rules`_ package is installed and ``DUD_LOAD_RULE_PATHS``
+is left empty, the rules in that package are used automatically.
 
 .. _scrapy-zyte-api: https://github.com/scrapy-plugins/scrapy-zyte-api
+.. _duplicate-url-discarder-rules: https://github.com/zytedata/duplicate-url-discarder-rules
diff --git a/duplicate_url_discarder/_fingerprinter.py b/duplicate_url_discarder/_fingerprinter.py
index ae02ac5..254416e 100644
--- a/duplicate_url_discarder/_fingerprinter.py
+++ b/duplicate_url_discarder/_fingerprinter.py
@@ -2,7 +2,7 @@
 import logging
 import os
-from typing import TYPE_CHECKING, List, Union
+from typing import TYPE_CHECKING, Sequence, Union
 
 from scrapy import Request
 from scrapy.crawler import Crawler
 
@@ -20,15 +20,28 @@
 
 logger = logging.getLogger(__name__)
 
+try:
+    from importlib.metadata import version
+
+    from duplicate_url_discarder_rules import RULE_PATHS as default_rule_paths
+except ImportError:
+    default_rule_paths = None
+
 
 class Fingerprinter:
     def __init__(self, crawler: Crawler):
         self.crawler: Crawler = crawler
-        rule_paths: List[Union[str, os.PathLike]] = self.crawler.settings.getlist(
+        rule_paths: Sequence[Union[str, os.PathLike]] = self.crawler.settings.getlist(
             "DUD_LOAD_RULE_PATHS"
         )
         if not rule_paths:
-            logger.warning("DUD_LOAD_RULE_PATHS is not set or is empty.")
+            msg = "DUD_LOAD_RULE_PATHS is not set or is empty."
+            if default_rule_paths:
+                rule_paths = default_rule_paths
+                v = version("duplicate-url-discarder-rules")
+                msg += f" Using RULE_PATHS from duplicate-url-discarder-rules=={v} instead."
+            logger.warning(msg)
+
         self._fallback_request_fingerprinter: RequestFingerprinterProtocol = (
             create_instance(
                 load_object(
diff --git a/pyproject.toml b/pyproject.toml
index a717b95..f70407c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,15 +24,18 @@ classifiers = [
 ]
 requires-python = ">=3.8"
 dependencies = [
-    "Scrapy >= 2.7.0",
+    "Scrapy >= 2.11.0",
     "url-matcher >= 0.5.0",
-    "w3lib >= 1.22.0",
+    "w3lib >= 2.0.1",
 ]
 dynamic = ["version"]
 
 [project.urls]
 Source = "https://github.com/zytedata/duplicate-url-discarder"
 
+[project.optional-dependencies]
+rules = ["duplicate-url-discarder-rules"]
+
 [tool.setuptools.dynamic]
 version = {attr = "duplicate_url_discarder.__version__"}
 
diff --git a/tests/test_fingerprinter.py b/tests/test_fingerprinter.py
index c8dab98..ad5ce59 100644
--- a/tests/test_fingerprinter.py
+++ b/tests/test_fingerprinter.py
@@ -95,3 +95,36 @@ def get_stat(stat: str) -> Any:
         )
     )
     assert get_stat("url_modified") == 3
+
+
+try:
+    from duplicate_url_discarder_rules import RULE_PATHS as default_rule_paths
+except ImportError:
+    default_rule_paths = None
+
+
+def test_default_rules(tmp_path):
+    fingerprinter = get_fingerprinter({})
+    if default_rule_paths:
+        assert len(fingerprinter.url_canonicalizer.processors) > 0
+    else:
+        assert len(fingerprinter.url_canonicalizer.processors) == 0
+
+    # Regardless of the presence of the ``duplicate_url_discarder_rules`` package,
+    # as long as the ``DUD_LOAD_RULE_PATHS`` setting is set, the rules from those
+    # paths are used.
+    rules_path = Path(tmp_path) / "single_rule.json"
+    rules_path.write_text(
+        json.dumps(
+            [
+                {
+                    "args": ["PHPSESSIONID"],
+                    "order": 1,
+                    "processor": "queryRemoval",
+                    "urlPattern": {"include": []},
+                },
+            ]
+        )
+    )
+    fingerprinter = get_fingerprinter({"DUD_LOAD_RULE_PATHS": [str(rules_path)]})
+    assert len(fingerprinter.url_canonicalizer.processors) == 1
diff --git a/tox.ini b/tox.ini
index 920352e..1e4f420 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py,pre-commit,mypy,docs,twinecheck
+envlist = py,pre-commit,mypy,docs,twinecheck,rules
 
 [testenv]
 deps =
@@ -16,9 +16,15 @@
 basepython = python3.8
 deps =
     {[testenv]deps}
-    Scrapy==2.7.0
+    Scrapy==2.11.0
     url-matcher==0.5.0
-    w3lib==1.22.0
+    w3lib==2.0.1
+
+[testenv:rules]
+basepython = python3.8
+deps =
+    {[testenv:pinned]deps}
+    duplicate-url-discarder-rules
 
 [testenv:pre-commit]
 deps =
@@ -27,6 +33,7 @@ commands = pre-commit run --all-files --show-diff-on-failure
 
 [testenv:mypy]
 deps =
+    duplicate-url-discarder-rules
     mypy==1.9.0
     pytest
 commands = mypy {posargs:duplicate_url_discarder tests}
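
Review note (not part of the patch): the pieces above combine so that an explicit setting always wins and the packaged rules are only a fallback. Below is a minimal settings sketch of the two configurations that ``test_default_rules`` exercises. The ``DUD_LOAD_RULE_PATHS`` name and the example path come from this diff; the ``REQUEST_FINGERPRINTER_CLASS`` line is an assumption about how the fingerprinter is enabled, following the project README.

.. code-block:: python

    # settings.py -- a sketch, not part of this diff.
    # Assumed: the fingerprinter is enabled through Scrapy's standard
    # REQUEST_FINGERPRINTER_CLASS setting.
    REQUEST_FINGERPRINTER_CLASS = "duplicate_url_discarder.Fingerprinter"

    # Option 1: leave DUD_LOAD_RULE_PATHS unset or empty. With this change,
    # if duplicate-url-discarder-rules is installed (e.g. via the new
    # "rules" extra), its RULE_PATHS are picked up automatically and the
    # logged warning notes the package version; otherwise no rules are loaded.

    # Option 2: set DUD_LOAD_RULE_PATHS explicitly. These paths are used
    # whether or not the rules package is installed:
    DUD_LOAD_RULE_PATHS = [
        "/home/user/project/custom_rules1.json",
    ]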