Skip to content

Commit

Permalink
Merge pull request #13 from zytedata/query-removal-except
Browse files Browse the repository at this point in the history
Add QueryRemovalExceptProcessor.
  • Loading branch information
wRAR authored May 21, 2024
2 parents b63909c + 75d5307 commit e25c27f
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 1 deletion.
3 changes: 3 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ The following URL processors are currently available:
the keys are specified in the arguments. If a given key appears multiple times
with different values in the URL, all of them are removed.

* ``queryRemovalExcept``: like ``queryRemoval``, but the keys specified in the
arguments are kept while all others are removed.

URL Rules
=========

Expand Down
2 changes: 2 additions & 0 deletions duplicate_url_discarder/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
from ..rule import UrlRule
from .base import UrlProcessorBase
from .query_removal import QueryRemovalProcessor
from .query_removal_except import QueryRemovalExceptProcessor

# Registry mapping each processor name (the string form listed in the README,
# e.g. "queryRemoval") to the UrlProcessorBase subclass that implements it.
# NOTE(review): presumably consulted by get_processor() to resolve a rule's
# processor name — confirm against the rest of this module.
_PROCESSOR_CLASSES: Dict[str, Type[UrlProcessorBase]] = {
"queryRemoval": QueryRemovalProcessor,
"queryRemovalExcept": QueryRemovalExceptProcessor,
}


Expand Down
17 changes: 17 additions & 0 deletions duplicate_url_discarder/processors/query_removal_except.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from w3lib.url import url_query_cleaner

from .base import UrlProcessorBase


class QueryRemovalExceptProcessor(UrlProcessorBase):
    """URL processor that keeps only the query parameters named in ``args``.

    Every query parameter whose key is not listed in ``self.args`` is dropped.
    Repeated occurrences of a kept key are all preserved, and the URL fragment
    is left untouched.
    """

    def validate_args(self) -> None:
        """Check that every configured key is a ``str``.

        Raises:
            TypeError: on the first argument that is not a string.
        """
        for arg in self.args:
            if isinstance(arg, str):
                continue
            raise TypeError(
                f"queryRemovalExcept args must be strings, not {type(arg)}: {arg}"
            )

    def process(self, input_url: str) -> str:
        """Return ``input_url`` with all query parameters except the kept keys removed."""
        kept_keys = self.args
        # unique=False keeps duplicate occurrences of a kept key;
        # keep_fragments=True preserves the "#..." part of the URL.
        cleaned_url = url_query_cleaner(
            input_url,
            kept_keys,
            unique=False,
            keep_fragments=True,
        )
        return cleaned_url
45 changes: 44 additions & 1 deletion tests/test_processors.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import pytest
from url_matcher import Patterns

from duplicate_url_discarder.processors import QueryRemovalProcessor, get_processor
from duplicate_url_discarder.processors import (
QueryRemovalExceptProcessor,
QueryRemovalProcessor,
get_processor,
)
from duplicate_url_discarder.rule import UrlRule


Expand Down Expand Up @@ -52,3 +56,42 @@ def test_query_removal_validate_args():
QueryRemovalProcessor(("a", None, ""))
QueryRemovalProcessor(("",))
QueryRemovalProcessor(())


@pytest.mark.parametrize(
    ["args", "url", "expected"],
    [
        # No kept keys, or a kept key that is absent: the whole query goes away.
        ((), "http://foo.example?foo=1&bar", "http://foo.example"),
        (("a",), "http://foo.example?foo=1&bar", "http://foo.example"),
        # A single kept key, with and without a value.
        (("foo",), "http://foo.example?foo=1&bar", "http://foo.example?foo=1"),
        (("bar",), "http://foo.example?foo=1&bar", "http://foo.example?bar"),
        # Every occurrence of a kept key survives.
        (
            ("bar",),
            "http://foo.example?foo=1&foo=2&bar&bar=1",
            "http://foo.example?bar&bar=1",
        ),
        # The fragment is preserved even when it looks like a query parameter.
        (
            ("bar",),
            "http://foo.example?foo=1&bar#foo=frag",
            "http://foo.example?bar#foo=frag",
        ),
        # Several kept keys, some of which may be missing from the URL.
        (("foo", "baz"), "http://foo.example?foo=1&bar", "http://foo.example?foo=1"),
        (
            ("foo", "bar"),
            "http://foo.example?foo=1&bar",
            "http://foo.example?foo=1&bar",
        ),
    ],
)
def test_query_removal_except(args, url, expected):
    """Processing ``url`` with kept keys ``args`` must yield ``expected``."""
    actual = QueryRemovalExceptProcessor(args).process(url)
    assert actual == expected


def test_query_removal_except_validate_args():
    """Non-string args are rejected with TypeError; string or empty args are accepted."""
    rejected = [
        ((b"",), "strings, not <class 'bytes'>: b''"),
        (("a", None, ""), "strings, not <class 'NoneType'>: None"),
    ]
    for bad_args, message in rejected:
        with pytest.raises(TypeError, match=message):
            QueryRemovalExceptProcessor(bad_args)

    # An empty-string key and an empty argument tuple are both valid.
    for good_args in (("",), ()):
        QueryRemovalExceptProcessor(good_args)

0 comments on commit e25c27f

Please sign in to comment.