From 193270b1ecef259645a82b9e9e797a5cb973f888 Mon Sep 17 00:00:00 2001
From: Alex Ioannidis
Date: Mon, 4 Nov 2024 18:48:26 +0100
Subject: [PATCH] moderation: move domains ban/safe-list to DB table

---
Review notes (ignored by `git am`, not part of the commit message):

* Line breaks and indentation were restored from a whitespace-collapsed copy
  of this patch. Blank context lines were inferred from the hunk line counts;
  verify them against the target tree before applying.
* rules.py: the safe branch compared the LinkDomain row itself to
  LinkDomainStatus.SAFE, so it never ran; it now compares `domain.status`.
* models.py: `domain.lstrip("www.")` strips any leading "w"/"." characters
  rather than the "www." prefix (e.g. "wiki.org" -> "iki.org"); replaced with
  an explicit prefix check. The dead `if not domain_parts` guard is replaced
  by an empty-host check. Regression cases were added to the tests.
* The `index` blob hashes of test_domains.py and models.py predate the edits
  above and no longer describe the new contents.
* NOTE(review): no DB migration for the new `link_domains` table is part of
  this patch - confirm it is handled elsewhere.

 site/setup.cfg                        |  3 +-
 site/tests/moderation/test_domains.py | 49 +++++++++++++++++
 site/zenodo_rdm/moderation/config.py  |  6 ---
 site/zenodo_rdm/moderation/domains.py | 55 --------------------
 site/zenodo_rdm/moderation/ext.py     | 13 -----
 site/zenodo_rdm/moderation/models.py  | 90 ++++++++++++++++++++++++++++++++
 site/zenodo_rdm/moderation/proxies.py |  1 -
 site/zenodo_rdm/moderation/rules.py   | 23 +++++---
 8 files changed, 156 insertions(+), 84 deletions(-)
 create mode 100644 site/tests/moderation/test_domains.py
 delete mode 100644 site/zenodo_rdm/moderation/domains.py
 create mode 100644 site/zenodo_rdm/moderation/models.py

diff --git a/site/setup.cfg b/site/setup.cfg
index c545553e..fafdbbb3 100644
--- a/site/setup.cfg
+++ b/site/setup.cfg
@@ -70,7 +70,8 @@ invenio_celery.tasks =
 invenio_oauth2server.scopes =
     deposit_write_scope = zenodo_rdm.legacy.scopes:deposit_write_scope
     deposit_actions_scope = zenodo_rdm.legacy.scopes:deposit_actions_scope
-
+invenio_db.models =
+    zenodo_rdm_moderation = zenodo_rdm.moderation.models
 invenio_assets.webpack =
     zenodo_rdm_theme = zenodo_rdm.webpack:theme
 invenio_config.module =
diff --git a/site/tests/moderation/test_domains.py b/site/tests/moderation/test_domains.py
new file mode 100644
index 00000000..d8c1b327
--- /dev/null
+++ b/site/tests/moderation/test_domains.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# ZenodoRDM is free software; you can redistribute it and/or modify it
+# under the terms of the MIT License; see LICENSE file for more details.
+
+"""Domain moderation tests."""
+
+import pytest
+
+from zenodo_rdm.moderation.models import LinkDomain, LinkDomainStatus
+
+
+@pytest.fixture
+def domains(db):
+    """Create test domains."""
+    domains = [
+        LinkDomain.create("blog.io", LinkDomainStatus.SAFE),
+        LinkDomain.create("spam.blog.io", LinkDomainStatus.BANNED),
+        LinkDomain.create("edu.ch", LinkDomainStatus.SAFE),
+        LinkDomain.create("cam", LinkDomainStatus.BANNED),
+        LinkDomain.create("wiki.org", LinkDomainStatus.SAFE),
+    ]
+    db.session.commit()
+    return domains
+
+
+@pytest.mark.parametrize(
+    "domain,expected_status",
+    [
+        ("http://example.com/content", None),
+        ("https://blog.io/article", LinkDomainStatus.SAFE),
+        ("https://spam.blog.io/article", LinkDomainStatus.BANNED),
+        ("http://other.blog.io/article", LinkDomainStatus.SAFE),
+        ("https://physics.edu.ch/article", LinkDomainStatus.SAFE),
+        ("https://math.edu.ch/article", LinkDomainStatus.SAFE),
+        ("http://spam.cam/content", LinkDomainStatus.BANNED),
+        ("http://sub.spam.cam/content", LinkDomainStatus.BANNED),
+        ("https://www.blog.io/article", LinkDomainStatus.SAFE),
+        ("https://wiki.org/page", LinkDomainStatus.SAFE),
+    ],
+)
+def test_lookup_domain(domains, domain, expected_status):
+    """Test domain lookup."""
+    if expected_status is None:
+        assert LinkDomain.lookup_domain(domain) is None
+    else:
+        assert LinkDomain.lookup_domain(domain).status == expected_status
diff --git a/site/zenodo_rdm/moderation/config.py b/site/zenodo_rdm/moderation/config.py
index 5889e3ee..3cf4c258 100644
--- a/site/zenodo_rdm/moderation/config.py
+++ b/site/zenodo_rdm/moderation/config.py
@@ -9,12 +9,6 @@
 
 from .rules import files_rule, links_rule, text_sanitization_rule, verified_user_rule
 
-MODERATION_BANNED_LINK_DOMAINS = []
-"""Banned domains for links."""
-
-MODERATION_SAFE_LINK_DOMAINS = []
-"""Safe domains for links."""
-
 MODERATION_SCORES = {
     "spam_link": 8,
     "ham_link": -3,
diff --git a/site/zenodo_rdm/moderation/domains.py b/site/zenodo_rdm/moderation/domains.py
deleted file mode 100644
index 27122c65..00000000
--- a/site/zenodo_rdm/moderation/domains.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2024 CERN.
-#
-# ZenodoRDM is free software; you can redistribute it and/or modify
-# it under the terms of the MIT License; see LICENSE file for more details.
-
-"""Rules for moderation."""
-
-import re
-
-
-class DomainTree:
-    """Domain tree structure to store and check status of domains."""
-
-    def __init__(self):
-        """Initialize an empty tree to hold domains and their statuses."""
-        self.tree = {}
-
-    def add_domain(self, domain, status):
-        """Add a domain to the tree with its status: 'banned' or 'safe'."""
-        parts = domain.strip(".").split(".")
-        current = self.tree
-        for part in parts:
-            current = current.setdefault(part, {})
-        current["status"] = status
-
-    def initialize_links(self, links, status):
-        """Helper method to add multiple links to the domain tree with a given status."""
-        for domain in links:
-            self.add_domain(domain, status)
-
-    @staticmethod
-    def extract_domain(url):
-        """Extract and reverse domain parts from a given URL."""
-        pattern = r"^(?:https?://)?(?:www\.)?([^/]+)"
-        match = re.search(pattern, url)
-        if match:
-            domain = match.group(1)
-            domain_parts = domain.split(".")
-            return domain_parts[::-1]
-        return None
-
-    def get_status(self, url):
-        """Retrieve the status of a URL's domain."""
-        domain_parts = self.extract_domain(url)
-        current = self.tree
-        for part in domain_parts:
-            if part in current:
-                current = current[part]
-                if "status" in current:
-                    return current["status"]
-            else:
-                break
-        return None
diff --git a/site/zenodo_rdm/moderation/ext.py b/site/zenodo_rdm/moderation/ext.py
index 907bbf99..8a221d63 100644
--- a/site/zenodo_rdm/moderation/ext.py
+++ b/site/zenodo_rdm/moderation/ext.py
@@ -13,7 +13,6 @@
 from werkzeug.utils import cached_property
 
 from . import config
-from .domains import DomainTree
 
 
 class ZenodoModeration:
@@ -36,18 +35,6 @@ def init_app(self, app):
         self.init_config(app)
         app.extensions["zenodo-moderation"] = self
 
-    @cached_property
-    def domain_tree(self):
-        """Initialize and return the DomainTree instance with config-based links."""
-        domain_tree = DomainTree()
-        domain_tree.initialize_links(
-            current_app.config.get("MODERATION_BANNED_LINK_DOMAINS", []), "banned"
-        )
-        domain_tree.initialize_links(
-            current_app.config.get("MODERATION_SAFE_LINK_DOMAINS", []), "safe"
-        )
-        return domain_tree
-
     @cached_property
     def scores(self):
         """Return moderation score values used in rules."""
diff --git a/site/zenodo_rdm/moderation/models.py b/site/zenodo_rdm/moderation/models.py
new file mode 100644
index 00000000..a2af0e68
--- /dev/null
+++ b/site/zenodo_rdm/moderation/models.py
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# ZenodoRDM is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Moderation models."""
+
+import enum
+from urllib.parse import urlparse
+
+from invenio_db import db
+from sqlalchemy_utils import ChoiceType, Timestamp
+
+
+class LinkDomainStatus(enum.Enum):
+    """Link domain status."""
+
+    SAFE = "S"
+    BANNED = "B"
+    MODERATED = "M"
+
+
+class LinkDomain(db.Model, Timestamp):
+    """Link domain model."""
+
+    __tablename__ = "link_domains"
+
+    id = db.Column(db.Integer, primary_key=True)
+
+    domain = db.Column(db.Text, nullable=False, unique=True)
+    status = db.Column(
+        ChoiceType(LinkDomainStatus, impl=db.CHAR(1)),
+        nullable=False,
+    )
+    score = db.Column(db.Integer, nullable=True)
+    reason = db.Column(db.Text, nullable=True)
+
+    @classmethod
+    def create(cls, domain, status, score=None, reason=None):
+        """Create a link domain.
+
+        The domain is stored lowercased with its labels reversed and a leading
+        dot (e.g. "blog.io" is stored as ".io.blog"), which lets
+        ``lookup_domain`` match subdomains by prefix. The new row is added to
+        the session but not committed.
+        """
+        parts = domain.strip(".").split(".")
+        domain = "." + ".".join(parts[::-1]).lower()
+        ld = cls(domain=domain, status=status, score=score, reason=reason)
+        db.session.add(ld)
+        return ld
+
+    @classmethod
+    def lookup_domain(cls, url):
+        """Lookup the status of a URL's domain.
+
+        Returns the most specific ``LinkDomain`` matching the URL's host or one
+        of its parent domains, or ``None`` when the URL has no host or nothing
+        matches.
+        """
+        try:
+            parsed = urlparse(url)
+        except ValueError:
+            return None
+
+        domain = (parsed.netloc or "").lower()
+        # Drop only a literal "www." prefix: str.lstrip("www.") strips any run
+        # of leading "w"/"." characters (e.g. "wiki.org" -> "iki.org").
+        if domain.startswith("www."):
+            domain = domain[len("www.") :]
+        if not domain:
+            # "".split(".") is [""], so test the string rather than the parts.
+            return None
+
+        domain_parts = domain.split(".")
+        reversed_domain = "." + ".".join(domain_parts[::-1])
+        return (
+            cls.query.filter(
+                # Exact match
+                (LinkDomain.domain == reversed_domain)
+                # Or subdomain match
+                | db.literal(reversed_domain).like(LinkDomain.domain + ".%")
+            )
+            # Order by length of domain to get the most specific match
+            .order_by(db.func.length(LinkDomain.domain).desc())
+            .limit(1)
+            .scalar()
+        )
diff --git a/site/zenodo_rdm/moderation/proxies.py b/site/zenodo_rdm/moderation/proxies.py
index 813e42f4..caef09e6 100644
--- a/site/zenodo_rdm/moderation/proxies.py
+++ b/site/zenodo_rdm/moderation/proxies.py
@@ -11,5 +11,4 @@
 from werkzeug.local import LocalProxy
 
 current_moderation = LocalProxy(lambda: current_app.extensions["zenodo-moderation"])
-current_domain_tree = LocalProxy(lambda: current_moderation.domain_tree)
 current_scores = LocalProxy(lambda: current_moderation.scores)
diff --git a/site/zenodo_rdm/moderation/rules.py b/site/zenodo_rdm/moderation/rules.py
index a0091e2c..18f59ad0 100644
--- a/site/zenodo_rdm/moderation/rules.py
+++ b/site/zenodo_rdm/moderation/rules.py
@@ -11,9 +11,8 @@
 
 from flask import current_app
 
-from zenodo_rdm.moderation.proxies import current_domain_tree
-
-from .proxies import current_domain_tree, current_scores
+from .models import LinkDomain, LinkDomainStatus
+from .proxies import current_scores
 
 #
 # Utilities
@@ -65,11 +64,19 @@ def links_rule(identity, draft=None, record=None):
     extracted_links = extract_links(str(record.metadata))
 
     for link in extracted_links:
-        status = current_domain_tree.get_status(link)
-        if status == "banned":
-            score += current_scores.spam_link
-        elif status == "safe":
-            score += current_scores.ham_link
+        domain = LinkDomain.lookup_domain(link)
+        if domain is None:
+            continue
+        if domain.status == LinkDomainStatus.BANNED:
+            if domain.score is not None:
+                score += domain.score
+            else:
+                score += current_scores.spam_link
+        elif domain.status == LinkDomainStatus.SAFE:
+            if domain.score is not None:
+                score += domain.score
+            else:
+                score += current_scores.ham_link
 
     return score
 