diff --git a/site/setup.cfg b/site/setup.cfg index 58bfa192..7e30e2a2 100644 --- a/site/setup.cfg +++ b/site/setup.cfg @@ -48,12 +48,14 @@ invenio_base.apps = zenodo_rdm_moderation = zenodo_rdm.moderation.ext:ZenodoModeration invenio_openaire = zenodo_rdm.openaire.ext:OpenAIRE zenodo_rdm_stats = zenodo_rdm.stats.ext:ZenodoStats + zenodo_rdm_curation = zenodo_rdm.curation.ext:ZenodoCuration invenio_base.api_apps = zenodo_rdm_legacy = zenodo_rdm.legacy.ext:ZenodoLegacy profiler = zenodo_rdm.profiler:Profiler zenodo_rdm_metrics = zenodo_rdm.metrics.ext:ZenodoMetrics zenodo_rdm_moderation = zenodo_rdm.moderation.ext:ZenodoModeration invenio_openaire = zenodo_rdm.openaire.ext:OpenAIRE + zenodo_rdm_curation = zenodo_rdm.curation.ext:ZenodoCuration invenio_base.api_blueprints = zenodo_rdm_legacy = zenodo_rdm.legacy.views:blueprint zenodo_rdm_legacy_records = zenodo_rdm.legacy.views:create_legacy_records_bp @@ -69,6 +71,7 @@ invenio_celery.tasks = zenodo_rdm_openaire = zenodo_rdm.openaire.tasks zenodo_rdm_moderation = zenodo_rdm.moderation.tasks zenodo_stats = zenodo_rdm.stats.tasks + zenodo_rdm_curations = zenodo_rdm.curation.tasks invenio_oauth2server.scopes = deposit_write_scope = zenodo_rdm.legacy.scopes:deposit_write_scope deposit_actions_scope = zenodo_rdm.legacy.scopes:deposit_actions_scope diff --git a/site/zenodo_rdm/curation/__init__.py b/site/zenodo_rdm/curation/__init__.py new file mode 100644 index 00000000..5a0c501f --- /dev/null +++ b/site/zenodo_rdm/curation/__init__.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2023 CERN. +# +# Zenodo-RDM is free software; you can redistribute it and/or modify +# it under the terms of the MIT License; see LICENSE file for more details. 
+"""Curation module."""
diff --git a/site/zenodo_rdm/curation/config.py b/site/zenodo_rdm/curation/config.py
new file mode 100644
index 00000000..dfe56840
--- /dev/null
+++ b/site/zenodo_rdm/curation/config.py
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# ZenodoRDM is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Curation config."""
+
+from .rules import (
+    award_acronym_in_description,
+    award_acronym_in_title,
+    test_phrases_in_record,
+)
+
+CURATION_EU_RULES = {
+    "award_acronym_in_title": award_acronym_in_title,
+    "award_acronym_in_description": award_acronym_in_description,
+    "test_phrases_in_record": test_phrases_in_record,
+}
+"""Rules to run for EU Curation, keyed by rule name."""
+
+CURATION_EU_CURATION_THRESHOLD = None
+"""Score a record must reach for EU Curation to approve it.
+
+``None`` means no threshold is configured and no record is approved.
+"""
+
+CURATION_TEST_PHRASES = []
+"""Phrases looked for by the ``test_phrases_in_record`` rule."""
+
+CURATION_ENABLE_EU_CURATOR = False
+"""Whether EU Curation applies its result; when ``False`` it only dry runs."""
diff --git a/site/zenodo_rdm/curation/curators.py b/site/zenodo_rdm/curation/curators.py
new file mode 100644
index 00000000..2a7814a4
--- /dev/null
+++ b/site/zenodo_rdm/curation/curators.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# ZenodoRDM is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Curators for ZenodoRDM Curation."""
+
+
+from flask import current_app
+from invenio_access.permissions import system_identity
+from invenio_rdm_records.proxies import current_record_communities_service
+from invenio_records_resources.services.uow import UnitOfWork
+
+
+class BaseCurator:
+    """Base Curator class.
+
+    Runs a set of named rules on a record and evaluates their results.
+    """
+
+    def __init__(self, dry=False, raise_exc=False) -> None:
+        """Constructor.
+
+        :param dry: Dry-run flag, stored for subclasses to honour in ``_post_run``.
+        :param raise_exc: If true, ``run`` re-raises exceptions raised by rules.
+        """
+        self.dry = dry
+        self.raise_exc = raise_exc
+
+    def _evaluator(self, results):
+        """Evaluate the final result based on the rule results dict."""
+        raise NotImplementedError()
+
+    @property
+    def rules(self):
+        """Get rules to run, as a mapping of rule name to callable."""
+        raise NotImplementedError()
+
+    def run(self, record):
+        """Run rules for the curator and evaluate result.
+
+        :param record: Record passed to every rule and to ``_post_run``.
+        :returns: Dict with the ``evaluation`` and the per-rule ``rules`` results.
+        """
+        rule_results = {}
+        for name, rule in self.rules.items():
+            try:
+                rule_results[name] = rule(record)
+            except Exception:
+                if self.raise_exc:
+                    # Bare ``raise`` re-raises with the rule's own traceback.
+                    raise
+                # A rule that failed is recorded as ``None`` (not evaluated).
+                rule_results[name] = None
+
+        evaluation = self._evaluator(rule_results)
+        result = {"evaluation": evaluation, "rules": rule_results}
+        self._post_run(record, result)
+        return result
+
+    def _post_run(self, record, result):
+        """Actions to take after calculating rules."""
+        pass
+
+
+class EURecordCurator(BaseCurator):
+    """Curator to check records for EC community."""
+
+    # TODO put in config?
+    # Points per matching rule. The names must be keys of ``CURATION_EU_RULES``,
+    # because ``run`` stores each result under the key of its rule.
+    RULE_SCORES = {
+        "award_acronym_in_title": 5,
+        "award_acronym_in_description": 10,
+    }
+    # Rules that reject the record outright when they match.
+    VETO_RULES = ("test_phrases_in_record",)
+
+    def _evaluator(self, results):
+        """Evaluate result for EC curation.
+
+        :param results: Dict of rule name to rule result, as built by ``run``.
+        :returns: ``True`` if no veto rule matched and the summed score reaches
+            ``CURATION_EU_CURATION_THRESHOLD``, ``False`` otherwise.
+        """
+        if any(results.get(rule) for rule in self.VETO_RULES):
+            return False
+        threshold = current_app.config.get("CURATION_EU_CURATION_THRESHOLD")
+        if threshold is None:
+            # Without a threshold nothing is approved (and ``int >= None`` raises).
+            return False
+        score = sum(
+            points for rule, points in self.RULE_SCORES.items() if results.get(rule)
+        )
+        return score >= threshold
+
+    @property
+    def rules(self):
+        """Get rules to run from config."""
+        return current_app.config.get("CURATION_EU_RULES", {})
+
+    def _post_run(self, record, result):
+        """Actions to take after run.
+
+        Dry runs only log the result; otherwise a positively evaluated record is
+        added to the community configured as ``EU_COMMUNITY_ID``.
+        """
+        if self.dry:
+            # ``result`` needs its own placeholder: passed as a lone dict argument
+            # to a message without one, it never showed up in the log line.
+            current_app.logger.info(
+                "Processed record ID: %s, result: %s", record.pid.pid_value, result
+            )  # TODO use error? Or should we log from the task
+            return
+        if result["evaluation"]:
+            with UnitOfWork() as uow:
+                current_record_communities_service.bulk_add(
+                    system_identity,
+                    current_app.config.get("EU_COMMUNITY_ID"),
+                    [record.pid.pid_value],
+                    uow=uow,
+                )
+                uow.commit()
diff --git a/site/zenodo_rdm/curation/ext.py b/site/zenodo_rdm/curation/ext.py
new file mode 100644
index 00000000..460b9e5c
--- /dev/null
+++ b/site/zenodo_rdm/curation/ext.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# ZenodoRDM is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""ZenodoRDM Curation module."""
+
+from types import SimpleNamespace
+
+from flask import current_app
+from werkzeug.utils import cached_property
+
+from . 
import config
+
+
+class ZenodoCuration:
+    """Zenodo content curation extension."""
+
+    def __init__(self, app=None):
+        """Extension initialization."""
+        if app:
+            self.init_app(app)
+
+    @staticmethod
+    def init_config(app):
+        """Initialize configuration."""
+        for k in dir(config):
+            if k.startswith("CURATION_"):
+                app.config.setdefault(k, getattr(config, k))
+
+    def init_app(self, app):
+        """Flask application initialization."""
+        self.init_config(app)
+        app.extensions["zenodo-curation"] = self
diff --git a/site/zenodo_rdm/curation/rules.py b/site/zenodo_rdm/curation/rules.py
new file mode 100644
index 00000000..6af0580c
--- /dev/null
+++ b/site/zenodo_rdm/curation/rules.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# ZenodoRDM is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Rules for curation."""
+
+from flask import current_app
+from invenio_records_resources.proxies import current_service_registry
+
+EU_FUNDER_ID = "00k4n6c32"
+"""Funder ID that marks a funding entry as EU funding for the award rules."""
+
+
+def _award_acronym_in_field(record, field):
+    """Check if the acronym of an EU award of the record is in a metadata field.
+
+    :param record: Record whose ``metadata`` is checked.
+    :param field: Name of the metadata text field to search, e.g. ``"title"``.
+    :returns: ``True`` on a case-insensitive match, ``False`` otherwise.
+    """
+    text = (record.metadata.get(field) or "").lower()
+    if not text:
+        return False
+
+    award_service = current_service_registry.get("awards")
+    for f in record.metadata.get("funding") or []:
+        if f.get("funder", {}).get("id") != EU_FUNDER_ID:
+            continue
+        # An award without an ID cannot be resolved. Skip it rather than fail the
+        # rule, so that the remaining funding entries are still checked.
+        award_id = f.get("award", {}).get("id")
+        if not award_id:
+            continue
+        award = award_service.record_cls.pid.resolve(award_id)
+        # An empty acronym is a substring of any text, so it must not count.
+        acronym = award.get("acronym")
+        if acronym and acronym.lower() in text:
+            return True
+    return False
+
+
+def award_acronym_in_description(record):
+    """Check if EU award name in record description."""
+    return _award_acronym_in_field(record, "description")
+
+
+def award_acronym_in_title(record):
+    """Check if EU award name in record title."""
+    return _award_acronym_in_field(record, "title")
+
+
+def test_phrases_in_record(record):
+    """Check if test words in record.
+
+    Looks for the phrases configured in ``CURATION_TEST_PHRASES`` in the record's
+    title and description, ignoring case.
+    """
+    test_phrases = current_app.config.get("CURATION_TEST_PHRASES") or []
+    title = record.metadata.get("title") or ""
+    description = record.metadata.get("description") or ""
+    # Lowercase the searched text once instead of once per phrase.
+    record_data = f"{title} {description}".lower()
+    # Skip empty phrases: "" is a substring of any text.
+    return any(p and p.lower() in record_data for p in test_phrases)
diff --git a/site/zenodo_rdm/curation/tasks.py b/site/zenodo_rdm/curation/tasks.py
new file mode 100644
index 00000000..50b46512
--- /dev/null
+++ b/site/zenodo_rdm/curation/tasks.py
@@ -0,0 +1,76 @@
+"""Tasks for curation."""
+
+from datetime import datetime, timedelta
+
+from celery import shared_task
+from flask import current_app
+from invenio_access.permissions import system_identity
+from invenio_rdm_records.proxies import current_rdm_records_service as records_service
+from invenio_search.engine import dsl
+from zenodo_rdm.curation.curators import EURecordCurator
+
+
+@shared_task
+def run_eu_record_curation(since):
+    """Run EC Curator.
+
+    Curates non-deleted records with funder ``00k4n6c32`` that were created at
+    least 30 days ago, were updated at or after ``since`` and are not yet in the
+    ``EU_COMMUNITY_ID`` community.
+
+    :param since: ISO format datetime string; lower bound for the update time.
+    """
+    ctx = {"processed": 0, "approved": 0, "failed": 0, "since": since}
+    dry_run = not current_app.config.get("CURATION_ENABLE_EU_CURATOR")
+    curator = EURecordCurator(dry=dry_run)
+
+    query = dsl.Q(
+        "bool",
+        must=[
+            dsl.Q("term", **{"metadata.funding.funder.id": "00k4n6c32"}),
+            dsl.Q("term", **{"is_deleted": False}),
+            dsl.Q(
+                "range",
+                created={
+                    "lte": (datetime.now() - timedelta(days=30)).isoformat(),
+                },
+            ),
+            dsl.Q(
+                "range",
+                updated={
+                    "gte": datetime.fromisoformat(since).isoformat(),
+                },
+            ),
+        ],
+        must_not=[
+            dsl.Q(
+                "term",
+                **{"parent.communities.ids": current_app.config.get("EU_COMMUNITY_ID")},
+            )
+        ],
+    )
+    search = records_service.create_search(
+        system_identity,
+        records_service.record_cls,
+        records_service.config.search,
+        extra_filter=query,
+    )
+
+    for item in search.scan():
+        try:
+            record = records_service.record_cls.pid.resolve(item["id"])
+            result = curator.run(record=record)
+        except Exception:
+            # NOTE Since curator's raise_exc is by default false, rules would not fail.
+            # This catches failure due to other reasons
+            ctx["failed"] += 1
+            # Skip the counters below: no result exists for this record.
+            continue
+        ctx["processed"] += 1
+        if result["evaluation"]:
+            ctx["approved"] += 1
+
+    current_app.logger.error(
+        "EU curation processed",
+        extra=ctx,
+    )