-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
curation: init module; add EURecordCurator
- Loading branch information
Showing
7 changed files
with
281 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2023 CERN. | ||
# | ||
# Zenodo-RDM is free software; you can redistribute it and/or modify | ||
# it under the terms of the MIT License; see LICENSE file for more details. | ||
"""Curation module.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2024 CERN. | ||
# | ||
# ZenodoRDM is free software; you can redistribute it and/or modify | ||
# it under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""Moderation config.""" | ||
|
||
from .rules import ( | ||
award_acronym_in_description, | ||
award_acronym_in_title, | ||
test_phrases_in_record, | ||
) | ||
|
||
# Mapping of rule name -> rule callable. Each callable is invoked with the
# record being curated, and its return value is stored under the rule name in
# the curator's results dict.
CURATION_EU_RULES = {
    "award_acronym_in_title": award_acronym_in_title,
    "award_acronym_in_description": award_acronym_in_description,
    "test_phrases_in_record": test_phrases_in_record,
}
"""Rules to run for EU Curation."""

# The curation task computes ``dry_run = not CURATION_ENABLE_EU_CURATOR``, so
# the default of False means the curator only runs in dry-run mode.
# NOTE(review): ``CURATION_EU_CURATION_THRESHOLD`` and ``CURATION_TEST_PHRASES``
# are read from the app config by the curator/rules but get no default in this
# module -- confirm they are provided by the instance configuration.
CURATION_ENABLE_EU_CURATOR = False
"""Controls whether to dry run EU Curation."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2024 CERN. | ||
# | ||
# ZenodoRDM is free software; you can redistribute it and/or modify | ||
# it under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""Curators for ZenodoRDM Curation.""" | ||
|
||
|
||
from flask import current_app | ||
from invenio_access.permissions import system_identity | ||
from invenio_rdm_records.proxies import current_record_communities_service | ||
from invenio_records_resources.services.uow import UnitOfWork | ||
|
||
|
||
class BaseCurator:
    """Shared curator workflow: run every rule, evaluate, then post-process.

    Subclasses must supply ``rules`` and ``_evaluator``; ``_post_run`` is an
    optional hook.
    """

    def __init__(self, dry=False, raise_exc=False) -> None:
        """Store the run options.

        :param dry: flag kept on the instance for subclasses to consult.
        :param raise_exc: when true, an exception raised by a rule propagates
            out of ``run`` instead of being recorded as ``None``.
        """
        self.raise_exc = raise_exc
        self.dry = dry

    def _evaluator(self, results):
        """Compute the final evaluation from the per-rule results dict."""
        raise NotImplementedError()

    @property
    def rules(self):
        """Mapping of rule name to rule callable (provided by subclasses)."""
        raise NotImplementedError()

    def run(self, record):
        """Apply every rule to ``record`` and return the evaluated outcome.

        :param record: object handed unchanged to each rule callable.
        :returns: dict with ``"evaluation"`` (the ``_evaluator`` output) and
            ``"rules"`` (rule name -> rule result, ``None`` for a failed rule).
        :raises Exception: whatever a rule raised, only if ``raise_exc`` is set.
        """
        outcomes = {}
        for rule_name, rule_fn in self.rules.items():
            try:
                outcome = rule_fn(record)
            except Exception as exc:
                if self.raise_exc:
                    raise exc
                # Best effort: a failing rule is recorded as "no result".
                outcome = None
            outcomes[rule_name] = outcome

        report = {"evaluation": self._evaluator(outcomes), "rules": outcomes}
        self._post_run(record, report)
        return report

    def _post_run(self, record, result):
        """Hook called with the record and the result dict; no-op by default."""
        return None
|
||
|
||
class EURecordCurator(BaseCurator):
    """Curator to check records for EC community.

    Rules are read from the ``CURATION_EU_RULES`` config. Their results are
    turned into a score that is compared against
    ``CURATION_EU_CURATION_THRESHOLD``.
    """

    # Points awarded per truthy rule result. The keys must be the rule names
    # used in ``CURATION_EU_RULES``, because the results dict is keyed by them.
    # TODO put in config?
    _RULE_SCORES = {
        "award_acronym_in_title": 5,
        "award_acronym_in_description": 10,
    }
    # A truthy result for this rule rejects the record regardless of its score.
    _VETO_RULE = "test_phrases_in_record"

    def _evaluator(self, results):
        """Evaluate result for EC curation.

        :param results: dict of rule name -> rule result (``None`` when the
            rule failed or did not run).
        :returns: ``False`` if the veto rule matched or no threshold is
            configured; otherwise whether the summed score reaches the
            threshold.
        """
        if results.get(self._VETO_RULE):
            return False

        threshold = current_app.config.get("CURATION_EU_CURATION_THRESHOLD")
        if threshold is None:
            # Without a configured threshold there is nothing to compare the
            # score against; never approve rather than fail on ``int >= None``.
            return False

        score = sum(
            points
            for rule, points in self._RULE_SCORES.items()
            if results.get(rule)
        )
        return score >= threshold

    @property
    def rules(self):
        """Get rules to run from config (empty mapping when unset)."""
        return current_app.config.get("CURATION_EU_RULES", {})

    def _post_run(self, record, result):
        """Log the result (dry run) or add an approved record to the community.

        :param record: curated record; ``record.pid.pid_value`` identifies it.
        :param result: dict produced by ``run`` (``"evaluation"``, ``"rules"``).
        """
        if self.dry:
            # The result needs its own placeholder; a lone mapping argument
            # with no placeholder in the message is silently dropped.
            current_app.logger.info(
                "Processed record ID: %s, result: %s",
                record.pid.pid_value,
                result,
            )  # TODO use error? Or should we log from the task
            return
        if result["evaluation"]:
            with UnitOfWork() as uow:
                current_record_communities_service.bulk_add(
                    system_identity,
                    current_app.config.get("EU_COMMUNITY_ID"),
                    [record.pid.pid_value],
                    uow=uow,
                )
                uow.commit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2024 CERN. | ||
# | ||
# ZenodoRDM is free software; you can redistribute it and/or modify | ||
# it under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""ZenodoRDM Curation module.""" | ||
|
||
from types import SimpleNamespace | ||
|
||
from flask import current_app | ||
from werkzeug.utils import cached_property | ||
|
||
from . import config | ||
|
||
|
||
class ZenodoCuration:
    """Flask extension wiring the curation module into an application."""

    def __init__(self, app=None):
        """Create the extension, initializing ``app`` right away if given."""
        if not app:
            return
        self.init_app(app)

    @staticmethod
    def init_config(app):
        """Copy every ``CURATION_*`` default from the config module.

        Values already present in ``app.config`` are left untouched.
        """
        curation_keys = (
            name for name in dir(config) if name.startswith("CURATION_")
        )
        for name in curation_keys:
            app.config.setdefault(name, getattr(config, name))

    def init_app(self, app):
        """Load config defaults and register under ``"zenodo-curation"``."""
        self.init_config(app)
        app.extensions["zenodo-curation"] = self
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# -*- coding: utf-8 -*- | ||
Check failure on line 1 in site/zenodo_rdm/curation/rules.py
|
||
# | ||
# Copyright (C) 2024 CERN. | ||
# | ||
# ZenodoRDM is free software; you can redistribute it and/or modify | ||
# it under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""Rules for curation.""" | ||
|
||
from flask import current_app | ||
from invenio_records_resources.proxies import current_service_registry | ||
|
||
|
||
def award_acronym_in_description(record):
    """Check if the acronym of an EU award appears in the record description.

    The comparison is a case-insensitive substring match.

    :param record: object whose ``metadata`` mapping may hold ``description``
        and ``funding`` entries.
    :returns: ``True`` if any resolvable award of the matching funder has a
        non-empty acronym contained in the description, else ``False``.
    """
    description = (record.metadata.get("description") or "").lower()
    funding = record.metadata.get("funding") or []

    # Award ids of funding entries for funder "00k4n6c32".
    # NOTE(review): presumably the European Commission identifier -- confirm.
    # Entries without a funder or without an award id are skipped; such awards
    # cannot be resolved.
    award_ids = [
        entry["award"]["id"]
        for entry in funding
        if (entry.get("funder") or {}).get("id") == "00k4n6c32"
        and (entry.get("award") or {}).get("id")
    ]
    if not description or not award_ids:
        # Nothing to compare, so skip resolving the awards service entirely.
        return False

    award_service = current_service_registry.get("awards")
    for award_id in award_ids:
        award = award_service.record_cls.pid.resolve(award_id)
        acronym = (award.get("acronym") or "").lower()
        # An empty acronym would be a substring of any text; ignore it.
        if acronym and acronym in description:
            return True
    return False
|
||
|
||
def award_acronym_in_title(record):
    """Check if the acronym of an EU award appears in the record title.

    The comparison is a case-insensitive substring match.

    :param record: object whose ``metadata`` mapping may hold ``title`` and
        ``funding`` entries.
    :returns: ``True`` if any resolvable award of the matching funder has a
        non-empty acronym contained in the title, else ``False``.
    """
    title = (record.metadata.get("title") or "").lower()
    funding = record.metadata.get("funding") or []

    # Award ids of funding entries for funder "00k4n6c32".
    # NOTE(review): presumably the European Commission identifier -- confirm.
    # Entries without a funder or without an award id are skipped; such awards
    # cannot be resolved.
    award_ids = [
        entry["award"]["id"]
        for entry in funding
        if (entry.get("funder") or {}).get("id") == "00k4n6c32"
        and (entry.get("award") or {}).get("id")
    ]
    if not title or not award_ids:
        # Nothing to compare, so skip resolving the awards service entirely.
        return False

    award_service = current_service_registry.get("awards")
    for award_id in award_ids:
        award = award_service.record_cls.pid.resolve(award_id)
        acronym = (award.get("acronym") or "").lower()
        # An empty acronym would be a substring of any text; ignore it.
        if acronym and acronym in title:
            return True
    return False
|
||
|
||
def test_phrases_in_record(record):
    """Check if any configured test phrase occurs in the title or description.

    Phrases come from the ``CURATION_TEST_PHRASES`` config and are matched as
    case-insensitive substrings of ``"<title> <description>"``.

    :param record: object whose ``metadata`` mapping may hold ``title`` and
        ``description``.
    :returns: ``True`` if at least one non-empty phrase matches, else ``False``
        (also when no phrases are configured).
    """
    # The key may be unset; treat that as "no phrases" rather than failing.
    test_phrases = current_app.config.get("CURATION_TEST_PHRASES") or []

    title = record.metadata.get("title") or ""
    description = record.metadata.get("description") or ""
    # Lower-case once instead of once per phrase.
    record_data = f"{title} {description}".lower()

    # Empty phrases are skipped: "" is a substring of every string.
    return any(
        bool(phrase) and phrase.lower() in record_data for phrase in test_phrases
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
from datetime import datetime, timedelta | ||
Check failure on line 1 in site/zenodo_rdm/curation/tasks.py
|
||
|
||
from celery import shared_task | ||
from flask import current_app | ||
from invenio_access.permissions import system_identity | ||
from invenio_rdm_records.proxies import current_rdm_records_service as records_service | ||
from invenio_search.engine import dsl | ||
from zenodo_rdm.curation.curators import EURecordCurator | ||
|
||
|
||
@shared_task
def run_eu_record_curation(since):
    """Run EC Curator over recently updated, EU-funded records.

    Selects records that have funder ``00k4n6c32``, are not deleted, were
    created at least 30 days ago, were updated at or after ``since`` and are
    not yet in the community configured as ``EU_COMMUNITY_ID``. Each selected
    record is passed to ``EURecordCurator``. The curator runs in dry mode
    unless ``CURATION_ENABLE_EU_CURATOR`` is truthy.

    :param since: ISO 8601 string (parsed with ``datetime.fromisoformat``)
        giving the lower bound on the records' ``updated`` timestamp.
    """
    ctx = {"processed": 0, "approved": 0, "failed": 0, "since": since}
    dry_run = not current_app.config.get("CURATION_ENABLE_EU_CURATOR")
    curator = EURecordCurator(dry=dry_run)

    query = dsl.Q(
        "bool",
        must=[
            dsl.Q("term", **{"metadata.funding.funder.id": "00k4n6c32"}),
            dsl.Q("term", **{"is_deleted": False}),
            dsl.Q(
                "range",
                created={
                    "lte": (datetime.now() - timedelta(days=30)).isoformat(),
                },
            ),
            dsl.Q(
                "range",
                updated={
                    "gte": datetime.fromisoformat(since).isoformat(),
                },
            ),
        ],
        must_not=[
            dsl.Q(
                "term",
                **{"parent.communities.ids": current_app.config.get("EU_COMMUNITY_ID")},
            )
        ],
    )
    search = records_service.create_search(
        system_identity,
        records_service.record_cls,
        records_service.config.search,
        extra_filter=query,
    )

    for item in search.scan():
        record = records_service.record_cls.pid.resolve(item["id"])
        try:
            result = curator.run(record=record)
        except Exception:
            # NOTE Since curator's raise_exc is by default false, rules would not fail.
            # This catches failure due to other reasons
            ctx["failed"] += 1
            # No result exists for this record; skip the approval check so an
            # unbound or stale ``result`` is never read.
            continue
        ctx["processed"] += 1
        if result["evaluation"]:
            ctx["approved"] += 1

    # NOTE(review): the summary is logged at error level -- presumably so it
    # surfaces in error monitoring; confirm this is intended.
    current_app.logger.error(
        "EU curation processed",
        extra=ctx,
    )