Skip to content

Commit

Permalink
curation: init module; add EURecordCurator
Browse files Browse the repository at this point in the history
  • Loading branch information
yashlamba committed Nov 26, 2024
1 parent 5ffb2ce commit 60108af
Show file tree
Hide file tree
Showing 7 changed files with 281 additions and 0 deletions.
3 changes: 3 additions & 0 deletions site/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,14 @@ invenio_base.apps =
zenodo_rdm_moderation = zenodo_rdm.moderation.ext:ZenodoModeration
invenio_openaire = zenodo_rdm.openaire.ext:OpenAIRE
zenodo_rdm_stats = zenodo_rdm.stats.ext:ZenodoStats
zenodo_rdm_curation = zenodo_rdm.curation.ext:ZenodoCuration
invenio_base.api_apps =
zenodo_rdm_legacy = zenodo_rdm.legacy.ext:ZenodoLegacy
profiler = zenodo_rdm.profiler:Profiler
zenodo_rdm_metrics = zenodo_rdm.metrics.ext:ZenodoMetrics
zenodo_rdm_moderation = zenodo_rdm.moderation.ext:ZenodoModeration
invenio_openaire = zenodo_rdm.openaire.ext:OpenAIRE
zenodo_rdm_curation = zenodo_rdm.curation.ext:ZenodoCuration
invenio_base.api_blueprints =
zenodo_rdm_legacy = zenodo_rdm.legacy.views:blueprint
zenodo_rdm_legacy_records = zenodo_rdm.legacy.views:create_legacy_records_bp
Expand All @@ -69,6 +71,7 @@ invenio_celery.tasks =
zenodo_rdm_openaire = zenodo_rdm.openaire.tasks
zenodo_rdm_moderation = zenodo_rdm.moderation.tasks
zenodo_stats = zenodo_rdm.stats.tasks
zenodo_rdm_curations = zenodo_rdm.curation.tasks
invenio_oauth2server.scopes =
deposit_write_scope = zenodo_rdm.legacy.scopes:deposit_write_scope
deposit_actions_scope = zenodo_rdm.legacy.scopes:deposit_actions_scope
Expand Down
7 changes: 7 additions & 0 deletions site/zenodo_rdm/curation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2023 CERN.
#
# Zenodo-RDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""Curation module."""
24 changes: 24 additions & 0 deletions site/zenodo_rdm/curation/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Moderation config."""

from .rules import (
award_acronym_in_description,
award_acronym_in_title,
test_phrases_in_record,
)

CURATION_EU_RULES = {
    "award_acronym_in_title": award_acronym_in_title,
    "award_acronym_in_description": award_acronym_in_description,
    "test_phrases_in_record": test_phrases_in_record,
}
"""Rules to run for EU Curation, keyed by the name reported in the results."""

CURATION_EU_CURATION_THRESHOLD = 10
"""Minimum score a record must reach for the EU curator to approve it.

NOTE(review): 10 is a provisional default so that the evaluator has a number
to compare against; confirm the intended value before enabling the curator.
"""

CURATION_TEST_PHRASES = []
"""Phrases that mark a record as a test record when found in title/description.

Empty by default so that the ``test_phrases_in_record`` rule is a no-op until
phrases are configured.
"""

CURATION_ENABLE_EU_CURATOR = False
"""Enable the EU curator; when False the curation task only does a dry run."""
91 changes: 91 additions & 0 deletions site/zenodo_rdm/curation/curators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Curators for ZenodoRDM Curation."""


from flask import current_app
from invenio_access.permissions import system_identity
from invenio_rdm_records.proxies import current_record_communities_service
from invenio_records_resources.services.uow import UnitOfWork


class BaseCurator:
    """Skeleton shared by curators.

    A subclass supplies ``rules`` (a mapping of rule name to a callable taking
    the record) and ``_evaluator`` (which turns the per-rule results into a
    final verdict). ``run`` drives both and then calls ``_post_run``.
    """

    def __init__(self, dry=False, raise_exc=False) -> None:
        """Store the run options.

        :param dry: flag kept on the instance for subclasses to consult.
        :param raise_exc: when true, an exception raised by a rule is
            re-raised from ``run`` instead of being recorded as ``None``.
        """
        self.raise_exc = raise_exc
        self.dry = dry

    @property
    def rules(self):
        """Mapping of rule name to rule callable; must be overridden."""
        raise NotImplementedError()

    def _evaluator(self, results):
        """Compute the final evaluation from the results dict; must be overridden."""
        raise NotImplementedError()

    def _post_run(self, record, result):
        """Hook called with the record and the result dict; no-op by default."""

    def run(self, record):
        """Apply every rule to ``record`` and evaluate the outcome.

        :returns: ``{"evaluation": <verdict>, "rules": {<name>: <result>}}``,
            where a rule that raised is stored as ``None`` unless
            ``raise_exc`` is set.
        """
        outcomes = {}
        for rule_name, check in self.rules.items():
            try:
                outcome = check(record)
            except Exception as e:
                if self.raise_exc:
                    raise e
                # Best effort: remember that the rule could not be computed.
                outcome = None
            outcomes[rule_name] = outcome

        summary = {"evaluation": self._evaluator(outcomes), "rules": outcomes}
        self._post_run(record, summary)
        return summary


class EURecordCurator(BaseCurator):
    """Curator to check records for EC community.

    Runs the rules configured in ``CURATION_EU_RULES``, turns their results
    into a score and, outside of dry-run mode, adds approved records to the
    community configured as ``EU_COMMUNITY_ID``.
    """

    def _evaluator(self, results):
        """Evaluate result for EC curation.

        :param results: dict of rule name to rule result, keyed by the names
            used in ``CURATION_EU_RULES``. A rule that failed is ``None`` and
            is treated like a negative result.
        :returns: ``True`` if the score reaches
            ``CURATION_EU_CURATION_THRESHOLD``; ``False`` otherwise, when a
            test phrase was found, or when no threshold is configured.
        """
        # The names below must match the keys of ``CURATION_EU_RULES``: the
        # results dict is keyed by those names, so any other spelling silently
        # scores zero.
        if results.get("test_phrases_in_record"):
            # A test-phrase hit vetoes the record regardless of its score.
            return False

        score = 0
        # TODO put in config?
        if results.get("award_acronym_in_title"):
            score += 5
        if results.get("award_acronym_in_description"):
            score += 10

        threshold = current_app.config.get("CURATION_EU_CURATION_THRESHOLD")
        if threshold is None:
            # Without a configured threshold ``score >= None`` would raise;
            # be conservative and never approve.
            return False
        return score >= threshold

    @property
    def rules(self):
        """Get rules to run from config."""
        return current_app.config.get("CURATION_EU_RULES", {})

    def _post_run(self, record, result):
        """Actions to take after run.

        In dry-run mode only log the result. Otherwise, when the evaluation is
        positive, add the record to the EU community as the system identity.
        """
        if self.dry:
            # Format the result into the message: passing the dict as the
            # sole logging argument to a message without placeholders would
            # drop it from the output.
            current_app.logger.info(
                "Processed record ID: %s, result: %s",
                record.pid.pid_value,
                result,
            )  # TODO use error? Or should we log from the task
            return
        if result["evaluation"]:
            with UnitOfWork() as uow:
                current_record_communities_service.bulk_add(
                    system_identity,
                    current_app.config.get("EU_COMMUNITY_ID"),
                    [record.pid.pid_value],
                    uow=uow,
                )
                uow.commit()
36 changes: 36 additions & 0 deletions site/zenodo_rdm/curation/ext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""ZenodoRDM Curation module."""

from types import SimpleNamespace

from flask import current_app
from werkzeug.utils import cached_property

from . import config


class ZenodoCuration:
    """Flask extension that registers Zenodo content curation on an app."""

    def __init__(self, app=None):
        """Create the extension, initializing ``app`` right away if given."""
        if app:
            self.init_app(app)

    def init_app(self, app):
        """Load the default config and register under ``zenodo-curation``."""
        self.init_config(app)
        app.extensions["zenodo-curation"] = self

    @staticmethod
    def init_config(app):
        """Copy ``CURATION_*`` defaults from the config module into the app.

        ``setdefault`` is used, so values already present in ``app.config``
        are left untouched.
        """
        curation_keys = (
            name for name in dir(config) if name.startswith("CURATION_")
        )
        for name in curation_keys:
            app.config.setdefault(name, getattr(config, name))
55 changes: 55 additions & 0 deletions site/zenodo_rdm/curation/rules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-

Check failure on line 1 in site/zenodo_rdm/curation/rules.py

View workflow job for this annotation

GitHub Actions / Python (site, 3.9, postgresql14, opensearch2)

pydocstyle-check /home/runner/work/zenodo-rdm/zenodo-rdm/site/zenodo_rdm/curation/rules.py:15 in public function `award_acronym_in_description`: D202: No blank lines allowed after function docstring (found 1) /home/runner/work/zenodo-rdm/zenodo-rdm/site/zenodo_rdm/curation/rules.py:31 in public function `award_acronym_in_title`: D202: No blank lines allowed after function docstring (found 1) /home/runner/work/zenodo-rdm/zenodo-rdm/site/zenodo_rdm/curation/rules.py:47 in public function `test_phrases_in_record`: D202: No blank lines allowed after function docstring (found 1)
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Rules for curation."""

from flask import current_app
from invenio_records_resources.proxies import current_service_registry


# Funder id that both award rules filter on.
_EU_FUNDER_ID = "00k4n6c32"


def _eu_award_acronym_in(record, field):
    """Check if an EU award acronym occurs in ``record.metadata[field]``.

    The comparison is a case-insensitive substring match. Funding entries
    from other funders, entries without an award id, and awards without an
    acronym are skipped.

    :param record: record whose ``metadata`` holds ``field`` and ``funding``.
    :param field: metadata key of the text to search (e.g. ``"title"``).
    :returns: ``True`` on the first matching acronym, otherwise ``False``.
    """
    text = (record.metadata.get(field) or "").lower()
    if not text:
        # Nothing to search in (the field is missing or empty).
        return False

    award_cls = current_service_registry.get("awards").record_cls
    for funding in record.metadata.get("funding") or []:
        if funding.get("funder", {}).get("id") != _EU_FUNDER_ID:
            continue
        award_id = funding.get("award", {}).get("id")
        if not award_id:
            continue
        award = award_cls.pid.resolve(award_id)
        acronym = award.get("acronym")
        # Guard against empty acronyms: ``"" in text`` is always true.
        if acronym and acronym.lower() in text:
            return True
    return False


def award_acronym_in_description(record):
    """Check if EU award name in record description."""
    return _eu_award_acronym_in(record, "description")


def award_acronym_in_title(record):
    """Check if EU award name in record title."""
    return _eu_award_acronym_in(record, "title")


def test_phrases_in_record(record):
    """Check if test words in record.

    Looks for each phrase of ``CURATION_TEST_PHRASES`` (case-insensitively)
    in the record title and description.

    :param record: record whose ``metadata`` holds ``title``/``description``.
    :returns: ``True`` if any configured phrase is found, else ``False``
        (including when no phrases are configured).
    """
    test_phrases = current_app.config.get("CURATION_TEST_PHRASES") or []
    metadata = record.metadata
    # Lower-case the searched text once; a missing field counts as empty.
    record_data = " ".join(
        (metadata.get("title") or "", metadata.get("description") or "")
    ).lower()

    # Skip empty phrases, which would otherwise match every record.
    return any(
        phrase and phrase.lower() in record_data for phrase in test_phrases
    )
65 changes: 65 additions & 0 deletions site/zenodo_rdm/curation/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from datetime import datetime, timedelta

Check failure on line 1 in site/zenodo_rdm/curation/tasks.py

View workflow job for this annotation

GitHub Actions / Python (site, 3.9, postgresql14, opensearch2)

pydocstyle-check /home/runner/work/zenodo-rdm/zenodo-rdm/site/zenodo_rdm/curation/tasks.py:1 at module level: D100: Missing docstring in public module

Check failure on line 1 in site/zenodo_rdm/curation/tasks.py

View workflow job for this annotation

GitHub Actions / Python (site, 3.9, postgresql14, opensearch2)

isort-check from invenio_access.permissions import system_identity from invenio_rdm_records.proxies import current_rdm_records_service as records_service from invenio_search.engine import dsl + from zenodo_rdm.curation.curators import EURecordCurator

from celery import shared_task
from flask import current_app
from invenio_access.permissions import system_identity
from invenio_rdm_records.proxies import current_rdm_records_service as records_service
from invenio_search.engine import dsl
from zenodo_rdm.curation.curators import EURecordCurator


@shared_task
def run_eu_record_curation(since):
    """Run EC Curator.

    Searches for non-deleted records funded by funder ``00k4n6c32`` that were
    created at least 30 days ago, updated since ``since`` and are not yet in
    the ``EU_COMMUNITY_ID`` community, and runs the EU curator on each one.
    The curator runs in dry-run mode unless ``CURATION_ENABLE_EU_CURATOR`` is
    truthy.

    :param since: ISO-8601 date/time string (parsed with
        ``datetime.fromisoformat``); only records updated at or after it are
        considered.
    """
    ctx = {"processed": 0, "approved": 0, "failed": 0, "since": since}
    dry_run = not current_app.config.get("CURATION_ENABLE_EU_CURATOR")
    curator = EURecordCurator(dry=dry_run)

    # NOTE(review): ``datetime.now()`` is naive local time; confirm it matches
    # the timezone of the indexed ``created`` timestamps.
    query = dsl.Q(
        "bool",
        must=[
            dsl.Q("term", **{"metadata.funding.funder.id": "00k4n6c32"}),
            dsl.Q("term", **{"is_deleted": False}),
            dsl.Q(
                "range",
                created={
                    "lte": (datetime.now() - timedelta(days=30)).isoformat(),
                },
            ),
            dsl.Q(
                "range",
                updated={
                    "gte": datetime.fromisoformat(since).isoformat(),
                },
            ),
        ],
        must_not=[
            dsl.Q(
                "term",
                **{"parent.communities.ids": current_app.config.get("EU_COMMUNITY_ID")},
            )
        ],
    )
    search = records_service.create_search(
        system_identity,
        records_service.record_cls,
        records_service.config.search,
        extra_filter=query,
    )

    for item in search.scan():
        record_id = item["id"]
        try:
            record = records_service.record_cls.pid.resolve(record_id)
            result = curator.run(record=record)
        except Exception:
            # NOTE Since curator's raise_exc is by default false, rules would not fail.
            # This catches failure due to other reasons
            ctx["failed"] += 1
            current_app.logger.exception(
                "EU curation failed for record %s", record_id
            )
            # Skip the approval check: there is no result for this record.
            continue

        ctx["processed"] += 1
        if result["evaluation"]:
            ctx["approved"] += 1

    # NOTE(review): summary is logged at error level as in the original;
    # confirm whether that is intentional.
    current_app.logger.error(
        "EU curation processed",
        extra=ctx,
    )

0 comments on commit 60108af

Please sign in to comment.