Skip to content

Commit

Permalink
First application commit
Browse files Browse the repository at this point in the history
  • Loading branch information
pdelboca committed Dec 20, 2023
1 parent 472f3fd commit 4d81ba8
Show file tree
Hide file tree
Showing 30 changed files with 1,043 additions and 0 deletions.
28 changes: 28 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Tag of the official Python base image; override with
# `--build-arg PYTHON_VERSION=<tag>`.
ARG PYTHON_VERSION=3.11-slim-bullseye

FROM python:${PYTHON_VERSION}

# Do not write .pyc files into the image and keep stdout/stderr unbuffered
# so that application logs reach the container log stream immediately.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# WORKDIR creates the directory when it does not exist yet.
WORKDIR /code

# Install dependencies
# wget: django-dcat dump command requires wget
# git: to install django-dcat from github
# ca-certificates: listed explicitly because recommended packages are skipped
# and both tools need it for HTTPS.
# The package index is removed in the same layer so it never ends up in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends ca-certificates wget git && \
    rm -rf /var/lib/apt/lists/*

# Copy only the requirements first so the dependency layer stays cached
# until requirements.txt itself changes.
COPY requirements.txt /tmp/requirements.txt
RUN set -ex && \
    pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /tmp/requirements.txt
COPY . /code

RUN python manage.py collectstatic --noinput

EXPOSE 8000

CMD ["gunicorn", "--bind", ":8000", "--workers", "2", "catalogosocial.wsgi"]
Empty file added catalogo/__init__.py
Empty file.
3 changes: 3 additions & 0 deletions catalogo/admin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from django.contrib import admin

# Register your models here.
6 changes: 6 additions & 0 deletions catalogo/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class CatalogoConfig(AppConfig):
    """Application configuration for the ``catalogo`` Django app."""

    # Dotted path / label under which the app is registered.
    name = "catalogo"

    # Primary-key field type used for models that do not declare one.
    default_auto_field = "django.db.models.BigAutoField"
Empty file.
184 changes: 184 additions & 0 deletions catalogo/management/commands/importar_desde_datajson.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""
This script imports the contents of a data.json file into the application's data model.
It is a copy of the original import_from_datajson.py script from django-dcat, but in this
case it also imports the original URL of each distribution and the ID the distribution has
in the official open data portal (to keep traceability back to the data.json files).
"""

import json
import pathlib

from os import listdir

from django.core.exceptions import ObjectDoesNotExist
from django.core.files.base import ContentFile
from django.core.management.base import BaseCommand

from dcat.models import (
Catalog,
Dataset,
Distribution,
Agent,
MediaType,
LicenceDocument,
DataTheme,
)

from catalogo.models import DatasetExtras, DistributionExtras


class Command(BaseCommand):
    """Import a DCAT-US ``data.json`` file into the dcat models.

    A new ``Catalog`` is created on every run, followed by one ``Dataset``
    per entry of the ``dataset`` array and one ``Distribution`` per entry of
    each dataset's ``distribution`` array. For every dataset and distribution
    a ``DatasetExtras`` / ``DistributionExtras`` row is stored as well,
    holding the original identifier and URLs found in the file.
    """

    help = "Import data from a DCAT-US file provided by ckanext-datajson."

    def _get_content_file(self, dataset, distribution, datapath="data"):
        """Return a ContentFile for the distribution, or None if there is none.

        This takes into consideration the following contents in the folder
        where the command is executed:
            - data.json
            - data/
              - {dataset_identifier}
                - {distribution_identifier}
                  - some-file.csv

        The first entry returned by ``listdir`` for the distribution folder
        is read. When the folder is missing or empty an error is written to
        stdout and ``None`` is returned.
        """
        file_folder = (
            f'{datapath}/{dataset.get("identifier")}/{distribution.get("identifier")}'
        )
        try:
            local_file_name = listdir(file_folder)[0]
        except FileNotFoundError:
            # A missing folder is reported like an empty one instead of
            # aborting the whole import.
            msg = f'{distribution.get("identifier")} folder does not exist'
            self.stdout.write(self.style.ERROR(msg))
            return None
        except IndexError:
            msg = f'{distribution.get("identifier")} folder does not have a file'
            self.stdout.write(self.style.ERROR(msg))
            return None

        # Read through a context manager so the handle is always closed.
        with open(f"{file_folder}/{local_file_name}", mode="rb") as local_file:
            content = local_file.read()
        return ContentFile(content, name=distribution.get("fileName"))

    def add_arguments(self, parser):
        """Register the ``--file`` and ``--datapath`` command line options."""
        parser.add_argument(
            "--file", type=open, help="Path to the data.json file", default="data.json"
        )
        parser.add_argument(
            "--datapath",
            type=pathlib.Path,
            help="Path to the data folder",
            default="data",
        )

    def handle(self, *args, **options):
        """Run the import: catalog first, then every dataset with its distributions."""
        # Accept a plain string as well as a Path for the data folder.
        datapath = pathlib.Path(options.get("datapath"))

        # The file handle was opened by the "--file" option (type=open);
        # the with block closes it on every exit path, including the early return.
        with options.get("file") as data_file:
            if not datapath.exists():
                msg = f"{datapath} path to data does not exist."
                self.stdout.write(self.style.ERROR(msg))
                return
            data = json.load(data_file)

        catalog = self._import_catalog(data)

        # Import Datasets
        for dataset in data.get("dataset") or []:
            self._import_dataset(catalog, dataset, datapath)

        self.stdout.write(self.style.SUCCESS("Data imported successfully"))

    def _get_publisher(self, node):
        """Return the Agent matching the ``publisher`` entry of a catalog/dataset node."""
        publisher = node.get("publisher")
        agent, _ = Agent.objects.get_or_create(
            name=publisher.get("name"),
            mbox=publisher.get("mbox", ""),
        )
        return agent

    def _import_catalog(self, data):
        """Create the Catalog described by the top level of ``data`` and attach its themes."""
        title = data.get("title")
        description = data.get("description")
        publisher = self._get_publisher(data)
        catalog_licence, _ = LicenceDocument.objects.get_or_create(
            label=data.get("license")
        )
        catalog = Catalog.objects.create(
            title=title,
            description=description,
            publisher=publisher,
            licence=catalog_licence,
        )

        for theme in data.get("themeTaxonomy", []):
            theme_obj, _ = DataTheme.objects.get_or_create(
                code=theme.get("id"),
                label=theme.get("label"),
                description=theme.get("description"),
            )
            catalog.themes.add(theme_obj)

        return catalog

    def _import_dataset(self, catalog, dataset, datapath):
        """Create one Dataset (with extras and themes) and import its distributions."""
        dataset_created = Dataset.objects.create(
            title=dataset.get("title"),
            description=dataset.get("description"),
            publisher=self._get_publisher(dataset),
            catalog=catalog,
        )

        _dataset_extras = DatasetExtras.objects.create(
            dataset=dataset_created,
            original_landing_page=dataset.get("landingPage"),
            original_id=dataset.get("identifier", ""),
        )
        dataset_created.extras = _dataset_extras
        dataset_created.save()

        for theme in dataset.get("theme", []):
            try:
                dataset_theme = DataTheme.objects.get(code=theme)
            except ObjectDoesNotExist:
                msg = f"Theme {theme} of {dataset.get('identifier')} does not exist"
                self.stdout.write(self.style.WARNING(msg))
                # Skip unknown themes; there is nothing valid to attach.
                continue
            dataset_created.themes.add(dataset_theme)

        # Import Distributions
        for distribution in dataset.get("distribution", []):
            self._import_distribution(dataset, dataset_created, distribution, datapath)

    def _import_distribution(self, dataset, dataset_created, distribution, datapath):
        """Create one Distribution of ``dataset_created`` together with its extras."""
        distribution_info = {
            "dataset": dataset_created,
            "title": distribution.get("title"),
            "description": distribution.get("description", ""),
            "file": self._get_content_file(dataset, distribution, datapath=datapath),
        }

        if not distribution.get("fileName"):
            # If the file name is not provided, the dataset is hosted
            # in another portal. We add the download_url instead.
            external_download = distribution.get("downloadURL")
            if external_download:
                distribution_info["external_download_url"] = external_download
            else:
                msg = f'{distribution.get("identifier")} does not have a file name or a download url'
                self.stdout.write(self.style.ERROR(msg))

        # Normalise e.g. ".csv " to "CSV" before looking up the media type.
        extension = distribution.get("format", "").strip(". ").upper()
        if extension:
            media_type, _ = MediaType.objects.get_or_create(extension=extension)
            distribution_info["format"] = media_type

        licence_label = distribution.get("license")
        if licence_label:
            licence, _ = LicenceDocument.objects.get_or_create(label=licence_label)
            distribution_info["licence"] = licence

        distribution_created = Distribution.objects.create(**distribution_info)

        _distribution_extras = DistributionExtras.objects.create(
            distribution=distribution_created,
            original_access_url=distribution.get("accessURL"),
            original_download_url=distribution.get("downloadURL"),
            original_id=distribution.get("identifier", ""),
        )
        distribution_created.extras = _distribution_extras
        distribution_created.save()
64 changes: 64 additions & 0 deletions catalogo/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Generated by Django 5.0 on 2023-12-18 20:23

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    """Initial migration of the app: creates DatasetExtras and DistributionExtras."""

    initial = True

    # Both models point at models of the dcat app, so its initial
    # migration has to be applied first.
    dependencies = [("dcat", "0001_initial")]

    operations = [
        # One-to-one companion row for dcat.Dataset.
        migrations.CreateModel(
            name="DatasetExtras",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        verbose_name="ID",
                        serialize=False,
                        auto_created=True,
                        primary_key=True,
                    ),
                ),
                ("original_landing_page", models.URLField(null=True, blank=True)),
                ("original_id", models.CharField(max_length=255, blank=True)),
                (
                    "dataset",
                    models.OneToOneField(
                        to="dcat.dataset",
                        related_name="extras",
                        on_delete=django.db.models.deletion.CASCADE,
                    ),
                ),
            ],
        ),
        # One-to-one companion row for dcat.Distribution.
        migrations.CreateModel(
            name="DistributionExtras",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        verbose_name="ID",
                        serialize=False,
                        auto_created=True,
                        primary_key=True,
                    ),
                ),
                ("original_access_url", models.URLField(null=True, blank=True)),
                ("original_download_url", models.URLField(null=True, blank=True)),
                ("original_id", models.CharField(max_length=255, blank=True)),
                (
                    "distribution",
                    models.OneToOneField(
                        to="dcat.distribution",
                        related_name="extras",
                        on_delete=django.db.models.deletion.CASCADE,
                    ),
                ),
            ],
        ),
    ]
Empty file added catalogo/migrations/__init__.py
Empty file.
19 changes: 19 additions & 0 deletions catalogo/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from django.db import models
from dcat.models import Dataset, Distribution


class DatasetExtras(models.Model):
    """Extra fields stored alongside a dcat ``Dataset`` (one row per dataset)."""

    # Reachable from the dataset side as ``dataset.extras``; deleted with it.
    dataset = models.OneToOneField(
        Dataset,
        related_name="extras",
        on_delete=models.CASCADE,
    )
    # Optional original landing page URL.
    original_landing_page = models.URLField(null=True, blank=True)
    # Original identifier; may be left empty.
    original_id = models.CharField(blank=True, max_length=255)


class DistributionExtras(models.Model):
    """Extra fields stored alongside a dcat ``Distribution`` (one row per distribution)."""

    # Reachable from the distribution side as ``distribution.extras``; deleted with it.
    distribution = models.OneToOneField(
        Distribution,
        related_name="extras",
        on_delete=models.CASCADE,
    )
    # Optional original access and download URLs.
    original_access_url = models.URLField(null=True, blank=True)
    original_download_url = models.URLField(null=True, blank=True)
    # Original identifier; may be left empty.
    original_id = models.CharField(blank=True, max_length=255)
Loading

0 comments on commit 4d81ba8

Please sign in to comment.