From 43203b83e87cba3bf5d3d6b4d04f47484faffaaa Mon Sep 17 00:00:00 2001 From: Paul Walsh Date: Mon, 18 Nov 2024 10:23:58 +0200 Subject: [PATCH 1/7] Proof of concept for xloader site url --- ckanext/xloader/action.py | 23 ++++++++++++----------- ckanext/xloader/command.py | 11 +++++------ ckanext/xloader/config_declaration.yaml | 8 ++++++-- ckanext/xloader/plugin.py | 12 +++++------- ckanext/xloader/tests/test_jobs.py | 23 ++++++++++++----------- 5 files changed, 40 insertions(+), 37 deletions(-) diff --git a/ckanext/xloader/action.py b/ckanext/xloader/action.py index c0f3f84f..f159d2f3 100644 --- a/ckanext/xloader/action.py +++ b/ckanext/xloader/action.py @@ -140,17 +140,18 @@ def xloader_submit(context, data_dict): qualified=True ) data = { - 'api_key': utils.get_xloader_user_apitoken(), - 'job_type': 'xloader_to_datastore', - 'result_url': callback_url, - 'metadata': { - 'ignore_hash': data_dict.get('ignore_hash', False), - 'ckan_url': config['ckan.site_url'], - 'resource_id': res_id, - 'set_url_type': data_dict.get('set_url_type', False), - 'task_created': task['last_updated'], - 'original_url': resource_dict.get('url'), - } + "api_key": utils.get_xloader_user_apitoken(), + "job_type": "xloader_to_datastore", + "result_url": callback_url, + "metadata": { + "ignore_hash": data_dict.get("ignore_hash", False), + "ckan_url": config.get("ckanext.xloader.site_url") + or config["ckan.site_url"], + "resource_id": res_id, + "set_url_type": data_dict.get("set_url_type", False), + "task_created": task["last_updated"], + "original_url": resource_dict.get("url"), + }, } if custom_queue != rq_jobs.DEFAULT_QUEUE_NAME: # Don't automatically retry if it's a custom run diff --git a/ckanext/xloader/command.py b/ckanext/xloader/command.py index 4c0f2d2f..91a39acf 100644 --- a/ckanext/xloader/command.py +++ b/ckanext/xloader/command.py @@ -114,12 +114,11 @@ def _submit_resource(self, resource, user, indent=0, sync=False, queue=None): 'ignore_hash': True, } if sync: - data_dict['ckan_url'] = tk.config.get('ckan.site_url') - input_dict = { - 'metadata': data_dict, - 'api_key': 'TODO' - } - logger = logging.getLogger('ckanext.xloader.cli') + data_dict["ckan_url"] = tk.config.get( + "ckanext.xloader.site_url" + ) or tk.config.get("ckan.site_url") + input_dict = {"metadata": data_dict, "api_key": "TODO"} + logger = logging.getLogger("ckanext.xloader.cli") xloader_data_into_datastore_(input_dict, None, logger) else: if queue: diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index 6d85e896..be404c93 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -2,6 +2,12 @@ version: 1 groups: - annotation: ckanext-xloader settings options: + - key: ckanext.xloader.site_url + default: + description: | + Provide an alternate site URL for the xloader_submit action. + This is useful, for example, when the site is running within a docker network. + required: false - key: ckanext.xloader.jobs_db.uri default: sqlite:////tmp/xloader_jobs.db description: | @@ -152,5 +158,3 @@ groups: they will also display "complete", "active", "inactive", and "unknown". type: bool required: false - - diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index 07af8db7..a7869e60 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -61,13 +61,11 @@ def configure(self, config_): else: self.ignore_hash = False - for config_option in ("ckan.site_url",): - if not config_.get(config_option): - raise Exception( - "Config option `{0}` must be set to use ckanext-xloader.".format( - config_option - ) - ) + site_url_configs = ("ckan.site_url", "ckanext.xloader.site_url") + if not any(site_url_configs): + raise Exception( + f"One of config options {site_url_configs} must be set to use ckanext-xloader." + ) # IDomainObjectModification diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py index e819dad9..498eaa56 100644 --- a/ckanext/xloader/tests/test_jobs.py +++ b/ckanext/xloader/tests/test_jobs.py @@ -59,17 +59,18 @@ def data(create_with_upload, apikey): "api.action", ver=3, logic_function="xloader_hook", qualified=True ) return { - 'api_key': apikey, - 'job_type': 'xloader_to_datastore', - 'result_url': callback_url, - 'metadata': { - 'ignore_hash': True, - 'ckan_url': toolkit.config.get('ckan.site_url'), - 'resource_id': resource["id"], - 'set_url_type': False, - 'task_created': datetime.utcnow().isoformat(), - 'original_url': resource["url"], - } + "api_key": apikey, + "job_type": "xloader_to_datastore", + "result_url": callback_url, + "metadata": { + "ignore_hash": True, + "ckan_url": toolkit.config.get("ckanext.xloader.site_url") + or toolkit.config.get("ckan.site_url"), + "resource_id": resource["id"], + "set_url_type": False, + "task_created": datetime.utcnow().isoformat(), + "original_url": resource["url"], + }, } From da5c031693d842d9cd1396bd17e388d6ded2e92a Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 14 Jan 2025 10:35:13 +0100 Subject: [PATCH 2/7] Updates to config_declaration.yaml, jobs.py --- ckanext/xloader/config_declaration.yaml | 4 +++- ckanext/xloader/jobs.py | 22 +++++++++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index be404c93..4d1b6d50 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -3,10 +3,12 @@ groups: - annotation: ckanext-xloader settings options: - key: ckanext.xloader.site_url - default: + example: http://ckan-dev:5000 + default: http://ckan-dev:5000 description: | Provide an alternate site URL for the xloader_submit action. This is useful, for example, when the site is running within a docker network. + validators: configured_default("ckan.site_url",None) required: false - key: ckanext.xloader.jobs_db.uri default: sqlite:////tmp/xloader_jobs.db diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 3ac8ebba..3d77e16c 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -12,11 +12,13 @@ import sys from psycopg2 import errors -from six.moves.urllib.parse import urlsplit +from six.moves.urllib.parse import urlsplit, urlparse, urlunparse import requests from rq import get_current_job import sqlalchemy as sa +from urllib.parse import urljoin, urlunsplit + from ckan import model from ckan.plugins.toolkit import get_action, asbool, enqueue_job, ObjectNotFound, config @@ -79,9 +81,15 @@ def xloader_data_into_datastore(input): # First flag that this task is running, to indicate the job is not # stillborn, for when xloader_submit is deciding whether another job would # be a duplicate or not + + callback_url = config.get('ckanext.xloader.site_url') or config.get('ckan.site_url') + callback_url = urljoin( + callback_url.rstrip('/'), '/api/3/action/xloader_hook') + result_url = callback_url + job_dict = dict(metadata=input['metadata'], status='running') - callback_xloader_hook(result_url=input['result_url'], + callback_xloader_hook(result_url=result_url, api_key=input['api_key'], job_dict=job_dict) @@ -143,7 +151,7 @@ def xloader_data_into_datastore(input): errored = True finally: # job_dict is defined in xloader_hook's docstring - is_saved_ok = callback_xloader_hook(result_url=input['result_url'], + is_saved_ok = callback_xloader_hook(result_url=result_url, api_key=input['api_key'], job_dict=job_dict) errored = errored or not is_saved_ok @@ -285,6 +293,14 @@ def _download_resource_data(resource, data, api_key, logger): 'Only http, https, and ftp resources may be fetched.' ) + resource_uri = urlunsplit(('', '', url_parts.path, url_parts.query, url_parts.fragment)) + callback_url = config.get('ckanext.xloader.site_url') or config.get('ckan.site_url') + callback_url = urljoin( + callback_url.rstrip('/'), resource_uri) + + url = callback_url + url_parts = urlsplit(url) # reparse the url after the callback_url is set + # fetch the resource data logger.info('Fetching from: {0}'.format(url)) tmp_file = get_tmp_file(url) From 93c9e5fdcba5869f9dfdabd95d7e5a2011d8e7d1 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 15 Jan 2025 11:07:25 +0100 Subject: [PATCH 3/7] Changes for config_declaration.yaml and jobs.py as per PR --- ckanext/xloader/config_declaration.yaml | 2 +- ckanext/xloader/jobs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index 4d1b6d50..948a4050 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -4,7 +4,7 @@ groups: options: - key: ckanext.xloader.site_url example: http://ckan-dev:5000 - default: http://ckan-dev:5000 + default: description: | Provide an alternate site URL for the xloader_submit action. This is useful, for example, when the site is running within a docker network. diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 3d77e16c..f0e4776e 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -12,7 +12,7 @@ import sys from psycopg2 import errors -from six.moves.urllib.parse import urlsplit, urlparse, urlunparse +from six.moves.urllib.parse import urlsplit import requests from rq import get_current_job import sqlalchemy as sa From deab5cbd9fbfb84252ed625134fcf00f33c7ac9d Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 16 Jan 2025 11:18:26 +0100 Subject: [PATCH 4/7] Update jobs.py missed a callback_xloader_hook call, need to update result_url here too --- ckanext/xloader/jobs.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index f0e4776e..b93c5599 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -212,7 +212,11 @@ def direct_load(): set_datastore_active(data, resource, logger) if 'result_url' in input: job_dict['status'] = 'running_but_viewable' - callback_xloader_hook(result_url=input['result_url'], + callback_url = config.get('ckanext.xloader.site_url') or config.get('ckan.site_url') + callback_url = urljoin( + callback_url.rstrip('/'), '/api/3/action/xloader_hook') + result_url = callback_url + callback_xloader_hook(result_url=result_url, api_key=api_key, job_dict=job_dict) logger.info('Data now available to users: %s', resource_ckan_url) From 729d5b3d0ed04ce3e203e6e620d2b31b234f8e8f Mon Sep 17 00:00:00 2001 From: Brett Date: Sun, 19 Jan 2025 15:21:53 +0100 Subject: [PATCH 5/7] Updates to jobs.py and utils.py hopefully getting closer --- ckanext/xloader/jobs.py | 19 ++++++++----------- ckanext/xloader/utils.py | 22 ++++++++++++++++++++++ 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index b93c5599..baad2028 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -24,7 +24,7 @@ from . import db, loader from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError -from .utils import datastore_resource_exists, set_resource_metadata +from .utils import datastore_resource_exists, set_resource_metadata, get_ckan_url try: from ckan.lib.api_token import get_user_from_token @@ -82,14 +82,13 @@ def xloader_data_into_datastore(input): # stillborn, for when xloader_submit is deciding whether another job would # be a duplicate or not - callback_url = config.get('ckanext.xloader.site_url') or config.get('ckan.site_url') + callback_url = get_ckan_url() callback_url = urljoin( callback_url.rstrip('/'), '/api/3/action/xloader_hook') - result_url = callback_url job_dict = dict(metadata=input['metadata'], status='running') - callback_xloader_hook(result_url=result_url, + callback_xloader_hook(result_url=callback_url, api_key=input['api_key'], job_dict=job_dict) @@ -151,7 +150,7 @@ def xloader_data_into_datastore(input): errored = True finally: # job_dict is defined in xloader_hook's docstring - is_saved_ok = callback_xloader_hook(result_url=result_url, + is_saved_ok = callback_xloader_hook(result_url=callback_url, api_key=input['api_key'], job_dict=job_dict) errored = errored or not is_saved_ok @@ -212,11 +211,10 @@ def direct_load(): set_datastore_active(data, resource, logger) if 'result_url' in input: job_dict['status'] = 'running_but_viewable' - callback_url = config.get('ckanext.xloader.site_url') or config.get('ckan.site_url') + callback_url = get_ckan_url() callback_url = urljoin( callback_url.rstrip('/'), '/api/3/action/xloader_hook') - result_url = callback_url - callback_xloader_hook(result_url=result_url, + callback_xloader_hook(result_url=callback_url, api_key=api_key, job_dict=job_dict) logger.info('Data now available to users: %s', resource_ckan_url) @@ -298,11 +296,10 @@ def _download_resource_data(resource, data, api_key, logger): ) resource_uri = urlunsplit(('', '', url_parts.path, url_parts.query, url_parts.fragment)) - callback_url = config.get('ckanext.xloader.site_url') or config.get('ckan.site_url') - callback_url = urljoin( + callback_url = get_ckan_url() + url = urljoin( callback_url.rstrip('/'), resource_uri) - url = callback_url url_parts = urlsplit(url) # reparse the url after the callback_url is set # fetch the resource data diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index db8ff06f..1dd7ae0b 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -107,6 +107,28 @@ def get_xloader_user_apitoken(): return site_user["apikey"] +def get_ckan_url(): + """ Returns the CKAN URL. + + ckan may be behind a proxy, or more likely, within a docker network. + This method returns the URL set in the config file for the CKAN instance. + Containers within the same network ie: XLoader will be able to communicate with CKAN using this URL. + """ + ckan_url = config.get('ckanext.xloader.site_url', None) + if ckan_url: + return ckan_url + + # Fall back to mandatory ckan.site_url + ckan_url = config.get('ckan.site_url') + if not ckan_url: + raise ValueError( + "The ckan.site_url configuration option is required but not set. " + "Please set this value in your CKAN configuration file." + ) + + return ckan_url + + def set_resource_metadata(update_dict): ''' Set appropriate datastore_active flag on CKAN resource. From 03a4e67a78917e81c0e2dc1a7027037fc519244d Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 28 Jan 2025 13:00:15 +0100 Subject: [PATCH 6/7] Updates to jobs.py, utils.py and tests in test_jobs.py --- ckanext/xloader/jobs.py | 32 +++++--------- ckanext/xloader/tests/test_jobs.py | 60 +++++++++++++++++++++++++ ckanext/xloader/utils.py | 70 ++++++++++++++++++++++-------- 3 files changed, 124 insertions(+), 38 deletions(-) diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index baad2028..0e3d0dfd 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -24,7 +24,7 @@ from . import db, loader from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError -from .utils import datastore_resource_exists, set_resource_metadata, get_ckan_url +from .utils import datastore_resource_exists, set_resource_metadata, modify_resource_url, modify_ckan_url try: from ckan.lib.api_token import get_user_from_token @@ -82,13 +82,10 @@ def xloader_data_into_datastore(input): # stillborn, for when xloader_submit is deciding whether another job would # be a duplicate or not - callback_url = get_ckan_url() - callback_url = urljoin( - callback_url.rstrip('/'), '/api/3/action/xloader_hook') - job_dict = dict(metadata=input['metadata'], status='running') - callback_xloader_hook(result_url=callback_url, + + callback_xloader_hook(result_url=input['result_url'], api_key=input['api_key'], job_dict=job_dict) @@ -150,7 +147,7 @@ def xloader_data_into_datastore(input): errored = True finally: # job_dict is defined in xloader_hook's docstring - is_saved_ok = callback_xloader_hook(result_url=callback_url, + is_saved_ok = callback_xloader_hook(result_url=input['result_url'], api_key=input['api_key'], job_dict=job_dict) errored = errored or not is_saved_ok @@ -211,10 +208,7 @@ def direct_load(): set_datastore_active(data, resource, logger) if 'result_url' in input: job_dict['status'] = 'running_but_viewable' - callback_url = get_ckan_url() - callback_url = urljoin( - callback_url.rstrip('/'), '/api/3/action/xloader_hook') - callback_xloader_hook(result_url=callback_url, + callback_xloader_hook(result_url=input['result_url'], api_key=api_key, job_dict=job_dict) logger.info('Data now available to users: %s', resource_ckan_url) @@ -286,22 +280,17 @@ def _download_resource_data(resource, data, api_key, logger): data['datastore_contains_all_records_of_source_file'] = False which will be saved to the resource later on. ''' - # check scheme + url = resource.get('url') - url_parts = urlsplit(url) + url = modify_resource_url(url) + # check scheme + url_parts = urlsplit(url) scheme = url_parts.scheme if scheme not in ('http', 'https', 'ftp'): raise JobError( 'Only http, https, and ftp resources may be fetched.' ) - resource_uri = urlunsplit(('', '', url_parts.path, url_parts.query, url_parts.fragment)) - callback_url = get_ckan_url() - url = urljoin( - callback_url.rstrip('/'), resource_uri) - - url_parts = urlsplit(url) # reparse the url after the callback_url is set - # fetch the resource data logger.info('Fetching from: {0}'.format(url)) tmp_file = get_tmp_file(url) @@ -455,7 +444,8 @@ def callback_xloader_hook(result_url, api_key, job_dict): else: header, key = 'Authorization', api_key headers[header] = key - + + result_url = modify_ckan_url(result_url, job_dict['metadata']['ckan_url']) try: result = requests.post( result_url, diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py index 498eaa56..ab4cae87 100644 --- a/ckanext/xloader/tests/test_jobs.py +++ b/ckanext/xloader/tests/test_jobs.py @@ -90,6 +90,66 @@ def test_xloader_data_into_datastore(self, cli, data): resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) assert resource["datastore_contains_all_records_of_source_file"] + def test_download_resource_data_with_ckanext_xloader_site_url(self, cli, data): + # Set the ckanext.xloader.site_url in the config + with mock.patch.dict(toolkit.config, {'ckanext.xloader.site_url': 'http://xloader-site-url'}): + data['metadata']['original_url'] = 'http://xloader-site-url/resource.csv' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout + + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + + def test_download_resource_data_with_ckan_site_url(self, cli, data): + # Set the ckan.site_url in the config + with mock.patch.dict(toolkit.config, {'ckan.site_url': 'http://ckan-site-url'}): + data['metadata']['original_url'] = 'http://ckan-site-url/resource.csv' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout + + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + + def test_download_resource_data_with_different_original_url(self, cli, data): + # Set the ckan.site_url in the config + with mock.patch.dict(toolkit.config, {'ckan.site_url': 'http://ckan-site-url'}): + data['metadata']['original_url'] = 'http://external-site-url/resource.csv' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout + + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + + def test_callback_xloader_hook_with_ckanext_xloader_site_url(self, cli, data): + # Set the ckanext.xloader.site_url in the config + with mock.patch.dict(toolkit.config, {'ckanext.xloader.site_url': 'http://xloader-site-url'}): + data['result_url'] = 'http://xloader-site-url/api/3/action/xloader_hook' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout + + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + + def test_callback_xloader_hook_with_ckan_site_url(self, cli, data): + # Set the ckan.site_url in the config + with mock.patch.dict(toolkit.config, {'ckan.site_url': 'http://ckan-site-url'}): + data['result_url'] = 'http://ckan-site-url/api/3/action/xloader_hook' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout + + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + def test_xloader_ignore_hash(self, cli, data): self.enqueue(jobs.xloader_data_into_datastore, [data]) with mock.patch("ckanext.xloader.jobs.get_response", get_response): diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index 1dd7ae0b..f9eb34a1 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -2,6 +2,7 @@ import json import datetime +import re from six import text_type as str, binary_type @@ -13,6 +14,8 @@ import ckan.plugins as p from ckan.plugins.toolkit import config +from urllib.parse import urljoin, urlunsplit, urlparse + # resource.formats accepted by ckanext-xloader. Must be lowercase here. DEFAULT_FORMATS = [ "csv", @@ -107,26 +110,59 @@ def get_xloader_user_apitoken(): return site_user["apikey"] -def get_ckan_url(): - """ Returns the CKAN URL. +def modify_ckan_url(result_url: str, ckan_url: str) -> str: + """ Modifies a URL based on CKAN site URL comparison. - ckan may be behind a proxy, or more likely, within a docker network. - This method returns the URL set in the config file for the CKAN instance. - Containers within the same network ie: XLoader will be able to communicate with CKAN using this URL. + This function compares the base URL of a given result URL against a CKAN site URL. + If they differ, the result URL is modified to use the CKAN site URL as its base + while preserving the original path. + + Args: + result_url (str): The original URL to potentially modify + ckan_url (str): The base CKAN site URL to compare against + Returns: + str: The modified URL if base URLs differ, otherwise returns original URL unchanged """ - ckan_url = config.get('ckanext.xloader.site_url', None) - if ckan_url: - return ckan_url - - # Fall back to mandatory ckan.site_url - ckan_url = config.get('ckan.site_url') - if not ckan_url: - raise ValueError( - "The ckan.site_url configuration option is required but not set. " - "Please set this value in your CKAN configuration file." - ) + parsed_url = urlparse(result_url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + path_url = parsed_url.path + + if base_url != ckan_url: + result_url = urljoin(ckan_url, path_url) + + return result_url + + +def modify_resource_url(orig_ckan_url: str) -> str: + """Returns a potentially modified CKAN URL. + + This function takes a CKAN URL and potentially modifies its base URL while preserving the path, + query parameters, and fragments. The modification occurs only if two conditions are met: + 1. The base URL of the input matches the configured CKAN site URL + 2. An xloader_site_url is configured in the settings - return ckan_url + Args: + orig_ckan_url (str): The original CKAN URL to potentially modify + Returns: + str: Either the modified URL with new base URL from xloader_site_url, + or the original URL if conditions aren't met + """ + xloader_site_url = config.get('ckanext.xloader.site_url') + ckan_site_url = config.get('ckan.site_url') + + parsed_url = urlparse(orig_ckan_url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + + # If the base URL matches the CKAN site URL and xloader_site_url is set, modify the URL + if base_url == ckan_site_url and xloader_site_url: + modified_ckan_url = urljoin(xloader_site_url, parsed_url.path) + if parsed_url.query: + modified_ckan_url += f"?{parsed_url.query}" + if parsed_url.fragment: + modified_ckan_url += f"#{parsed_url.fragment}" + return modified_ckan_url + + return orig_ckan_url def set_resource_metadata(update_dict): From 4b64d4e86cbf29f532d3ccdcdd85a1de059d2e32 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 29 Jan 2025 15:48:31 +0100 Subject: [PATCH 7/7] updates to action.py and utils.py --- ckanext/xloader/action.py | 2 +- ckanext/xloader/utils.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ckanext/xloader/action.py b/ckanext/xloader/action.py index f159d2f3..22529559 100644 --- a/ckanext/xloader/action.py +++ b/ckanext/xloader/action.py @@ -146,7 +146,7 @@ def xloader_submit(context, data_dict): "metadata": { "ignore_hash": data_dict.get("ignore_hash", False), "ckan_url": config.get("ckanext.xloader.site_url") - or config["ckan.site_url"], + or config["ckan.site_url"], "resource_id": res_id, "set_url_type": data_dict.get("set_url_type", False), "task_created": task["last_updated"], diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index f9eb34a1..e7978682 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -125,9 +125,8 @@ def modify_ckan_url(result_url: str, ckan_url: str) -> str: """ parsed_url = urlparse(result_url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - path_url = parsed_url.path - if base_url != ckan_url: + path_url = parsed_url.path result_url = urljoin(ckan_url, path_url) return result_url