From b5980e4a1c726c2e6d98f968424d0be7b1fc3cb0 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 20 Nov 2024 12:35:40 +0100 Subject: [PATCH 01/41] Update README, migrate it to Markdown Reformatted the README as Markdown to make it easier to update. Did a full review and updated broken links and added config options to increase visibility (generated with `ckan config docs --enabled -f md`) --- MANIFEST.in | 2 +- README.md | 559 ++++++++++++++++++++++++++++++++++++++++++++++++++++ README.rst | 380 ----------------------------------- setup.py | 4 +- 4 files changed, 562 insertions(+), 383 deletions(-) create mode 100644 README.md delete mode 100644 README.rst diff --git a/MANIFEST.in b/MANIFEST.in index 4aeb22fa..852a43d9 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,6 @@ include *requirements*.txt include CHANGELOG include LICENSE -include README.rst +include README.md include ckanext/xloader/config_declaration.yaml recursive-include ckanext/xloader/templates *.html diff --git a/README.md b/README.md new file mode 100644 index 00000000..e961d7af --- /dev/null +++ b/README.md @@ -0,0 +1,559 @@ +# XLoader - ckanext-xloader + +[![Tests](https://github.com/ckan/ckanext-xloader/workflows/Tests/badge.svg?branch=master)](https://github.com/ckan/ckanext-xloader/actions) + +[![Latest Version](https://img.shields.io/pypi/v/ckanext-xloader.svg)](https://pypi.org/project/ckanext-xloader/) + +[![Supported Python versions](https://img.shields.io/pypi/pyversions/ckanext-xloader.svg)](https://pypi.org/project/ckanext-xloader/) + +[![Development Status](https://img.shields.io/pypi/status/ckanext-xloader.svg)](https://pypi.org/project/ckanext-xloader/) + +[![License](https://img.shields.io/pypi/l/ckanext-xloader.svg)](https://pypi.org/project/ckanext-xloader/) + + +Loads CSV (and similar) data into CKAN's DataStore. Designed as a +replacement for DataPusher because it offers ten times the speed and +more robustness (hence the name, derived from "Express Loader") + +**OpenGov Inc.** has sponsored this development, with the aim of +benefitting open data infrastructure worldwide. +* [Key differences from DataPusher](#key-differences-from-datapusher) +* [Requirements](#requirements) +* [Installation](#installation) +* [Config settings](#config-settings) +* [Developer installation](#developer-installation) +* [Upgrading from DataPusher](#upgrading-from-datapusher) +* [Command-line interface](#command-line-interface) + * [Jobs and workers](#jobs-and-workers) +* [Troubleshooting](#troubleshooting) +* [Running the Tests](#running-the-tests) +* [Releasing a New Version of XLoader](#releasing-a-new-version-of-xloader) + +## Key differences from DataPusher + +### Speed of loading + +DataPusher - parses CSV rows, converts to detected column types, +converts the data to a JSON string, calls datastore_create for each +batch of rows, which reformats the data into an INSERT statement string, +which is passed to PostgreSQL. + +XLoader - pipes the CSV file directly into PostgreSQL using COPY. + +In [tests](https://github.com/ckan/ckanext-xloader/issues/25), XLoader +is over ten times faster than DataPusher. + +### Robustness + +DataPusher - one cause of failure was when casting cells to a guessed +type. The type of a column was decided by looking at the values of only +the first few rows. So if a column is mainly numeric or dates, but a +string (like "N/A") comes later on, then this will cause the load to +error at that point, leaving it half-loaded into DataStore. + +XLoader - loads all the cells as text, before allowing the admin to +convert columns to the types they want (using the Data Dictionary +feature). In future it could do automatic detection and conversion. + +### Simpler queueing tech + +DataPusher - job queue is done by ckan-service-provider which is +bespoke, complicated and stores jobs in its own database (sqlite by +default). + +XLoader - job queue is done by RQ, which is simpler, is backed by Redis, +allows access to the CKAN model and is CKAN's default queue technology. +You can also debug jobs easily using pdb. Job results are stored in +Sqlite by default, and for production simply specify CKAN's database in +the config and it's held there - easy. + +(The other obvious candidate is Celery, but we don't need its +heavyweight architecture and its jobs are not debuggable with pdb.) + +### Separate web server + +DataPusher - has the complication that the queue jobs are done by a +separate (Flask) web app, apart from CKAN. This was the design because +the job requires intensive processing to convert every line of the data +into JSON. However it means more complicated code as info needs to be +passed between the services in http requests, more for the user to +set-up and manage - another app config, another apache config, separate +log files. + +XLoader - the job runs in a worker process, in the same app as CKAN, so +can access the CKAN config, db and logging directly and avoids many HTTP +calls. This simplification makes sense because the xloader job doesn't +need to do much processing - mainly it is streaming the CSV file from +disk into PostgreSQL. + +It is still entirely possible to run the XLoader worker on a separate +server, if that is desired. The worker needs the following: + +- A copy of CKAN installed in the same Python virtualenv (but not + running). +- A copy of the CKAN config file. +- Access to the Redis instance that the running CKAN app uses to store + jobs. +- Access to the database. + +You can then run it via ckan jobs worker as below. + +### Caveat - column types + +Note: With XLoader, all columns are stored in DataStore's database as +'text' type (whereas DataPusher did some rudimentary type guessing - +see 'Robustness' above). However once a resource is xloaded, an admin +can use the resource's Data Dictionary tab to change these types to +numeric or datestamp and re-load the file. When migrating from +DataPusher to XLoader you can preserve the types of existing resources +by using the `migrate_types` command. + +There is scope to add functionality for automatically guessing column +type -offers to contribute this are welcomed. + +## Requirements + +Compatibility with core CKAN versions: + + | CKAN version | Compatibility | + | -------------- | -------------------------------------------------------------------------------------------------------------- | + | 2.7 | no longer supported (last supported version: 0.12.2) | + | 2.8 | no longer supported (last supported version: 0.12.2) | + | 2.9 | yes (Python3) (last supported version for Python 2.7: 0.12.2)), Must: `pip install "setuptools>=44.1.0,<71"` | + | 2.10 | yes | + | 2.11 | yes | + +## Installation + +To install XLoader: + +1. Activate your CKAN virtual environment, for example: + + . /usr/lib/ckan/default/bin/activate + +2. Install the ckanext-xloader Python package into your virtual + environment: + + pip install ckanext-xloader + +3. Install dependencies: + + pip install -r https://raw.githubusercontent.com/ckan/ckanext-xloader/master/requirements.txt + pip install -U requests[security] + +4. Add `xloader` to the `ckan.plugins` setting in your CKAN config file + (by default the config file is located at + `/etc/ckan/default/production.ini`). + + You should also remove `datapusher` if it is in the list, to avoid + them both trying to load resources into the DataStore. + + Ensure `datastore` is also listed, to enable CKAN DataStore. + +5. Starting CKAN 2.10 you will need to set an API Token to be able to + execute jobs against the server: + + ckanext.xloader.api_token = + +6. If it is a production server, you'll want to store jobs info in a + more robust database than the default sqlite file. It can happily + use the main CKAN postgres db by adding this line to the config, but + with the same value as you have for `sqlalchemy.url`: + + ckanext.xloader.jobs_db.uri = postgresql://ckan_default:pass@localhost/ckan_default + + (This step can be skipped when just developing or testing.) + +7. Restart CKAN. For example if you've deployed CKAN with Apache on + Ubuntu: + + sudo service apache2 reload + +8. Run the worker: + + ckan -c /etc/ckan/default/ckan.ini jobs worker + +## Config settings + +Configuration: + + +This plugin supports the [`ckan.download_proxy`](https://docs.ckan.org/en/latest/maintaining/configuration.html#ckan-download-proxy) setting, +to use a proxy server when downloading files. This setting is shared +with other plugins that download resource files, such as +ckanext-archiver. Eg: + + ckan.download_proxy = + +You may also wish to configure the database to use your preferred date +input style on COPY. For example, to make +[PostgreSQL]() +expect European (day-first) dates, you could add to `postgresql.conf`: + + datestyle=ISO,DMY + +All configurations below are defined in the +[config_declaration.yaml](ckanext/xloader/config_declaration.yaml) file. + + +#### ckanext.xloader.jobs_db.uri + +Default value: `sqlite:////tmp/xloader_jobs.db` + +The connection string for the jobs database used by XLoader. The +default of an sqlite file is fine for development. For production use a +Postgresql database. + + +#### ckanext.xloader.api_token + +Example: + +``` +ckanext.xloader.api_token = eyJ0eXAiOiJKV1QiLCJh.eyJqdGkiOiJ0M2VNUFlQWFg0VU.8QgV8em4RA +``` + +Default value: none + +Uses a specific API token for the xloader_submit action instead of the +apikey of the site_user. It's mandatory starting from CKAN 2.10. You can get one +running the command `ckan user token add {USER_NAME} xloader -q` + + +#### ckanext.xloader.formats + +Example: + +``` +ckanext.xloader.formats = csv application/csv xls application/vnd.ms-excel +``` + +Default value: none + +The formats that are accepted. If the value of the resource.format is +anything else then it won't be 'xloadered' to DataStore (and will therefore +only be available to users in the form of the original download/link). +Case insensitive. Defaults are listed in utils.py. + + +#### ckanext.xloader.max_content_length + +Example: + +``` +ckanext.xloader.max_content_length = 100000 +``` + +Default value: `1000000000` + +The maximum file size that XLoader will attempt to load. + + +#### ckanext.xloader.use_type_guessing + +Default value: `False` + +By default, xloader will first try to add tabular data to the DataStore +with a direct PostgreSQL COPY. This is relatively fast, but does not +guess column types. If this fails, xloader falls back to a method more +like DataPusher's behaviour. This has the advantage that the column types +are guessed. However it is more error prone and far slower. +To always skip the direct PostgreSQL COPY and use type guessing, set +this option to True. + + +#### ckanext.xloader.strict_type_guessing + +Default value: `True` + +Use with ckanext.xloader.use_type_guessing to set strict true or false +for type guessing. If set to False, the types will always fallback to string type. + +Strict means that a type will not be guessed if parsing fails for a single cell in the column. + + +#### ckanext.xloader.max_type_guessing_length + +Example: + +``` +ckanext.xloader.max_type_guessing_length = 100000 +``` + +Default value: `0` + +The maximum file size that will be passed to Tabulator if the +use_type_guessing flag is enabled. Larger files will use COPY even if +the flag is set. Defaults to 1/10 of the maximum content length. + + +#### ckanext.xloader.parse_dates_dayfirst + +Default value: `False` + +Whether ambiguous dates should be parsed day first. Defaults to False. +If set to True, dates like '01.02.2022' will be parsed as day = 01, +month = 02. +NB: isoformat dates like '2022-01-02' will be parsed as YYYY-MM-DD, and +this option will not override that. +See [dateutil docs](https://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.parse) +for more details. + + +#### ckanext.xloader.parse_dates_yearfirst + +Default value: `False` + +Whether ambiguous dates should be parsed year first. Defaults to False. +If set to True, dates like '01.02.03' will be parsed as year = 2001, +month = 02, day = 03. See [dateutil docs](https://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.parse) +for more details. + + +#### ckanext.xloader.job_timeout + +Example: + +``` +ckanext.xloader.job_timeout = 3600 +``` + +Default value: `3600` + +The maximum time for the loading of a resource before it is aborted. +Give an amount in seconds. Default is 60 minutes + + +#### ckanext.xloader.ignore_hash + +Default value: `False` + +Ignore the file hash when submitting to the DataStore, if set to True +resources are always submitted (if their format matches), if set to +False (default), resources are only submitted if their hash has changed. + + +#### ckanext.xloader.max_excerpt_lines + +Example: + +``` +ckanext.xloader.max_excerpt_lines = 100 +``` + +Default value: `0` + +When loading a file that is bigger than `max_content_length`, xloader can +still try and load some of the file, which is useful to display a +preview. Set this option to the desired number of lines/rows that it +loads in this case. +If the file-type is supported (CSV, TSV) an excerpt with the number of +`max_excerpt_lines` lines will be submitted while the `max_content_length` +is not exceeded. +If set to 0 (default) files that exceed the `max_content_length` will +not be loaded into the datastore. + + +#### ckanext.xloader.ssl_verify + +Example: + +``` +ckanext.xloader.ssl_verify = True +``` + +Default value: `True` + +Requests verifies SSL certificates for HTTPS requests. Setting verify to +False should only be enabled during local development or testing. Default +to True. + + +#### ckanext.xloader.clean_datastore_tables + +Example: + +``` +ckanext.xloader.clean_datastore_tables = True +``` + +Default value: `False` + +Enqueue jobs to remove Datastore tables from Resources that have a format +that is not in ckanext.xloader.formats after a Resource is updated. + + +#### ckanext.xloader.show_badges + +Default value: `True` + +Controls whether or not the status badges display in the front end. + + +#### ckanext.xloader.debug_badges + +Example: + +``` +ckanext.xloader.debug_badges = True +``` + +Default value: `False` + +Controls whether or not the status badges display all of the statuses. By default, +the badges will display "pending", "running", and "error". With debug_badges enabled, +they will also display "complete", "active", "inactive", and "unknown". + +## Developer installation + +To install XLoader for development, activate your CKAN virtualenv and in +the directory up from your local ckan repo: + + git clone https://github.com/ckan/ckanext-xloader.git + cd ckanext-xloader + pip install -e . + pip install -r requirements.txt + pip install -r dev-requirements.txt + +## Upgrading from DataPusher + +To upgrade from DataPusher to XLoader: + +1. Install XLoader as above, including running the xloader worker. + +2. (Optional) For existing datasets that have been datapushed to + datastore, freeze the column types (in the data dictionaries), so + that XLoader doesn't change them back to string on next xload: + + ckan -c /etc/ckan/default/ckan.ini migrate_types + +3. If you've not already, change the enabled plugin in your config - + on the `ckan.plugins` line replace `datapusher` with `xloader`. + +4. (Optional) If you wish, you can disable the direct loading and + continue to just use tabulator - for more about this see the docs on + config option: `ckanext.xloader.use_type_guessing` + +5. Stop the datapusher worker: + + sudo a2dissite datapusher + +6. Restart CKAN: + + sudo service apache2 reload + sudo service nginx reload + +## Command-line interface + +You can submit single or multiple resources to be xloaded using the +command-line interface. + +e.g. : + + ckan -c /etc/ckan/default/ckan.ini xloader submit + +For debugging you can try xloading it synchronously (which does the load +directly, rather than asking the worker to do it) with the `-s` option: + + ckan -c /etc/ckan/default/ckan.ini xloader submit -s + +See the status of jobs: + + ckan -c /etc/ckan/default/ckan.ini xloader status + +Submit all datasets' resources to the DataStore: + + ckan -c /etc/ckan/default/ckan.ini xloader submit all + +Re-submit all the resources already in the DataStore (Ignores any +resources that have not been stored in DataStore e.g. because they are +not tabular): + + ckan -c /etc/ckan/default/ckan.ini xloader submit all-existing + +**Full list of XLoader CLI commands**: + + ckan -c /etc/ckan/default/ckan.ini xloader --help + +### Jobs and workers + +Main docs for managing jobs: + +https://docs.ckan.org/en/latest/maintaining/background-tasks.html#managing-background-jobs + +Main docs for running and managing workers are here: + +https://docs.ckan.org/en/latest/maintaining/background-tasks.html#running-background-jobs + +Useful commands: + +Clear (delete) all outstanding jobs: + + ckan -c /etc/ckan/default/ckan.ini jobs clear [QUEUES] + +If having trouble with the worker process, restarting it can help: + + sudo supervisorctl restart ckan-worker:* + +## Troubleshooting + +**KeyError: "Action 'datastore_search' not found"** + +You need to enable the [datastore]{.title-ref} plugin in your CKAN +config. See 'Installation' section above to do this and restart the +worker. + +**ProgrammingError: (ProgrammingError) relation "\_table_metadata" +does not exist** + +Your DataStore permissions have not been set-up - see: +https://docs.ckan.org/en/latest/maintaining/datastore.html#set-permissions + +## Running the Tests + +The first time, your test datastore database needs the trigger applied: + + sudo -u postgres psql datastore_test -f full_text_function.sql + +To run the tests, do: + + pytest ckan-ini=test.ini ckanext/xloader/tests + +## Releasing a New Version of XLoader + +XLoader is available on PyPI as +. + +To publish a new version to PyPI follow these steps: + +1. Update the version number in the `setup.py` file. See [PEP + 440](http://legacy.python.org/dev/peps/pep-0440/#public-version-identifiers) + for how to choose version numbers. + +2. Update the CHANGELOG. + +3. Make sure you have the latest version of necessary packages: + + pip install --upgrade setuptools wheel twine + +4. Create source and binary distributions of the new version: + + python setup.py sdist bdist_wheel && twine check dist/* + + Fix any errors you get. + +5. Upload the source distribution to PyPI: + + twine upload dist/* + +6. Commit any outstanding changes: + + git commit -a + git push + +7. Tag the new release of the project on GitHub with the version number + from the `setup.py` file. For example if the version number in + `setup.py` is 0.0.1 then do: + + git tag 0.0.1 + git push --tags diff --git a/README.rst b/README.rst deleted file mode 100644 index 4ec2e11b..00000000 --- a/README.rst +++ /dev/null @@ -1,380 +0,0 @@ -.. You should enable this project on travis-ci.org and coveralls.io to make - these badges work. The necessary Travis and Coverage config files have been - generated for you. - -.. image:: https://travis-ci.org/ckan/ckanext-xloader.svg?branch=master - :target: https://travis-ci.org/ckan/ckanext-xloader - -.. image:: https://img.shields.io/pypi/v/ckanext-xloader.svg - :target: https://pypi.org/project/ckanext-xloader/ - :alt: Latest Version - -.. image:: https://img.shields.io/pypi/pyversions/ckanext-xloader.svg - :target: https://pypi.org/project/ckanext-xloader/ - :alt: Supported Python versions - -.. image:: https://img.shields.io/pypi/status/ckanext-xloader.svg - :target: https://pypi.org/project/ckanext-xloader/ - :alt: Development Status - -.. image:: https://img.shields.io/pypi/l/ckanext-xloader.svg - :target: https://pypi.org/project/ckanext-xloader/ - :alt: License - -========================= -XLoader - ckanext-xloader -========================= - -Loads CSV (and similar) data into CKAN's DataStore. Designed as a replacement -for DataPusher because it offers ten times the speed and more robustness -(hence the name, derived from "Express Loader") - -**OpenGov Inc.** has sponsored this development, with the aim of benefitting -open data infrastructure worldwide. - -------------------------------- -Key differences from DataPusher -------------------------------- - -Speed of loading ----------------- - -DataPusher - parses CSV rows, converts to detected column types, converts the -data to a JSON string, calls datastore_create for each batch of rows, which -reformats the data into an INSERT statement string, which is passed to -PostgreSQL. - -XLoader - pipes the CSV file directly into PostgreSQL using COPY. - -In `tests `_, XLoader -is over ten times faster than DataPusher. - -Robustness ----------- - -DataPusher - one cause of failure was when casting cells to a guessed type. The -type of a column was decided by looking at the values of only the first few -rows. So if a column is mainly numeric or dates, but a string (like "N/A") -comes later on, then this will cause the load to error at that point, leaving -it half-loaded into DataStore. - -XLoader - loads all the cells as text, before allowing the admin to -convert columns to the types they want (using the Data Dictionary feature). In -future it could do automatic detection and conversion. - -Simpler queueing tech ---------------------- - -DataPusher - job queue is done by ckan-service-provider which is bespoke, -complicated and stores jobs in its own database (sqlite by default). - -XLoader - job queue is done by RQ, which is simpler, is backed by Redis, allows -access to the CKAN model and is CKAN's default queue technology. -You can also debug jobs easily using pdb. Job results are stored in -Sqlite by default, and for production simply specify CKAN's database in the -config and it's held there - easy. - -(The other obvious candidate is Celery, but we don't need its heavyweight -architecture and its jobs are not debuggable with pdb.) - -Separate web server -------------------- - -DataPusher - has the complication that the queue jobs are done by a separate -(Flask) web app, apart from CKAN. This was the design because the job requires -intensive processing to convert every line of the data into JSON. However it -means more complicated code as info needs to be passed between the services in -http requests, more for the user to set-up and manage - another app config, -another apache config, separate log files. - -XLoader - the job runs in a worker process, in the same app as CKAN, so -can access the CKAN config, db and logging directly and avoids many HTTP calls. -This simplification makes sense because the xloader job doesn't need to do much -processing - mainly it is streaming the CSV file from disk into PostgreSQL. - -It is still entirely possible to run the XLoader worker on a separate server, -if that is desired. The worker needs the following: - -- A copy of CKAN installed in the same Python virtualenv (but not running). -- A copy of the CKAN config file. -- Access to the Redis instance that the running CKAN app uses to store jobs. -- Access to the database. - -You can then run it via `ckan jobs worker` as below. - -Caveat - column types ---------------------- - -Note: With XLoader, all columns are stored in DataStore's database as 'text' -type (whereas DataPusher did some rudimentary type guessing - see 'Robustness' -above). However once a resource is xloaded, an admin can use the resource's -Data Dictionary tab to change these types to numeric or -datestamp and re-load the file. When migrating from DataPusher to XLoader you -can preserve the types of existing resources by using the ``migrate_types`` -command. - -There is scope to add functionality for automatically guessing column type - -offers to contribute this are welcomed. - - ------------- -Requirements ------------- - -Compatibility with core CKAN versions: - -=============== ============= -CKAN version Compatibility -=============== ============= -2.7 no longer supported (last supported version: 0.12.2) -2.8 no longer supported (last supported version: 0.12.2) -2.9 yes (Python3) (last supported version for Python 2.7: 0.12.2)), Must: ``pip install "setuptools>=44.1.0,<71"`` -2.10 yes -2.11 yes -=============== ============= - ------------- -Installation ------------- - -To install XLoader: - -1. Activate your CKAN virtual environment, for example:: - - . /usr/lib/ckan/default/bin/activate - -2. Install the ckanext-xloader Python package into your virtual environment:: - - pip install ckanext-xloader - -3. Install dependencies:: - - pip install -r https://raw.githubusercontent.com/ckan/ckanext-xloader/master/requirements.txt - pip install -U requests[security] - -4. Add ``xloader`` to the ``ckan.plugins`` setting in your CKAN - config file (by default the config file is located at - ``/etc/ckan/default/production.ini``). - - You should also remove ``datapusher`` if it is in the list, to avoid them - both trying to load resources into the DataStore. - - Ensure ``datastore`` is also listed, to enable CKAN DataStore. - -5. Starting CKAN 2.10 you will need to set an API Token to be able to - execute jobs against the server:: - - ckanext.xloader.api_token = - -6. If it is a production server, you'll want to store jobs info in a more - robust database than the default sqlite file. It can happily use the main - CKAN postgres db by adding this line to the config, but with the same value - as you have for ``sqlalchemy.url``:: - - ckanext.xloader.jobs_db.uri = postgresql://ckan_default:pass@localhost/ckan_default - - (This step can be skipped when just developing or testing.) - -7. Restart CKAN. For example if you've deployed CKAN with Apache on Ubuntu:: - - sudo service apache2 reload - -8. Run the worker:: - - ckan -c /etc/ckan/default/ckan.ini jobs worker - - ---------------- -Config settings ---------------- - -Configuration: - -See the extension's `config_declaration.yaml `_ file. - -This plugin also supports the `ckan.download_proxy` setting, to use a proxy server when downloading files. -This setting is shared with other plugins that download resource files, such as ckanext-archiver. Eg: - - ckan.download_proxy = http://my-proxy:1234/ - -You may also wish to configure the database to use your preferred date input style on COPY. -For example, to make [PostgreSQL](https://www.postgresql.org/docs/current/runtime-config-client.html#RUNTIME-CONFIG-CLIENT-FORMAT) -expect European (day-first) dates, you could add to ``postgresql.conf``: - - datestyle=ISO,DMY - -External Database credentials for datastore - - ``ckanext.xloader.jobs_db.uri = postgresql://ckan_default:pass@localhost/ckan_default`` - -API Key requires for xloader interaction CKAN 2.10 onwards, to generate ``TOKEN=ckan -c /etc/ckan/default/production.ini user token add $ACCOUNT xloader | tail -1 | tr -d '[:space:]')`` - - ``ckanext.xloader.api_token = `` - -Badge notification on what xloader is doing - - ``ckanext.xloader.show_badges = True|False (default True)`` - - ``ckanext.xloader.debug_badges = True|False (default False)`` - ------------------------- -Developer installation ------------------------- - -To install XLoader for development, activate your CKAN virtualenv and -in the directory up from your local ckan repo:: - - git clone https://github.com/ckan/ckanext-xloader.git - cd ckanext-xloader - pip install -e . - pip install -r requirements.txt - pip install -r dev-requirements.txt - - -------------------------- -Upgrading from DataPusher -------------------------- - -To upgrade from DataPusher to XLoader: - -1. Install XLoader as above, including running the xloader worker. - -2. (Optional) For existing datasets that have been datapushed to datastore, freeze the column types (in the data dictionaries), so that XLoader doesn't change them back to string on next xload:: - - ckan -c /etc/ckan/default/ckan.ini migrate_types - -3. If you've not already, change the enabled plugin in your config - on the - ``ckan.plugins`` line replace ``datapusher`` with ``xloader``. - -4. (Optional) If you wish, you can disable the direct loading and continue to - just use tabulator - for more about this see the docs on config option: - ``ckanext.xloader.use_type_guessing`` - -5. Stop the datapusher worker:: - - sudo a2dissite datapusher - -6. Restart CKAN:: - - sudo service apache2 reload - sudo service nginx reload - ----------------------- -Command-line interface ----------------------- - -You can submit single or multiple resources to be xloaded using the -command-line interface. - -e.g. :: - - ckan -c /etc/ckan/default/ckan.ini xloader submit - -For debugging you can try xloading it synchronously (which does the load -directly, rather than asking the worker to do it) with the ``-s`` option:: - - ckan -c /etc/ckan/default/ckan.ini xloader submit -s - -See the status of jobs:: - - ckan -c /etc/ckan/default/ckan.ini xloader status - -Submit all datasets' resources to the DataStore:: - - ckan -c /etc/ckan/default/ckan.ini xloader submit all - -Re-submit all the resources already in the DataStore (Ignores any resources -that have not been stored in DataStore e.g. because they are not tabular):: - - ckan -c /etc/ckan/default/ckan.ini xloader submit all-existing - - -**Full list of XLoader CLI commands**:: - - ckan -c /etc/ckan/default/ckan.ini xloader --help - - -Jobs and workers ----------------- - -Main docs for managing jobs: - -Main docs for running and managing workers are here: https://docs.ckan.org/en/latest/maintaining/background-tasks.html#running-background-jobs - -Useful commands: - -Clear (delete) all outstanding jobs:: - - ckan -c /etc/ckan/default/ckan.ini jobs clear [QUEUES] - -If having trouble with the worker process, restarting it can help:: - - sudo supervisorctl restart ckan-worker:* - ---------------- -Troubleshooting ---------------- - -**KeyError: "Action 'datastore_search' not found"** - -You need to enable the `datastore` plugin in your CKAN config. See -'Installation' section above to do this and restart the worker. - -**ProgrammingError: (ProgrammingError) relation "_table_metadata" does not -exist** - -Your DataStore permissions have not been set-up - see: - - ------------------ -Running the Tests ------------------ - -The first time, your test datastore database needs the trigger applied:: - - sudo -u postgres psql datastore_test -f full_text_function.sql - -To run the tests, do:: - - pytest ckan-ini=test.ini ckanext/xloader/tests - - ----------------------------------- -Releasing a New Version of XLoader ----------------------------------- - -XLoader is available on PyPI as https://pypi.org/project/ckanext-xloader. - -To publish a new version to PyPI follow these steps: - -1. Update the version number in the ``setup.py`` file. - See `PEP 440 `_ - for how to choose version numbers. - -2. Update the CHANGELOG. - -3. Make sure you have the latest version of necessary packages:: - - pip install --upgrade setuptools wheel twine - -4. Create source and binary distributions of the new version:: - - python setup.py sdist bdist_wheel && twine check dist/* - - Fix any errors you get. - -5. Upload the source distribution to PyPI:: - - twine upload dist/* - -6. Commit any outstanding changes:: - - git commit -a - git push - -7. Tag the new release of the project on GitHub with the version number from - the ``setup.py`` file. For example if the version number in ``setup.py`` is - 0.0.1 then do:: - - git tag 0.0.1 - git push --tags diff --git a/setup.py b/setup.py index e656c509..73ad8008 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ here = path.abspath(path.dirname(__file__)) # Get the long description from the relevant file -with open(path.join(here, 'README.rst'), encoding='utf-8') as f: +with open(path.join(here, 'README.md'), encoding='utf-8') as f: long_description = f.read() setup( @@ -19,7 +19,7 @@ description='Express Loader - quickly load data into CKAN DataStore''', long_description=long_description, - long_description_content_type='text/x-rst', + long_description_content_type='text/markdown', # The project's main homepage. url='https://github.com/ckan/ckanext-xloader', From 37f2d1398e98cd733fd3b1055c350759d75466f4 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 20 Nov 2024 12:39:21 +0100 Subject: [PATCH 02/41] Whitespace --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index e961d7af..e51caa56 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,9 @@ # XLoader - ckanext-xloader [![Tests](https://github.com/ckan/ckanext-xloader/workflows/Tests/badge.svg?branch=master)](https://github.com/ckan/ckanext-xloader/actions) - [![Latest Version](https://img.shields.io/pypi/v/ckanext-xloader.svg)](https://pypi.org/project/ckanext-xloader/) - [![Supported Python versions](https://img.shields.io/pypi/pyversions/ckanext-xloader.svg)](https://pypi.org/project/ckanext-xloader/) - [![Development Status](https://img.shields.io/pypi/status/ckanext-xloader.svg)](https://pypi.org/project/ckanext-xloader/) - [![License](https://img.shields.io/pypi/l/ckanext-xloader.svg)](https://pypi.org/project/ckanext-xloader/) @@ -17,6 +13,7 @@ more robustness (hence the name, derived from "Express Loader") **OpenGov Inc.** has sponsored this development, with the aim of benefitting open data infrastructure worldwide. + * [Key differences from DataPusher](#key-differences-from-datapusher) * [Requirements](#requirements) * [Installation](#installation) From c9318f72d1b6d5f01b41cc2f69dfe8e5a0b8f633 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Thu, 21 Nov 2024 14:24:38 +1000 Subject: [PATCH 03/41] fix: pypi.org is not on .com --- .github/workflows/publish.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index ef7612bf..20bceb8c 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -138,10 +138,10 @@ jobs: echo "reponame=${reponame}" >> $GITHUB_OUTPUT if [ "$env.ENVIRONMENT" == "testpypi" ]; then - url="https://test.pypi.com/p/$reponame" + url="https://test.pypi.org/project/$reponame/$TAG_VALUE/" echo "environment=${env.ENVIRONMENT}" >> $GITHUB_OUTPUT else - url="https://pypi.com/p/$reponame" + url="https://pypi.org/project/$reponame/$TAG_VALUE/" echo "environment=pypi" >> $GITHUB_OUTPUT fi From e32369460288ea78ba870e9b13dc9fb0958ab76b Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 21 Nov 2024 11:04:15 +0100 Subject: [PATCH 04/41] Review changes --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index e51caa56..859eee03 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,14 @@ All configurations below are defined in the #### ckanext.xloader.jobs_db.uri +Example: + +``` +postgresql://ckan_default:pass@localhost/ckan_default + +``` + + Default value: `sqlite:////tmp/xloader_jobs.db` The connection string for the jobs database used by XLoader. The From 9ae3b5bd7de17f632f6ae98bb8203140a6c81521 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Mon, 25 Nov 2024 08:08:47 +1000 Subject: [PATCH 05/41] chore: switch to pyproject.toml Updated authors --- pyproject.toml | 145 +++++++++++++++++++++++++++++++++++++++++++++++++ setup.cfg | 8 --- setup.py | 85 +---------------------------- 3 files changed, 146 insertions(+), 92 deletions(-) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..bee3c33c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,145 @@ +[build-system] +requires = [ "setuptools",] +build-backend = "setuptools.build_meta" + +[project] +name = "ckanext-xloader" +version = "1.2.0" +description = "Express Loader - quickly load data into CKAN DataStore" +classifiers = [ "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10",] +keywords = [ "CKAN", "extension", "datastore",] +dependencies = [ "typing_extensions",] +authors = [ + {name = "ThrawnCA", email = "carl.antuar@smartservice.qld.gov.au"}, + {name = "Jesse Vickery (JVickery-TBS)", email = "jesse.vickery@tbs-sct.gc.ca"}, + {name = "Adrià Mercader (amercader)", email = "amercadero@gmail.com"}, + {name = "David Read (davidread)"}, + {name = "Brett Jones (kowh-ai)", email = "datashades@linkdigital.com.au"}, + {name = "Patricio Del Boca (pdelboca)", email = "patriciodelboca@gmail.com"}, + {name = "William Dutton (duttonw)", email = "william.dutton@qld.gov.au"}, +# {name = "", email = ""}, +] +maintainers = [ + {name = "Adrià Mercader (amercader)", email = "amercadero@gmail.com"}, + {name = "William Dutton (duttonw)", email = "william.dutton@qld.gov.au"}, + {name = "Ian Ward (wardi)", email = "ian@excess.org"}, + {name = "Brett Jones (kowh-ai)", email = "datashades@linkdigital.com.au"}, +] + +[project.readme] +file = "README.md" +content-type = "text/markdown" + +[project.license] +text = "AGPL" + +[project.urls] +Homepage = "https://github.com/ckan/ckanext-xloader" + +[project.optional-dependencies] +test = [ "pytest-factoryboy",] + +[project.entry-points."ckan.plugins"] +xloader = "ckanext.xloader.plugin:xloaderPlugin" + +[project.entry-points."babel.extractors"] +ckan = "ckan.lib.extract:extract_ckan" + +[tool.setuptools.packages] +find = {} + +[tool.black] +line-length = 79 +preview = true + +[tool.isort] +known_ckan = "ckan" +known_ckanext = "ckanext" +known_self = "ckanext.xloader" +sections = "FUTURE,STDLIB,FIRSTPARTY,THIRDPARTY,CKAN,CKANEXT,SELF,LOCALFOLDER" + +[tool.pytest.ini_options] +addopts = "--ckan-ini test.ini" +filterwarnings = [ + "ignore::sqlalchemy.exc.SADeprecationWarning", + "ignore::sqlalchemy.exc.SAWarning", + "ignore::DeprecationWarning", +] + +[tool.pyright] +pythonVersion = "3.7" +include = ["ckanext"] +exclude = [ + "**/test*", + "**/migration", +] +strict = [] + +strictParameterNoneValue = true # type must be Optional if default value is None + +# Check the meaning of rules here +# https://github.com/microsoft/pyright/blob/main/docs/configuration.md +reportFunctionMemberAccess = true # non-standard member accesses for functions +reportMissingImports = true +reportMissingModuleSource = true +reportMissingTypeStubs = false +reportImportCycles = true +reportUnusedImport = true +reportUnusedClass = true +reportUnusedFunction = true +reportUnusedVariable = true +reportDuplicateImport = true +reportOptionalSubscript = true +reportOptionalMemberAccess = true +reportOptionalCall = true +reportOptionalIterable = true +reportOptionalContextManager = true +reportOptionalOperand = true +reportTypedDictNotRequiredAccess = false # We are using Context in a way that conflicts with this check +reportConstantRedefinition = true +reportIncompatibleMethodOverride = true +reportIncompatibleVariableOverride = true +reportOverlappingOverload = true +reportUntypedFunctionDecorator = false +reportUnknownParameterType = true +reportUnknownArgumentType = false +reportUnknownLambdaType = false +reportUnknownMemberType = false +reportMissingTypeArgument = true +reportInvalidTypeVarUse = true +reportCallInDefaultInitializer = true +reportUnknownVariableType = true +reportUntypedBaseClass = true +reportUnnecessaryIsInstance = true +reportUnnecessaryCast = true +reportUnnecessaryComparison = true +reportAssertAlwaysTrue = true +reportSelfClsParameterName = true +reportUnusedCallResult = false # allow function calls for side-effect only (like logic.check_acces) +useLibraryCodeForTypes = true +reportGeneralTypeIssues = true +reportPropertyTypeMismatch = true +reportWildcardImportFromLibrary = true +reportUntypedClassDecorator = false # authenticator relies on repoze.who class-decorator +reportUntypedNamedTuple = true +reportPrivateUsage = true +reportPrivateImportUsage = true +reportInconsistentConstructor = true +reportMissingSuperCall = false +reportUninitializedInstanceVariable = true +reportInvalidStringEscapeSequence = true +reportMissingParameterType = true +reportImplicitStringConcatenation = false +reportUndefinedVariable = true +reportUnboundVariable = true +reportInvalidStubStatement = true +reportIncompleteStub = true +reportUnsupportedDunderAll = true +reportUnusedCoroutine = true +reportUnnecessaryTypeIgnoreComment = true +reportMatchNotExhaustive = true \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 5e2fb6fd..168d5b0b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,11 +19,3 @@ previous = true domain = ckanext-xloader directory = ckanext/xloader/i18n statistics = true - -[tool:pytest] - -filterwarnings = - ignore::sqlalchemy.exc.SADeprecationWarning - ignore::sqlalchemy.exc.SAWarning - ignore::DeprecationWarning -addopts = --pdbcls=IPython.terminal.debugger:TerminalPdb \ No newline at end of file diff --git a/setup.py b/setup.py index 7d49c89e..cb65266c 100644 --- a/setup.py +++ b/setup.py @@ -1,91 +1,8 @@ # -*- coding: utf-8 -*- -from setuptools import setup, find_packages # Always prefer setuptools over distutils -from codecs import open # To use a consistent encoding -from os import path +from setuptools import setup -here = path.abspath(path.dirname(__file__)) - -# Get the long description from the relevant file -with open(path.join(here, 'README.md'), encoding='utf-8') as f: - long_description = f.read() setup( - name='''ckanext-xloader''', - - # Versions should comply with PEP440. For a discussion on single-sourcing - # the version across setup.py and the project code, see - # http://packaging.python.org/en/latest/tutorial.html#version - version='1.2.0', - - description='Express Loader - quickly load data into CKAN DataStore''', - long_description=long_description, - long_description_content_type='text/markdown', - - # The project's main homepage. - url='https://github.com/ckan/ckanext-xloader', - - # Author details - author='''David Read''', - author_email='''david.read@hackneyworkshop.com''', - - # Choose your license - license='AGPL', - - # See https://pypi.python.org/pypi?%3Aaction=list_classifiers - classifiers=[ - 'Development Status :: 5 - Production/Stable', - - # Pick your license as you wish (should match "license" above) - 'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)', - - # Specify the Python versions you support here. - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - ], - - - # What does your project relate to? - keywords='''CKAN extension datastore''', - - # You can just specify the packages manually here if your project is - # simple. Or you can use find_packages(). - packages=find_packages(exclude=['contrib', 'docs', 'tests*']), - namespace_packages=['ckanext'], - - install_requires=[ - # CKAN extensions should not list dependencies here, but in a separate - # ``requirements.txt`` file. - # - # http://docs.ckan.org/en/latest/extensions/best-practices.html#add-third-party-libraries-to-requirements-txt - ], - - # If there are data files included in your packages that need to be - # installed, specify them here. If using Python 2.6 or less, then these - # have to be included in MANIFEST.in as well. - include_package_data=True, - package_data={ - }, - - # Although 'package_data' is the preferred approach, in some case you may - # need to place data files outside of your packages. - # see http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files - # In this case, 'data_file' will be installed into '/my_data' - data_files=[], - - # To provide executable scripts, use entry points in preference to the - # "scripts" keyword. Entry points provide cross-platform support and allow - # pip to create the appropriate form of executable for the target platform. - entry_points=''' - [ckan.plugins] - xloader=ckanext.xloader.plugin:xloaderPlugin - - [babel.extractors] - ckan = ckan.lib.extract:extract_ckan - - ''', - # If you are changing from the default layout of your extension, you may # have to change the message extractors, you can read more about babel # message extraction at From b65906c6b93ceb15f0b7faa6b1250b294a8b63b2 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Thu, 28 Nov 2024 06:11:16 +1000 Subject: [PATCH 06/41] chore: Update authors/maintainers --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bee3c33c..858db4ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,14 +20,14 @@ authors = [ {name = "Adrià Mercader (amercader)", email = "amercadero@gmail.com"}, {name = "David Read (davidread)"}, {name = "Brett Jones (kowh-ai)", email = "datashades@linkdigital.com.au"}, - {name = "Patricio Del Boca (pdelboca)", email = "patriciodelboca@gmail.com"}, + {name = "Patricio Del Boca (pdelboca)"}, {name = "William Dutton (duttonw)", email = "william.dutton@qld.gov.au"}, # {name = "", email = ""}, ] maintainers = [ {name = "Adrià Mercader (amercader)", email = "amercadero@gmail.com"}, {name = "William Dutton (duttonw)", email = "william.dutton@qld.gov.au"}, - {name = "Ian Ward (wardi)", email = "ian@excess.org"}, + {name = "Ian Ward (wardi)"}, {name = "Brett Jones (kowh-ai)", email = "datashades@linkdigital.com.au"}, ] From 1bbd6430072694777add938110fa28673adc9868 Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Mon, 9 Dec 2024 16:25:18 +0000 Subject: [PATCH 07/41] squash(pr): 200; - Squashed version of https://github.com/ckan/ckanext-xloader/pull/200. --- CHANGELOG | 13 ++++++++-- README.md | 32 +++++++++++++++++++++++-- ckanext/xloader/action.py | 18 +++++++++++--- ckanext/xloader/config_declaration.yaml | 2 ++ ckanext/xloader/plugin.py | 14 +++++++---- ckanext/xloader/schema.py | 2 ++ ckanext/xloader/utils.py | 7 +++--- 7 files changed, 74 insertions(+), 14 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 26bce2d0..e804e0ad 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,12 @@ +1.1.3 2024-12-09 +================ + +Feat: +* Adds support for ckanext-validation. Config `ckanext.xloader.validation.requires_successful_report` controls whether a resource requires a successful validation report to be XLoadered. By default, a resource would also require a Validation Schema, which can be turned off with `ckanext.xloader.validation.enforce_schema`. + + +**Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.1.2...1.1.3 + 1.1.2 2024-10-25 ================ @@ -21,7 +30,7 @@ Fix: 1.1.0 2024-10-16 ================ -Fixes: +Fixes: * feat: Add pypi cicd publish via github action via environment controls by @duttonw in https://github.com/ckan/ckanext-xloader/pull/228 @@ -32,7 +41,7 @@ Fixes: ================ -Fixes: +Fixes: * add README note about running on separate server, #191 by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/192 * Use IDomainObjectModification Implementation by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/198 diff --git a/README.md b/README.md index 859eee03..db30c160 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,7 @@ Compatibility with core CKAN versions: | CKAN version | Compatibility | | -------------- | -------------------------------------------------------------------------------------------------------------- | | 2.7 | no longer supported (last supported version: 0.12.2) | - | 2.8 | no longer supported (last supported version: 0.12.2) | + | 2.8 | no longer supported (last supported version: 0.12.2) | | 2.9 | yes (Python3) (last supported version for Python 2.7: 0.12.2)), Must: `pip install "setuptools>=44.1.0,<71"` | | 2.10 | yes | | 2.11 | yes | @@ -189,7 +189,7 @@ expect European (day-first) dates, you could add to `postgresql.conf`: datestyle=ISO,DMY -All configurations below are defined in the +All configurations below are defined in the [config_declaration.yaml](ckanext/xloader/config_declaration.yaml) file. @@ -409,6 +409,34 @@ Controls whether or not the status badges display all of the statuses. By defaul the badges will display "pending", "running", and "error". With debug_badges enabled, they will also display "complete", "active", "inactive", and "unknown". +#### ckanext.xloader.validation.requires_successful_report + +Supports: __ckanext-validation__ + +Example: + +``` +ckanext.xloader.validation.requires_successful_report = True +``` + +Default value: `False` + +Controls whether or not a resource requires a successful validation report from the ckanext-validation plugin in order to be XLoadered. + +#### ckanext.xloader.validation.enforce_schema + +Supports: __ckanext-validation__ + +Example: + +``` +ckanext.xloader.validation.enforce_schema = False +``` + +Default value: `True` + +Controls whether or not a resource requires a Validation Schema to be present from the ckanext-validation plugin to be XLoadered. + ## Developer installation To install XLoader for development, activate your CKAN virtualenv and in diff --git a/ckanext/xloader/action.py b/ckanext/xloader/action.py index c0f3f84f..0ce1aaed 100644 --- a/ckanext/xloader/action.py +++ b/ckanext/xloader/action.py @@ -53,6 +53,13 @@ def xloader_submit(context, data_dict): if errors: raise p.toolkit.ValidationError(errors) + p.toolkit.check_access('xloader_submit', context, data_dict) + + # If sync is set to True, the xloader callback will be executed right + # away, instead of a job being enqueued. It will also delete any existing jobs + # for the given resource. This is only controlled by sysadmins or the system. + sync = data_dict.pop('sync', False) + res_id = data_dict['resource_id'] try: resource_dict = p.toolkit.get_action('resource_show')(context, { @@ -166,15 +173,20 @@ def xloader_submit(context, data_dict): job = enqueue_job( jobs.xloader_data_into_datastore, [data], queue=custom_queue, title="xloader_submit: package: {} resource: {}".format(resource_dict.get('package_id'), res_id), - rq_kwargs=dict(timeout=timeout) + rq_kwargs=dict(timeout=timeout, at_front=sync) ) except Exception: - log.exception('Unable to enqueued xloader res_id=%s', res_id) + if sync: + log.exception('Unable to xloader res_id=%s', res_id) + else: + log.exception('Unable to enqueue xloader res_id=%s', res_id) return False log.debug('Enqueued xloader job=%s res_id=%s', job.id, res_id) - value = json.dumps({'job_id': job.id}) + if sync: + log.debug('Pushed xloader sync mode job=%s res_id=%s to front of queue', job.id, res_id) + task['value'] = value task['state'] = 'pending' task['last_updated'] = str(datetime.datetime.utcnow()) diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index 9487999d..6d6bf21b 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -141,8 +141,10 @@ groups: example: False description: | Resources are expected to have a Validation Schema, or use the default ones if not. + If this option is set to `False`, Resources that do not have a Validation Schema will be treated like they do not require Validation. + See https://github.com/frictionlessdata/ckanext-validation?tab=readme-ov-file#data-schema for more details. - key: ckanext.xloader.clean_datastore_tables diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index d916cc54..ca4bfb6e 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -183,7 +183,7 @@ def before_show(self, resource_dict): def after_update(self, context, resource_dict): self.after_resource_update(context, resource_dict) - def _submit_to_xloader(self, resource_dict): + def _submit_to_xloader(self, resource_dict, sync=False): context = {"ignore_auth": True, "defer_commit": True} resource_format = resource_dict.get("format") if not XLoaderFormats.is_it_an_xloader_format(resource_format): @@ -203,14 +203,20 @@ def _submit_to_xloader(self, resource_dict): return try: - log.debug( - "Submitting resource %s to be xloadered", resource_dict["id"] - ) + if sync: + log.debug( + "xloadering resource %s in sync mode", resource_dict["id"] + ) + else: + log.debug( + "Submitting resource %s to be xloadered", resource_dict["id"] + ) toolkit.get_action("xloader_submit")( context, { "resource_id": resource_dict["id"], "ignore_hash": self.ignore_hash, + "sync": sync, }, ) except toolkit.ValidationError as e: diff --git a/ckanext/xloader/schema.py b/ckanext/xloader/schema.py index c0e8d938..47ae65a5 100644 --- a/ckanext/xloader/schema.py +++ b/ckanext/xloader/schema.py @@ -16,6 +16,7 @@ boolean_validator = get_validator('boolean_validator') int_validator = get_validator('int_validator') OneOf = get_validator('OneOf') +ignore_not_sysadmin = get_validator('ignore_not_sysadmin') if p.toolkit.check_ckan_version('2.9'): unicode_safe = get_validator('unicode_safe') @@ -29,6 +30,7 @@ def xloader_submit_schema(): 'id': [ignore_missing], 'set_url_type': [ignore_missing, boolean_validator], 'ignore_hash': [ignore_missing, boolean_validator], + 'sync': [ignore_missing, boolean_validator, ignore_not_sysadmin], '__junk': [empty], '__before': [dsschema.rename('id', 'resource_id')] } diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index bcba510e..12ce2fbf 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -2,6 +2,7 @@ import json import datetime +from rq import get_current_job from six import text_type as str, binary_type @@ -33,8 +34,6 @@ "application/vnd.oasis.opendocument.spreadsheet", ] -from .job_exceptions import JobError - class XLoaderFormats(object): formats = None @@ -61,8 +60,10 @@ def awaiting_validation(res_dict): """ Checks the existence of a logic action from the ckanext-validation plugin, thus supporting any extending of the Validation Plugin class. + Checks ckanext.xloader.validation.requires_successful_report config option value. + Checks ckanext.xloader.validation.enforce_schema config option value. Then checks the Resource's validation_status. """ @@ -273,7 +274,7 @@ def type_guess(rows, types=TYPES, strict=False): at_least_one_value = [] for ri, row in enumerate(rows): diff = len(row) - len(guesses) - for _ in range(diff): + for _i in range(diff): typesdict = {} for type in types: typesdict[type] = 0 From 453ecae17518e541bda00318182e22a8b41633ab Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Mon, 9 Dec 2024 21:59:13 +0000 Subject: [PATCH 08/41] revert(misc): import; - Removed unused import. --- ckanext/xloader/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index 12ce2fbf..4c90c84d 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -2,7 +2,6 @@ import json import datetime -from rq import get_current_job from six import text_type as str, binary_type From a4963aebca53ddf665eba39cc20777abfc76c753 Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Mon, 9 Dec 2024 23:00:32 +0000 Subject: [PATCH 09/41] squash(pr): 215; - Squash of https://github.com/ckan/ckanext-xloader/pull/215 --- CHANGELOG | 1 + README.md | 6 + ckanext/xloader/loader.py | 78 +- ckanext/xloader/parser.py | 12 +- ckanext/xloader/plugin.py | 25 + .../datastore/snippets/dictionary_form.html | 17 + .../tests/samples/boston_311_sample.csv | 8 +- ckanext/xloader/tests/test_jobs.py | 2 +- ckanext/xloader/tests/test_loader.py | 908 +++++++++++++----- requirements.txt | 2 +- 10 files changed, 781 insertions(+), 278 deletions(-) create mode 100644 ckanext/xloader/templates/datastore/snippets/dictionary_form.html diff --git a/CHANGELOG b/CHANGELOG index e804e0ad..0cddd66b 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ ================ Feat: +* Adds Strip White Space fields to the Data Dictionary (defualts to `True` for each field). This will strip surrounding white space from data values prior to inserting them into the database. * Adds support for ckanext-validation. Config `ckanext.xloader.validation.requires_successful_report` controls whether a resource requires a successful validation report to be XLoadered. By default, a resource would also require a Validation Schema, which can be turned off with `ckanext.xloader.validation.enforce_schema`. diff --git a/README.md b/README.md index db30c160..338102fa 100644 --- a/README.md +++ b/README.md @@ -437,6 +437,12 @@ Default value: `True` Controls whether or not a resource requires a Validation Schema to be present from the ckanext-validation plugin to be XLoadered. +## Data Dictionary Fields + +#### strip_extra_white + +This plugin adds the `Strip Extra Leading and Trailing White Space` field to Data Dictionary fields. This controls whether or not to trim whitespace from data values prior to inserting into the database. Default for each field is `True` (it will trim whitespace). + ## Developer installation To install XLoader for development, activate your CKAN virtualenv and in diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 46814181..f4aa7af8 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -171,17 +171,6 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): logger.info('Ensuring character coding is UTF8') f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False) try: - save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter} - try: - with UnknownEncodingStream(csv_filepath, file_format, decoding_result, - skip_rows=skip_rows) as stream: - stream.save(**save_args) - except (EncodingError, UnicodeDecodeError): - with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING, - skip_rows=skip_rows) as stream: - stream.save(**save_args) - csv_filepath = f_write.name - # datastore db connection engine = get_write_engine() @@ -189,10 +178,16 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): existing = datastore_resource_exists(resource_id) existing_info = {} if existing: - existing_fields = existing.get('fields', []) + if p.toolkit.check_ckan_version(min_version='2.11'): + ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id}) + existing_fields = ds_info.get('fields', []) + else: + existing_fields = existing.get('fields', []) existing_info = dict((f['id'], f['info']) for f in existing_fields if 'info' in f) + existing_fields_by_headers = dict((f['id'], f) + for f in existing_fields) # Column types are either set (overridden) in the Data Dictionary page # or default to text type (which is robust) @@ -207,6 +202,8 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): for f in fields: if f['id'] in existing_info: f['info'] = existing_info[f['id']] + f['strip_extra_white'] = existing_info[f['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[f['id']] \ + else existing_fields_by_headers[f['id']].get('strip_extra_white', True) ''' Delete or truncate existing datastore table before proceeding, @@ -223,11 +220,43 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): else: fields = [ {'id': header_name, - 'type': 'text'} + 'type': 'text', + 'strip_extra_white': True,} for header_name in headers] logger.info('Fields: %s', fields) + save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter} + try: + with UnknownEncodingStream(csv_filepath, file_format, decoding_result, + skip_rows=skip_rows) as stream: + super_iter = stream.iter + def strip_white_space_iter(): + for row in super_iter(): + if len(row) == len(fields): + for _index, _cell in enumerate(row): + # only strip white space if strip_extra_white is True + if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): + row[_index] = _cell.strip() + yield row + stream.iter = strip_white_space_iter + stream.save(**save_args) + except (EncodingError, UnicodeDecodeError): + with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING, + skip_rows=skip_rows) as stream: + super_iter = stream.iter + def strip_white_space_iter(): + for row in super_iter(): + if len(row) == len(fields): + for _index, _cell in enumerate(row): + # only strip white space if strip_extra_white is True + if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): + row[_index] = _cell.strip() + yield row + stream.iter = strip_white_space_iter + stream.save(**save_args) + csv_filepath = f_write.name + # Create table from ckan import model context = {'model': model, 'ignore_auth': True} @@ -383,10 +412,16 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): existing = datastore_resource_exists(resource_id) existing_info = None if existing: - existing_fields = existing.get('fields', []) + if p.toolkit.check_ckan_version(min_version='2.11'): + ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id}) + existing_fields = ds_info.get('fields', []) + else: + existing_fields = existing.get('fields', []) existing_info = dict( (f['id'], f['info']) for f in existing_fields if 'info' in f) + existing_fields_by_headers = dict((f['id'], f) + for f in existing_fields) # Some headers might have been converted from strings to floats and such. headers = encode_headers(headers) @@ -400,6 +435,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): strict_guessing = p.toolkit.asbool( config.get('ckanext.xloader.strict_type_guessing', True)) types = type_guess(stream.sample[1:], types=TYPES, strict=strict_guessing) + fields = [] # override with types user requested if existing_info: @@ -410,6 +446,12 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): 'timestamp': datetime.datetime, }.get(existing_info.get(h, {}).get('type_override'), t) for t, h in zip(types, headers)] + for h in headers: + fields.append(existing_fields_by_headers.get(h, {})) + else: + # default strip_extra_white + for h in headers: + fields.append({'strip_extra_white': True}) # Strip leading and trailing whitespace, then truncate to maximum length, # then strip again in case the truncation exposed a space. @@ -419,7 +461,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): if header and header.strip() ] header_count = len(headers) - type_converter = TypeConverter(types=types) + type_converter = TypeConverter(types=types, fields=fields) with UnknownEncodingStream(table_filepath, file_format, decoding_result, skip_rows=skip_rows, @@ -451,10 +493,16 @@ def row_iterator(): for h in headers_dicts: if h['id'] in existing_info: h['info'] = existing_info[h['id']] + h['strip_extra_white'] = existing_info[h['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[h['id']] \ + else existing_fields_by_headers[h['id']].get('strip_extra_white', True) # create columns with types user requested type_override = existing_info[h['id']].get('type_override') if type_override in list(_TYPE_MAPPING.values()): h['type'] = type_override + else: + # default strip_extra_white + for h in headers_dicts: + h['strip_extra_white'] = True # preserve any types that we have sniffed unless told otherwise _save_type_overrides(headers_dicts) diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py index 11e756cd..d57b7313 100644 --- a/ckanext/xloader/parser.py +++ b/ckanext/xloader/parser.py @@ -18,8 +18,9 @@ class TypeConverter: as desired. """ - def __init__(self, types=None): + def __init__(self, types=None, fields=None): self.types = types + self.fields = fields def convert_types(self, extended_rows): """ Try converting cells to numbers or timestamps if applicable. @@ -31,7 +32,16 @@ def convert_types(self, extended_rows): for cell_index, cell_value in enumerate(row): if cell_value is None: row[cell_index] = '' + if self.fields: + # only strip white space if strip_extra_white is True + if self.fields[cell_index].get('info', {}).get('strip_extra_white', True) and isinstance(cell_value, six.text_type): + cell_value = cell_value.strip() + row[cell_index] = cell_value.strip() if not cell_value: + # load_csv parody: empty of string type should be None + if self.types and self.types[cell_index] == six.text_type: + cell_value = None + row[cell_index] = None continue cell_type = self.types[cell_index] if self.types else None if cell_type in [Decimal, None]: diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index ca4bfb6e..5c0084d9 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -25,6 +25,12 @@ def config_declarations(cls): return cls +if toolkit.check_ckan_version(min_version='2.11'): + from ckanext.datastore.interfaces import IDataDictionaryForm + has_idata_dictionary_form = True +else: + has_idata_dictionary_form = False + log = logging.getLogger(__name__) @@ -39,6 +45,8 @@ class xloaderPlugin(plugins.SingletonPlugin): plugins.implements(plugins.IResourceController, inherit=True) plugins.implements(plugins.IClick) plugins.implements(plugins.IBlueprint) + if has_idata_dictionary_form: + plugins.implements(IDataDictionaryForm, inherit=True) if HAS_IPIPE_VALIDATION: plugins.implements(IPipeValidation) @@ -252,6 +260,23 @@ def get_helpers(self): "xloader_badge": xloader_helpers.xloader_badge, } + # IDataDictionaryForm + + def update_datastore_create_schema(self, schema): + default = toolkit.get_validator('default') + boolean_validator = toolkit.get_validator('boolean_validator') + to_datastore_plugin_data = toolkit.get_validator('to_datastore_plugin_data') + schema['fields']['strip_extra_white'] = [default(True), boolean_validator, to_datastore_plugin_data('xloader')] + return schema + + def update_datastore_info_field(self, field, plugin_data): + # expose all our non-secret plugin data in the field + field.update(plugin_data.get('xloader', {})) + # CKAN version parody + if '_info' in plugin_data: + field.update({'info': plugin_data['_info']}) + return field + def _should_remove_unsupported_resource_from_datastore(res_dict): if not toolkit.asbool(toolkit.config.get('ckanext.xloader.clean_datastore_tables', False)): diff --git a/ckanext/xloader/templates/datastore/snippets/dictionary_form.html b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html new file mode 100644 index 00000000..6c2f576c --- /dev/null +++ b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html @@ -0,0 +1,17 @@ +{% ckan_extends %} +{% import 'macros/form.html' as form %} + +{% block additional_fields %} + {{ super() }} + {% if h.check_ckan_version(min_version='2.11') %} + {% set field_prefix = 'fields__' %} + {% else %} + {% set field_prefix = 'info__' %} + {% endif %} + {% set selected_value = field.get('info', {}).get('strip_extra_white', field.get('strip_extra_white', true)) %} + {{ form.select(field_prefix ~ position ~ '__strip_extra_white', + label=_('Strip Extra Leading and Trailing White Space'), options=[ + {'text': _('Yes'), 'value': true}, + {'text': _('No'), 'value': false}, + ], selected=selected_value) }} +{% endblock %} diff --git a/ckanext/xloader/tests/samples/boston_311_sample.csv b/ckanext/xloader/tests/samples/boston_311_sample.csv index 83e0d5f2..e3a7e5be 100644 --- a/ckanext/xloader/tests/samples/boston_311_sample.csv +++ b/ckanext/xloader/tests/samples/boston_311_sample.csv @@ -1,4 +1,4 @@ -CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source -101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App -101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App -101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App +CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source +101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department ,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App +101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App +101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py index e819dad9..62ae7174 100644 --- a/ckanext/xloader/tests/test_jobs.py +++ b/ckanext/xloader/tests/test_jobs.py @@ -81,7 +81,7 @@ def test_xloader_data_into_datastore(self, cli, data): with mock.patch("ckanext.xloader.jobs.get_response", get_response): stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output assert "File hash: d44fa65eda3675e11710682fdb5f1648" in stdout - assert "Fields: [{'id': 'x', 'type': 'text'}, {'id': 'y', 'type': 'text'}]" in stdout + assert "Fields: [{'id': 'x', 'type': 'text', 'strip_extra_white': True}, {'id': 'y', 'type': 'text', 'strip_extra_white': True}]" in stdout assert "Copying to database..." in stdout assert "Creating search index..." in stdout assert "Express Load completed" in stdout diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 5cc080a0..4cc8d86a 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -102,6 +102,20 @@ def test_simple(self, Session): logger=logger, ) + records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", + u"_full_text", + u"date", + u"temperature", + u"place", + ] + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", + u"tsvector", + ] + [u"text"] * (len(records[0]) - 1) assert self._get_records( Session, resource_id, limit=1, exclude_full_text_column=False ) == [ @@ -113,7 +127,8 @@ def test_simple(self, Session): u"Galway", ) ] - assert self._get_records(Session, resource_id) == [ + print(records) + assert records == [ (1, u"2011-01-01", u"1", u"Galway"), (2, u"2011-01-02", u"-1", u"Galway"), (3, u"2011-01-03", u"0", u"Galway"), @@ -121,20 +136,6 @@ def test_simple(self, Session): (5, None, None, u"Berkeley"), (6, u"2011-01-03", u"5", None), ] - assert self._get_column_names(Session, resource_id) == [ - u"_id", - u"_full_text", - u"date", - u"temperature", - u"place", - ] - assert self._get_column_types(Session, resource_id) == [ - u"int4", - u"tsvector", - u"text", - u"text", - u"text", - ] def test_simple_with_indexing(self, Session): csv_filepath = get_sample_filepath("simple.csv") @@ -217,6 +218,45 @@ def test_boston_311(self, Session): ) records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", + u"_full_text", + u"CASE_ENQUIRY_ID", + u"open_dt", + u"target_dt", + u"closed_dt", + u"OnTime_Status", + u"CASE_STATUS", + u"CLOSURE_REASON", + u"CASE_TITLE", + u"SUBJECT", + u"REASON", + u"TYPE", + u"QUEUE", + u"Department", + u"SubmittedPhoto", + u"ClosedPhoto", + u"Location", + u"Fire_district", + u"pwd_district", + u"city_council_district", + u"police_district", + u"neighborhood", + u"neighborhood_services_district", + u"ward", + u"precinct", + u"LOCATION_STREET_NAME", + u"LOCATION_ZIPCODE", + u"Latitude", + u"Longitude", + u"Source", + ] # noqa + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", + u"tsvector", + ] + [u"text"] * (len(records[0]) - 1) print(records) assert records == [ ( @@ -227,9 +267,9 @@ def test_boston_311(self, Session): None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Street Light Outages", - u"Public Works Department", + u"Public Works Department", # " " trailing whitespace gets trimmed u"Street Lights", u"Street Light Outages", u"PWDx_Street Light Outages", @@ -259,14 +299,14 @@ def test_boston_311(self, Session): None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Graffiti Removal", u"Property Management", u"Graffiti", u"Graffiti Removal", u"PROP_GRAF_GraffitiRemoval", u"PROP", - u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", + u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces None, u"522 Saratoga St East Boston MA 02128", u"1", @@ -291,14 +331,14 @@ def test_boston_311(self, Session): None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Graffiti Removal", u"Property Management", u"Graffiti", u"Graffiti Removal", u"PROP_GRAF_GraffitiRemoval", u"PROP", - u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", + u"https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # strip white spaces None, u"965 Bennington St East Boston MA 02128", u"1", @@ -316,45 +356,6 @@ def test_boston_311(self, Session): u"Citizens Connect App", ), ] # noqa - print(self._get_column_names(Session, resource_id)) - assert self._get_column_names(Session, resource_id) == [ - u"_id", - u"_full_text", - u"CASE_ENQUIRY_ID", - u"open_dt", - u"target_dt", - u"closed_dt", - u"OnTime_Status", - u"CASE_STATUS", - u"CLOSURE_REASON", - u"CASE_TITLE", - u"SUBJECT", - u"REASON", - u"TYPE", - u"QUEUE", - u"Department", - u"SubmittedPhoto", - u"ClosedPhoto", - u"Location", - u"Fire_district", - u"pwd_district", - u"city_council_district", - u"police_district", - u"neighborhood", - u"neighborhood_services_district", - u"ward", - u"precinct", - u"LOCATION_STREET_NAME", - u"LOCATION_ZIPCODE", - u"Latitude", - u"Longitude", - u"Source", - ] # noqa - print(self._get_column_types(Session, resource_id)) - assert self._get_column_types(Session, resource_id) == [ - u"int4", - u"tsvector", - ] + [u"text"] * (len(records[0]) - 1) def test_brazilian(self, Session): csv_filepath = get_sample_filepath("brazilian_sample.csv") @@ -368,105 +369,6 @@ def test_brazilian(self, Session): ) records = self._get_records(Session, resource_id) - print(records) - assert records[0] == ( - 1, - u"01/01/1996 12:00:00 AM", - u"1100015", - u"ALTA FLORESTA D'OESTE", - u"RO", - None, - u"128", - u"0", - u"8", - u"119", - u"1", - u"0", - u"3613", - u"3051", - u"130", - u"7", - u"121", - u"3716", - u"3078", - u"127", - u"7", - None, - None, - None, - None, - u"6794", - u"5036", - u"1758", - None, - None, - None, - None, - None, - None, - u"337", - u"0.26112759", - u"0.17210683", - u"0.43323442", - u"0.13353115", - u"24.833692447908199", - None, - None, - u"22.704964", - u"67.080006197818605", - u"65.144188573097907", - u"74.672390253375497", - u"16.7913561569619", - u"19.4894563570641", - u"8.649237411458509", - u"7.60165422117368", - u"11.1540090366186", - u"17.263407056738099", - u"8.5269823", - u"9.2213373", - u"5.3085136", - u"52.472769803217503", - None, - None, - None, - None, - None, - None, - u"25.0011414302354", - u"22.830887000000001", - u"66.8150490097632", - u"64.893674212235595", - u"74.288246611754104", - u"17.0725384713319", - u"19.8404105332814", - u"8.856561911292371", - u"7.74275834336647", - u"11.357671741889", - u"17.9410577459881", - u"8.3696527", - u"8.9979973", - u"5.0570836", - u"53.286314230720798", - None, - None, - None, - None, - None, - u"122988", - None, - u"10.155015000000001", - u"14.826086999999999", - u"11.671533", - u"9.072917", - None, - None, - None, - None, - None, - None, - None, - None, - ) # noqa print(self._get_column_names(Session, resource_id)) assert self._get_column_names(Session, resource_id) == [ u"_id", @@ -572,6 +474,105 @@ def test_brazilian(self, Session): u"int4", u"tsvector", ] + [u"text"] * (len(records[0]) - 1) + print(records) + assert records[0] == ( + 1, + u"01/01/1996 12:00:00 AM", + u"1100015", + u"ALTA FLORESTA D'OESTE", + u"RO", + None, + u"128", + u"0", + u"8", + u"119", + u"1", + u"0", + u"3613", + u"3051", + u"130", + u"7", + u"121", + u"3716", + u"3078", + u"127", + u"7", + None, + None, + None, + None, + u"6794", + u"5036", + u"1758", + None, + None, + None, + None, + None, + None, + u"337", + u"0.26112759", + u"0.17210683", + u"0.43323442", + u"0.13353115", + u"24.833692447908199", + None, + None, + u"22.704964", + u"67.080006197818605", + u"65.144188573097907", + u"74.672390253375497", + u"16.7913561569619", + u"19.4894563570641", + u"8.649237411458509", + u"7.60165422117368", + u"11.1540090366186", + u"17.263407056738099", + u"8.5269823", + u"9.2213373", + u"5.3085136", + u"52.472769803217503", + None, + None, + None, + None, + None, + None, + u"25.0011414302354", + u"22.830887000000001", + u"66.8150490097632", + u"64.893674212235595", + u"74.288246611754104", + u"17.0725384713319", + u"19.8404105332814", + u"8.856561911292371", + u"7.74275834336647", + u"11.357671741889", + u"17.9410577459881", + u"8.3696527", + u"8.9979973", + u"5.0570836", + u"53.286314230720798", + None, + None, + None, + None, + None, + u"122988", + None, + u"10.155015000000001", + u"14.826086999999999", + u"11.671533", + u"9.072917", + None, + None, + None, + None, + None, + None, + None, + None, + ) # noqa def test_german(self, Session): csv_filepath = get_sample_filepath("german_sample.csv") @@ -585,20 +586,6 @@ def test_german(self, Session): ) records = self._get_records(Session, resource_id) - print(records) - assert records[0] == ( - 1, - u"Zürich", - u"68260", - u"65444", - u"62646", - u"6503", - u"28800", - u"1173", - u"6891", - u"24221", - u"672", - ) print(self._get_column_names(Session, resource_id)) assert self._get_column_names(Session, resource_id) == [ u"_id", @@ -619,6 +606,20 @@ def test_german(self, Session): u"int4", u"tsvector", ] + [u"text"] * (len(records[0]) - 1) + print(records) + assert records[0] == ( + 1, + u"Zürich", + u"68260", + u"65444", + u"62646", + u"6503", + u"28800", + u"1173", + u"6891", + u"24221", + u"672", + ) def test_with_blanks(self, Session): csv_filepath = get_sample_filepath("sample_with_blanks.csv") @@ -699,7 +700,6 @@ def test_reload(self, Session): logger=logger, ) - assert len(self._get_records(Session, resource_id)) == 6 assert self._get_column_names(Session, resource_id) == [ u"_id", u"_full_text", @@ -714,6 +714,7 @@ def test_reload(self, Session): u"text", u"text", ] + assert len(self._get_records(Session, resource_id)) == 6 @pytest.mark.skipif( not p.toolkit.check_ckan_version(min_version="2.7"), @@ -752,7 +753,6 @@ def test_reload_with_overridden_types(self, Session): fields=fields, resource_id=resource_id, logger=logger ) - assert len(self._get_records(Session, resource_id)) == 6 assert self._get_column_names(Session, resource_id) == [ u"_id", u"_full_text", @@ -767,6 +767,7 @@ def test_reload_with_overridden_types(self, Session): u"numeric", u"text", ] + assert len(self._get_records(Session, resource_id)) == 6 # check that rows with nulls are indexed correctly records = self._get_records( @@ -816,6 +817,181 @@ def test_column_names(self, Session): u"Galway", ) + def test_load_with_no_strip_white(self, Session): + csv_filepath = get_sample_filepath("boston_311_sample.csv") + resource = factories.Resource() + resource_id = resource['id'] + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + + # Change strip_extra_white, as it would be done by Data Dictionary + rec = p.toolkit.get_action("datastore_search")( + None, {"resource_id": resource_id, "limit": 0} + ) + fields = [f for f in rec["fields"] if not f["id"].startswith("_")] + for field in fields: + field["info"] = {"strip_extra_white": False} # <=2.10 + field["strip_extra_white"] = False # >=2.11 + p.toolkit.get_action("datastore_create")( + {"ignore_auth": True}, + {"resource_id": resource_id, "force": True, "fields": fields}, + ) + + # Load it again with new strip_extra_white + fields = loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + loader.create_column_indexes( + fields=fields, resource_id=resource_id, logger=logger + ) + + records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", + u"_full_text", + u"CASE_ENQUIRY_ID", + u"open_dt", + u"target_dt", + u"closed_dt", + u"OnTime_Status", + u"CASE_STATUS", + u"CLOSURE_REASON", + u"CASE_TITLE", + u"SUBJECT", + u"REASON", + u"TYPE", + u"QUEUE", + u"Department", + u"SubmittedPhoto", + u"ClosedPhoto", + u"Location", + u"Fire_district", + u"pwd_district", + u"city_council_district", + u"police_district", + u"neighborhood", + u"neighborhood_services_district", + u"ward", + u"precinct", + u"LOCATION_STREET_NAME", + u"LOCATION_ZIPCODE", + u"Latitude", + u"Longitude", + u"Source", + ] # noqa + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", + u"tsvector", + ] + [u"text"] * (len(records[0]) - 1) + print(records) + assert records == [ + ( + 1, # ds auto increment id / primary key + u"101002153891", + u"2017-07-06 23:38:43", + u"2017-07-21 08:30:00", + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Street Light Outages", + u"Public Works Department ", # no strip_extra_white + u"Street Lights", + u"Street Light Outages", + u"PWDx_Street Light Outages", + u"PWDx", + None, + None, + u"480 Harvard St Dorchester MA 02124", + u"8", + u"07", + u"4", + u"B3", + u"Greater Mattapan", + u"9", + u"Ward 14", + u"1411", + u"480 Harvard St", + u"02124", + u"42.288", + u"-71.0927", + u"Citizens Connect App", + ), # noqa + ( + 2, # ds auto increment id / primary key + u"101002153890", + u"2017-07-06 23:29:13", + u"2017-09-11 08:30:00", + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # no strip_extra_white + None, + u"522 Saratoga St East Boston MA 02128", + u"1", + u"09", + u"1", + u"A7", + u"East Boston", + u"1", + u"Ward 1", + u"0110", + u"522 Saratoga St", + u"02128", + u"42.3807", + u"-71.0259", + u"Citizens Connect App", + ), # noqa + ( + 3, # ds auto increment id / primary key + u"101002153889", + u"2017-07-06 23:24:20", + u"2017-09-11 08:30:00", + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # no strip_extra_white + None, + u"965 Bennington St East Boston MA 02128", + u"1", + u"09", + u"1", + u"A7", + u"East Boston", + u"1", + u"Ward 1", + u"0112", + u"965 Bennington St", + u"02128", + u"42.386", + u"-71.008", + u"Citizens Connect App", + ), + ] # noqa + class TestLoadUnhandledTypes(TestLoadBase): def test_kml(self): @@ -919,6 +1095,20 @@ def test_simple(self, Session): # "'-01':4,5 '00':6,7,8 '1':1 '2011':3 'galway':2" # "'-01':2,3 '00':5,6 '1':7 '2011':1 'galway':8 't00':4" + assert self._get_column_names(Session, resource_id) == [ + u"_id", + u"_full_text", + u"date", + u"temperature", + u"place", + ] + assert self._get_column_types(Session, resource_id) == [ + u"int4", + u"tsvector", + u"timestamp", + u"numeric", + u"text", + ] assert self._get_records(Session, resource_id) == [ (1, datetime.datetime(2011, 1, 1, 0, 0), Decimal("1"), u"Galway",), ( @@ -1085,6 +1275,74 @@ def test_boston_311(self, Session): ) records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", # int4 + u"_full_text", # tsvector + u"CASE_ENQUIRY_ID", # numeric + u"open_dt", # timestamp + u"target_dt", # timestamp + u"closed_dt", # text + u"OnTime_Status", # text + u"CASE_STATUS", # text + u"CLOSURE_REASON", # text + u"CASE_TITLE", # text + u"SUBJECT", # text + u"REASON", # text + u"TYPE", # text + u"QUEUE", # text + u"Department", # text + u"SubmittedPhoto", # text + u"ClosedPhoto", # text + u"Location", # text + u"Fire_district", # numeric + u"pwd_district", # numeric + u"city_council_district", # numeric + u"police_district", # text + u"neighborhood", # text + u"neighborhood_services_district", # numeric + u"ward", # text + u"precinct", # numeric + u"LOCATION_STREET_NAME", # text + u"LOCATION_ZIPCODE", # numeric + u"Latitude", # numeric + u"Longitude", # numeric + u"Source", # text + ] # noqa + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", # _id + u"tsvector", # _full_text + u"numeric", # CASE_ENQUIRY_ID + u"timestamp", # open_dt + u"timestamp", # target_dt + u"text", # closed_dt + u"text", # OnTime_Status + u"text", # CASE_STATUS + u"text", # CLOSURE_REASON + u"text", # CASE_TITLE + u"text", # SUBJECT + u"text", # REASON + u"text", # TYPE + u"text", # QUEUE + u"text", # Department + u"text", # SubmittedPhoto + u"text", # ClosedPhoto + u"text", # Location + u"numeric", # Fire_district + u"numeric", # pwd_district + u"numeric", # city_council_district + u"text", # police_district + u"text", # neighborhood + u"numeric", # neighborhood_services_district + u"text", # ward + u"numeric", # precinct + u"text", # LOCATION_STREET_NAME + u"numeric", # LOCATION_ZIPCODE + u"numeric", # Latitude + u"numeric", # Longitude + u"text", # Source + ] # noqa print(records) assert records == [ ( @@ -1092,18 +1350,18 @@ def test_boston_311(self, Session): Decimal("101002153891"), datetime.datetime(2017, 7, 6, 23, 38, 43), datetime.datetime(2017, 7, 21, 8, 30), - u"", + None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Street Light Outages", - u"Public Works Department", + u"Public Works Department", # " " trailing whitespace gets trimmed u"Street Lights", u"Street Light Outages", u"PWDx_Street Light Outages", u"PWDx", - u"", - u"", + None, + None, u"480 Harvard St Dorchester MA 02124", Decimal("8"), Decimal("7"), @@ -1124,18 +1382,18 @@ def test_boston_311(self, Session): Decimal("101002153890"), datetime.datetime(2017, 7, 6, 23, 29, 13), datetime.datetime(2017, 9, 11, 8, 30), - u"", + None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Graffiti Removal", u"Property Management", u"Graffiti", u"Graffiti Removal", u"PROP_GRAF_GraffitiRemoval", u"PROP", - u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", - u"", + u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces + None, u"522 Saratoga St East Boston MA 02128", Decimal("1"), Decimal("9"), @@ -1156,18 +1414,18 @@ def test_boston_311(self, Session): Decimal("101002153889"), datetime.datetime(2017, 7, 6, 23, 24, 20), datetime.datetime(2017, 9, 11, 8, 30), - u"", + None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Graffiti Removal", u"Property Management", u"Graffiti", u"Graffiti Removal", u"PROP_GRAF_GraffitiRemoval", u"PROP", - u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", - u"", + u"https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # strip white spaces + None, u"965 Bennington St East Boston MA 02128", Decimal("1"), Decimal("9"), @@ -1184,74 +1442,6 @@ def test_boston_311(self, Session): u"Citizens Connect App", ), ] # noqa - print(self._get_column_names(Session, resource_id)) - assert self._get_column_names(Session, resource_id) == [ - u"_id", - u"_full_text", - u"CASE_ENQUIRY_ID", - u"open_dt", - u"target_dt", - u"closed_dt", - u"OnTime_Status", - u"CASE_STATUS", - u"CLOSURE_REASON", - u"CASE_TITLE", - u"SUBJECT", - u"REASON", - u"TYPE", - u"QUEUE", - u"Department", - u"SubmittedPhoto", - u"ClosedPhoto", - u"Location", - u"Fire_district", - u"pwd_district", - u"city_council_district", - u"police_district", - u"neighborhood", - u"neighborhood_services_district", - u"ward", - u"precinct", - u"LOCATION_STREET_NAME", - u"LOCATION_ZIPCODE", - u"Latitude", - u"Longitude", - u"Source", - ] # noqa - print(self._get_column_types(Session, resource_id)) - assert self._get_column_types(Session, resource_id) == [ - u"int4", - u"tsvector", - u"numeric", - u"timestamp", - u"timestamp", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"numeric", - u"numeric", - u"numeric", - u"text", - u"text", - u"numeric", - u"text", - u"numeric", - u"text", - u"numeric", - u"numeric", - u"numeric", - u"text", - ] # noqa def test_no_entries(self): csv_filepath = get_sample_filepath("no_entries.csv") @@ -1359,3 +1549,209 @@ def test_preserving_time_ranges(self, Session): (3, "Barcaldine", 4725, Decimal("-23.55327901"), Decimal("145.289156"), "9:00-12:30", "13:30-16:30", datetime.datetime(2018, 7, 20)) ] + + def test_load_with_no_strip_white(self, Session): + csv_filepath = get_sample_filepath("boston_311_sample.csv") + resource = factories.Resource() + resource_id = resource['id'] + loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="csv", + logger=logger, + ) + + # Change strip_extra_white, as it would be done by Data Dictionary + rec = p.toolkit.get_action("datastore_search")( + None, {"resource_id": resource_id, "limit": 0} + ) + fields = [f for f in rec["fields"] if not f["id"].startswith("_")] + for field in fields: + if "info" not in field: + field["info"] = {} + field["info"]["strip_extra_white"] = False # <=2.10 + field["strip_extra_white"] = False # >=2.11 + p.toolkit.get_action("datastore_create")( + {"ignore_auth": True}, + {"resource_id": resource_id, "force": True, "fields": fields}, + ) + + # Load it again with new strip_extra_white + fields = loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="csv", + logger=logger, + ) + loader.create_column_indexes( + fields=fields, resource_id=resource_id, logger=logger + ) + + records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", # int4 + u"_full_text", # tsvector + u"CASE_ENQUIRY_ID", # numeric + u"open_dt", # timestamp + u"target_dt", # timestamp + u"closed_dt", # text + u"OnTime_Status", # text + u"CASE_STATUS", # text + u"CLOSURE_REASON", # text + u"CASE_TITLE", # text + u"SUBJECT", # text + u"REASON", # text + u"TYPE", # text + u"QUEUE", # text + u"Department", # text + u"SubmittedPhoto", # text + u"ClosedPhoto", # text + u"Location", # text + u"Fire_district", # numeric + u"pwd_district", # numeric + u"city_council_district", # numeric + u"police_district", # text + u"neighborhood", # text + u"neighborhood_services_district", # numeric + u"ward", # text + u"precinct", # numeric + u"LOCATION_STREET_NAME", # text + u"LOCATION_ZIPCODE", # numeric + u"Latitude", # numeric + u"Longitude", # numeric + u"Source", # text + ] # noqa + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", # _id + u"tsvector", # _full_text + u"numeric", # CASE_ENQUIRY_ID + u"timestamp", # open_dt + u"timestamp", # target_dt + u"text", # closed_dt + u"text", # OnTime_Status + u"text", # CASE_STATUS + u"text", # CLOSURE_REASON + u"text", # CASE_TITLE + u"text", # SUBJECT + u"text", # REASON + u"text", # TYPE + u"text", # QUEUE + u"text", # Department + u"text", # SubmittedPhoto + u"text", # ClosedPhoto + u"text", # Location + u"numeric", # Fire_district + u"numeric", # pwd_district + u"numeric", # city_council_district + u"text", # police_district + u"text", # neighborhood + u"numeric", # neighborhood_services_district + u"text", # ward + u"numeric", # precinct + u"text", # LOCATION_STREET_NAME + u"numeric", # LOCATION_ZIPCODE + u"numeric", # Latitude + u"numeric", # Longitude + u"text", # Source + ] # noqa + print(records) + assert records == [ + ( + 1, # ds auto increment id / primary key + Decimal("101002153891"), + datetime.datetime(2017, 7, 6, 23, 38, 43), + datetime.datetime(2017, 7, 21, 8, 30), + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Street Light Outages", + u"Public Works Department ", # no strip_extra_white + u"Street Lights", + u"Street Light Outages", + u"PWDx_Street Light Outages", + u"PWDx", + None, + None, + u"480 Harvard St Dorchester MA 02124", + Decimal("8"), + Decimal("7"), + Decimal("4"), + u"B3", + u"Greater Mattapan", + Decimal("9"), + u"Ward 14", + Decimal("1411"), + u"480 Harvard St", + Decimal("2124"), + Decimal("42.288"), + Decimal("-71.0927"), + u"Citizens Connect App", + ), # noqa + ( + 2, # ds auto increment id / primary key + Decimal("101002153890"), + datetime.datetime(2017, 7, 6, 23, 29, 13), + datetime.datetime(2017, 9, 11, 8, 30), + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # no strip_extra_white + None, + u"522 Saratoga St East Boston MA 02128", + Decimal("1"), + Decimal("9"), + Decimal("1"), + u"A7", + u"East Boston", + Decimal("1"), + u"Ward 1", + Decimal("110"), + u"522 Saratoga St", + Decimal("2128"), + Decimal("42.3807"), + Decimal("-71.0259"), + u"Citizens Connect App", + ), # noqa + ( + 3, # ds auto increment id / primary key + Decimal("101002153889"), + datetime.datetime(2017, 7, 6, 23, 24, 20), + datetime.datetime(2017, 9, 11, 8, 30), + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # no strip_extra_white + None, + u"965 Bennington St East Boston MA 02128", + Decimal("1"), + Decimal("9"), + Decimal("1"), + u"A7", + u"East Boston", + Decimal("1"), + u"Ward 1", + Decimal("112"), + u"965 Bennington St", + Decimal("2128"), + Decimal("42.386"), + Decimal("-71.008"), + u"Citizens Connect App", + ), + ] # noqa diff --git a/requirements.txt b/requirements.txt index 484f4d2e..e5fe8b9a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ckantoolkit +ckantoolkit>=0.0.4 requests>=2.31.0 six>=1.12.0 tabulator==1.53.5 From 5cb342557a06301f49793696bcb32681d8198794 Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Mon, 9 Dec 2024 23:05:17 +0000 Subject: [PATCH 10/41] squash(pr): 223; - Squash of https://github.com/ckan/ckanext-xloader/pull/223 --- CHANGELOG | 3 +++ ckanext/xloader/jobs.py | 14 ++++++++++++++ ckanext/xloader/tests/test_jobs.py | 26 +++++++++++++++++++++++++- test.ini | 6 +++--- 4 files changed, 45 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index e804e0ad..7e6a9c42 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,9 @@ Feat: * Adds support for ckanext-validation. Config `ckanext.xloader.validation.requires_successful_report` controls whether a resource requires a successful validation report to be XLoadered. By default, a resource would also require a Validation Schema, which can be turned off with `ckanext.xloader.validation.enforce_schema`. +Fix: +* Properly handle REDIS queue timeouts to close/delete any temporary files. + **Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.1.2...1.1.3 diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 85c51936..aa23fb86 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -15,6 +15,7 @@ from six.moves.urllib.parse import urlsplit import requests from rq import get_current_job +from rq.timeouts import JobTimeoutException import sqlalchemy as sa from ckan import model @@ -259,6 +260,13 @@ def tabulator_load(): logger.warning('Load using COPY failed: %s', e) logger.info('Trying again with tabulator') tabulator_load() + except JobTimeoutException as e: + try: + tmp_file.close() + except FileNotFoundError: + pass + logger.warning('Job timed out after %ss', RETRIED_JOB_TIMEOUT) + raise JobError('Job timed out after {}s'.format(RETRIED_JOB_TIMEOUT)) except FileCouldNotBeLoadedError as e: logger.warning('Loading excerpt for this format not supported.') logger.error('Loading file raised an error: %s', e) @@ -357,6 +365,7 @@ def _download_resource_data(resource, data, api_key, logger): response.close() data['datastore_contains_all_records_of_source_file'] = False except requests.exceptions.HTTPError as error: + tmp_file.close() # status code error logger.debug('HTTP error: %s', error) raise HTTPError( @@ -368,6 +377,7 @@ def _download_resource_data(resource, data, api_key, logger): raise JobError('Connection timed out after {}s'.format( DOWNLOAD_TIMEOUT)) except requests.exceptions.RequestException as e: + tmp_file.close() try: err_message = str(e.reason) except AttributeError: @@ -376,6 +386,10 @@ def _download_resource_data(resource, data, api_key, logger): raise HTTPError( message=err_message, status_code=None, request_url=url, response=None) + except JobTimeoutException as e: + tmp_file.close() + logger.warning('Job timed out after %ss', RETRIED_JOB_TIMEOUT) + raise JobError('Job timed out after {}s'.format(RETRIED_JOB_TIMEOUT)) logger.info('Downloaded ok - %s', printable_file_size(length)) file_hash = m.hexdigest() diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py index e819dad9..4c15fb54 100644 --- a/ckanext/xloader/tests/test_jobs.py +++ b/ckanext/xloader/tests/test_jobs.py @@ -1,5 +1,6 @@ import pytest import io +import os from datetime import datetime @@ -16,6 +17,7 @@ _TEST_FILE_CONTENT = "x, y\n1,2\n2,4\n3,6\n4,8\n5,10" +_TEST_LARGE_FILE_CONTENT = "\n1,2\n2,4\n3,6\n4,8\n5,10" def get_response(download_url, headers): @@ -25,7 +27,6 @@ def get_response(download_url, headers): resp.headers = headers return resp - def get_large_response(download_url, headers): """Mock jobs.get_response() method to fake a large file.""" resp = Response() @@ -33,6 +34,17 @@ def get_large_response(download_url, headers): resp.headers = {'content-length': 2000000000} return resp +def get_large_data_response(download_url, headers): + """Mock jobs.get_response() method.""" + resp = Response() + f_content = _TEST_FILE_CONTENT + (_TEST_LARGE_FILE_CONTENT * 500000) + resp.raw = io.BytesIO(f_content.encode()) + resp.headers = headers + return resp + +def _get_temp_files(dir='/tmp'): + return [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))] + @pytest.fixture def apikey(): @@ -74,6 +86,8 @@ def data(create_with_upload, apikey): @pytest.mark.usefixtures("clean_db", "with_plugins") +@pytest.mark.ckan_config("ckanext.xloader.job_timeout", 2) +@pytest.mark.ckan_config("ckan.jobs.timeout", 2) class TestXLoaderJobs(helpers.FunctionalRQTestBase): def test_xloader_data_into_datastore(self, cli, data): @@ -123,6 +137,16 @@ def test_data_max_excerpt_lines_config(self, cli, data): resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) assert resource["datastore_contains_all_records_of_source_file"] is False + def test_data_with_rq_job_timeout(self, cli, data): + file_suffix = 'multiplication_2.csv' + self.enqueue(jobs.xloader_data_into_datastore, [data], rq_kwargs=dict(timeout=2)) + with mock.patch("ckanext.xloader.jobs.get_response", get_large_data_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Job timed out after" in stdout + for f in _get_temp_files(): + # make sure that the tmp file has been closed/deleted in job timeout exception handling + assert file_suffix not in f + @pytest.mark.usefixtures("clean_db") class TestSetResourceMetadata(object): diff --git a/test.ini b/test.ini index 7bfab684..c02827bd 100644 --- a/test.ini +++ b/test.ini @@ -34,13 +34,13 @@ handlers = console [logger_ckan] qualname = ckan -handlers = +handlers = console level = INFO [logger_ckanext_xloader] qualname = ckanext.xloader -handlers = -level = DEBUG +handlers = console +level = WARN [logger_sqlalchemy] handlers = From 856be9374db5f1ea7f95a47808078fca4d11f66f Mon Sep 17 00:00:00 2001 From: Sergey Motornyuk Date: Tue, 10 Sep 2024 20:03:04 +0300 Subject: [PATCH 11/41] feat: SQLAlchemy v2 support --- ckanext/xloader/db.py | 31 +++++++----- ckanext/xloader/loader.py | 73 +++++++++++++++------------- ckanext/xloader/tests/test_loader.py | 22 ++++----- 3 files changed, 68 insertions(+), 58 deletions(-) diff --git a/ckanext/xloader/db.py b/ckanext/xloader/db.py index a93eb0d8..6bb25d94 100644 --- a/ckanext/xloader/db.py +++ b/ckanext/xloader/db.py @@ -39,8 +39,8 @@ def init(config, echo=False): global ENGINE, _METADATA, JOBS_TABLE, METADATA_TABLE, LOGS_TABLE db_uri = config.get('ckanext.xloader.jobs_db.uri', 'sqlite:////tmp/xloader_jobs.db') - ENGINE = sqlalchemy.create_engine(db_uri, echo=echo, convert_unicode=True) - _METADATA = sqlalchemy.MetaData(ENGINE) + ENGINE = sqlalchemy.create_engine(db_uri, echo=echo) + _METADATA = sqlalchemy.MetaData() JOBS_TABLE = _init_jobs_table() METADATA_TABLE = _init_metadata_table() LOGS_TABLE = _init_logs_table() @@ -111,8 +111,10 @@ def get_job(job_id): if job_id: job_id = six.text_type(job_id) - result = ENGINE.execute( - JOBS_TABLE.select().where(JOBS_TABLE.c.job_id == job_id)).first() + with ENGINE.connect() as conn: + result = conn.execute( + JOBS_TABLE.select().where(JOBS_TABLE.c.job_id == job_id) + ).first() if not result: return None @@ -298,10 +300,11 @@ def _update_job(job_id, job_dict): if "data" in job_dict: job_dict["data"] = six.text_type(job_dict["data"]) - ENGINE.execute( - JOBS_TABLE.update() - .where(JOBS_TABLE.c.job_id == job_id) - .values(**job_dict)) + with ENGINE.begin() as conn: + conn.execute( + JOBS_TABLE.update() + .where(JOBS_TABLE.c.job_id == job_id) + .values(**job_dict)) def mark_job_as_completed(job_id, data=None): @@ -443,9 +446,10 @@ def _get_metadata(job_id): # warnings. job_id = six.text_type(job_id) - results = ENGINE.execute( - METADATA_TABLE.select().where( - METADATA_TABLE.c.job_id == job_id)).fetchall() + with ENGINE.connect() as conn: + results = conn.execute( + METADATA_TABLE.select().where( + METADATA_TABLE.c.job_id == job_id)).fetchall() metadata = {} for row in results: value = row['value'] @@ -461,8 +465,9 @@ def _get_logs(job_id): # warnings. job_id = six.text_type(job_id) - results = ENGINE.execute( - LOGS_TABLE.select().where(LOGS_TABLE.c.job_id == job_id)).fetchall() + with ENGINE.connect() as conn: + results = conn.execute( + LOGS_TABLE.select().where(LOGS_TABLE.c.job_id == job_id)).fetchall() results = [dict(result) for result in results] diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index f4aa7af8..bb9dda48 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -14,6 +14,7 @@ from six.moves import zip from tabulator import config as tabulator_config, EncodingError, Stream, TabulatorException from unidecode import unidecode +import sqlalchemy as sa import ckan.plugins as p @@ -118,8 +119,8 @@ def _clear_datastore_resource(resource_id): ''' engine = get_write_engine() with engine.begin() as conn: - conn.execute("SET LOCAL lock_timeout = '15s'") - conn.execute('TRUNCATE TABLE "{}" RESTART IDENTITY'.format(resource_id)) + conn.execute(sa.text("SET LOCAL lock_timeout = '15s'")) + conn.execute(sa.text('TRUNCATE TABLE "{}" RESTART IDENTITY'.format(resource_id))) def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): @@ -282,12 +283,17 @@ def strip_white_space_iter(): except Exception as e: raise LoaderError('Could not create the database table: {}' .format(e)) - connection = context['connection'] = engine.connect() + # datstore_active is switched on by datastore_create - TODO temporarily # disable it until the load is complete - _disable_fulltext_trigger(connection, resource_id) - _drop_indexes(context, data_dict, False) + + with engine.begin() as conn: + _disable_fulltext_trigger(conn, resource_id) + + with engine.begin() as conn: + context['connection'] = conn + _drop_indexes(context, data_dict, False) logger.info('Copying to database...') @@ -305,9 +311,8 @@ def strip_white_space_iter(): # 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids # the superuser issue. <-- picked - raw_connection = engine.raw_connection() - try: - cur = raw_connection.cursor() + with engine.begin() as conn: + cur = conn.connection.cursor() try: with open(csv_filepath, 'rb') as f: # can't use :param for table name because params are only @@ -337,15 +342,14 @@ def strip_white_space_iter(): finally: cur.close() - finally: - raw_connection.commit() finally: os.remove(csv_filepath) # i.e. the tempfile logger.info('...copying done') logger.info('Creating search index...') - _populate_fulltext(connection, resource_id, fields=fields) + with engine.begin() as conn: + _populate_fulltext(conn, resource_id, fields=fields) logger.info('...search index created') return fields @@ -631,9 +635,9 @@ def fulltext_function_exists(connection): https://github.com/ckan/ckan/pull/3786 or otherwise it is checked on startup of this plugin. ''' - res = connection.execute(''' + res = connection.execute(sa.text(''' select * from pg_proc where proname = 'populate_full_text_trigger'; - ''') + ''')) return bool(res.rowcount) @@ -642,24 +646,25 @@ def fulltext_trigger_exists(connection, resource_id): This will only be the case if your CKAN is new enough to have: https://github.com/ckan/ckan/pull/3786 ''' - res = connection.execute(''' + res = connection.execute(sa.text(''' SELECT pg_trigger.tgname FROM pg_class JOIN pg_trigger ON pg_class.oid=pg_trigger.tgrelid WHERE pg_class.relname={table} AND pg_trigger.tgname='zfulltext'; '''.format( - table=literal_string(resource_id))) + table=literal_string(resource_id)))) return bool(res.rowcount) def _disable_fulltext_trigger(connection, resource_id): - connection.execute('ALTER TABLE {table} DISABLE TRIGGER zfulltext;' - .format(table=identifier(resource_id))) + connection.execute(sa.text('ALTER TABLE {table} DISABLE TRIGGER zfulltext;' + .format(table=identifier(resource_id, True)))) def _enable_fulltext_trigger(connection, resource_id): - connection.execute('ALTER TABLE {table} ENABLE TRIGGER zfulltext;' - .format(table=identifier(resource_id))) + connection.execute(sa.text( + 'ALTER TABLE {table} ENABLE TRIGGER zfulltext;' + .format(table=identifier(resource_id, True)))) def _populate_fulltext(connection, resource_id, fields): @@ -672,14 +677,9 @@ def _populate_fulltext(connection, resource_id, fields): fields: list of dicts giving the each column's 'id' (name) and 'type' (text/numeric/timestamp) ''' - sql = \ - u''' - UPDATE {table} - SET _full_text = to_tsvector({cols}); - '''.format( - # coalesce copes with blank cells - table=identifier(resource_id), - cols=" || ' ' || ".join( + stmt = sa.update(sa.table(resource_id, sa.column("_full_text"))).values( + _full_text=sa.text("to_tsvector({})".format( + " || ' ' || ".join( 'coalesce({}, \'\')'.format( identifier(field['id']) + ('::text' if field['type'] != 'text' else '') @@ -687,8 +687,10 @@ def _populate_fulltext(connection, resource_id, fields): for field in fields if not field['id'].startswith('_') ) - ) - connection.execute(sql) + )) + ) + + connection.execute(stmt) def calculate_record_count(resource_id, logger): @@ -700,15 +702,18 @@ def calculate_record_count(resource_id, logger): logger.info('Calculating record count (running ANALYZE on the table)') engine = get_write_engine() conn = engine.connect() - conn.execute("ANALYZE \"{resource_id}\";" - .format(resource_id=resource_id)) + conn.execute(sa.text("ANALYZE \"{resource_id}\";" + .format(resource_id=resource_id))) -def identifier(s): +def identifier(s, escape_binds=False): # "%" needs to be escaped, otherwise connection.execute thinks it is for # substituting a bind parameter - return u'"' + s.replace(u'"', u'""').replace(u'\0', '').replace('%', '%%')\ - + u'"' + escaped = s.replace(u'"', u'""').replace(u'\0', '') + if escape_binds: + escaped = escaped.replace('%', '%%') + + return u'"' + escaped + u'"' def literal_string(s): diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 4cc8d86a..7ff73446 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -4,6 +4,7 @@ import os import pytest import six +import sqlalchemy as sa import sqlalchemy.orm as orm import datetime import logging @@ -47,17 +48,16 @@ def _get_records( c = Session.connection() if exclude_full_text_column: cols = self._get_column_names(Session, table_name) - cols = ", ".join( - loader.identifier(col) for col in cols if col != "_full_text" - ) + cols = [ + sa.column(col) for col in cols if col != "_full_text" + ] else: - cols = "*" - sql = 'SELECT {cols} FROM "{table_name}"'.format( - cols=cols, table_name=table_name - ) + cols = [sa.text("*")] + stmt = sa.select(*cols).select_from(sa.table(table_name)) + if limit is not None: - sql += " LIMIT {}".format(limit) - results = c.execute(sql) + stmt = stmt.limit(limit) + results = c.execute(stmt) return results.fetchall() def _get_column_names(self, Session, table_name): @@ -71,7 +71,7 @@ def _get_column_names(self, Session, table_name): ORDER BY ordinal_position; """.format(table_name) ) - results = c.execute(sql) + results = c.execute(sa.text(sql)) records = results.fetchall() return [r[0] for r in records] @@ -85,7 +85,7 @@ def _get_column_types(self, Session, table_name): ORDER BY ordinal_position; """.format(table_name) ) - results = c.execute(sql) + results = c.execute(sa.text(sql)) records = results.fetchall() return [r[0] for r in records] From aa26129192efec7c726d90c9d82c357428f7b32c Mon Sep 17 00:00:00 2001 From: William Dutton Date: Tue, 15 Oct 2024 14:53:17 +1000 Subject: [PATCH 12/41] docs: SQLAlchemy v2 support, update removal of 2.9 from support BREAKING CHANGE: SQLAlchemy v2 is not supported in CKAN 2.9.x. CKAN 2.9.x support dropped --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 338102fa..c9b36383 100644 --- a/README.md +++ b/README.md @@ -112,13 +112,13 @@ type -offers to contribute this are welcomed. Compatibility with core CKAN versions: - | CKAN version | Compatibility | - | -------------- | -------------------------------------------------------------------------------------------------------------- | - | 2.7 | no longer supported (last supported version: 0.12.2) | - | 2.8 | no longer supported (last supported version: 0.12.2) | - | 2.9 | yes (Python3) (last supported version for Python 2.7: 0.12.2)), Must: `pip install "setuptools>=44.1.0,<71"` | - | 2.10 | yes | - | 2.11 | yes | + | CKAN version | Compatibility | + | -------------- |-------------------------------------------------------| + | 2.7 | no longer supported (last supported version: 0.12.2) | + | 2.8 | no longer supported (last supported version: 0.12.2) | + | 2.9 | no longer supported (last supported version: 1.2.x) | + | 2.10 | yes | + | 2.11 | yes | ## Installation From 873a7f6347b6db9a99b8e61bfbcf95bc47493477 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Thu, 21 Nov 2024 14:47:39 +1000 Subject: [PATCH 13/41] chore: drop 2.9 ckan --- .github/workflows/publish.yml | 2 -- .github/workflows/test.yml | 3 --- 2 files changed, 5 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 20bceb8c..7b6c16c3 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -53,8 +53,6 @@ jobs: ckan-image: "2.11-py3.10" - ckan-version: "2.10" ckan-image: "2.10-py3.10" - - ckan-version: "2.9" - ckan-image: "2.9-py3.9" #- ckan-version: "master" Publish does not care about master # ckan-image: "master" fail-fast: false diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 53ec0c5f..487e5838 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,9 +30,6 @@ jobs: - ckan-version: "2.10" ckan-image: "2.10-py3.10" experimental: false - - ckan-version: "2.9" - ckan-image: "2.9-py3.9" - experimental: false - ckan-version: "master" ckan-image: "master" experimental: true # master is unstable, good to know if we are compatible or not From df0debf9d1047fc90c8b7d448b291a92863fa5c8 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Tue, 10 Dec 2024 10:54:00 +1000 Subject: [PATCH 14/41] chore: prepare v2.0.0 with changelog updates --- CHANGELOG | 59 ++++++++++++++++++++++++++++++++++++++------------ pyproject.toml | 2 +- 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 82329e8b..faa8c2b2 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,27 +1,58 @@ -1.1.3 2024-12-09 +CHANGELOG +========= +See: https://github.com/ckan/ckanext-xloader/releases if this file has drifted. + +2.0.0 2024-12-10 ================ -Feat: -* Adds Strip White Space fields to the Data Dictionary (defualts to `True` for each field). This will strip surrounding white space from data values prior to inserting them into the database. -* Adds support for ckanext-validation. Config `ckanext.xloader.validation.requires_successful_report` controls whether a resource requires a successful validation report to be XLoadered. By default, a resource would also require a Validation Schema, which can be turned off with `ckanext.xloader.validation.enforce_schema`. +## Major +Dropped CKAN 2.9.x and Python2. -Fix: -* Properly handle REDIS queue timeouts to close/delete any temporary files. +## Feat: +* Adds Strip White Space fields to the Data Dictionary (defualts to `True` for each field). + This will strip surrounding white space from data values prior to inserting them into the database. +* Adds support for ckanext-validation. Config `ckanext.xloader.validation.requires_successful_report` + controls whether a resource requires a successful validation report to be XLoadered. + By default, a resource would also require a Validation Schema, which can be turned off with + `ckanext.xloader.validation.enforce_schema`. +* Frontend Status Badges by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/224 -**Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.1.2...1.1.3 -1.1.2 2024-10-25 -================ +## Fix: +* Properly handle REDIS queue timeouts to close/delete any temporary files. +* Fix automated PyPI publishing by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/231 -Feat: -* Frontend Status Badges by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/224 +## What's Changed +* Update README, migrate it to Markdown by @amercader in https://github.com/ckan/ckanext-xloader/pull/235 +* chore: switch to pyproject.toml by @duttonw in https://github.com/ckan/ckanext-xloader/pull/236 +* Validation Extension Support (Squashed) by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/237 +* Strip White Space from Cell Values (Squashed) by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/238 +* RQ Job Timeout Handling (Squashed) by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/239 +* SQLAlchemy v2 support by @smotornyuk in https://github.com/ckan/ckanext-xloader/pull/225 -Fix: -* Fix automated PyPI publishing by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/231 +**Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.2.0...2.0.0 + +1.2.0 2024-11-21 +================ +## What's Changed +* Fix PyPI publishing by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/233 +* Enhancement/Bugfix: Downstream qld-gov-au fix's by @duttonw in https://github.com/ckan/ckanext-xloader/pull/232 + * feat: @JVickery-TBS work on validation integration (await successful validation prior to doing datastore work via 'IPipeValidation' + * fix: handle gracefully if tabulator load fails by trying 'direct load' + * fix: Excel blank header row bug + * fix: Datastore truncate, restart identity so numbering restarts from 0 again (when imported data has same columns and types + * fix: parital fix on DB deadlock by adding timeouts on DDL events + * test: test_simple_large_file, test_with_blanks, test_with_empty_lines, test_with_extra_blank_cells + * test: test_require_validation, test_enforce_validation_schema + * chore: min version requirements for cve's, + * requests>=2.32.0 + * urllib3>=2.2.2 + * zipp>=3.19.1 + +**Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.1.2...1.2.0 -**Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.1.1...1.1.2 1.1.1 2024-10-16 ================ diff --git a/pyproject.toml b/pyproject.toml index 858db4ee..ce8f9eec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "ckanext-xloader" -version = "1.2.0" +version = "2.0.0" description = "Express Loader - quickly load data into CKAN DataStore" classifiers = [ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", From bc23a20d752875b80602a4528a308f9dfe00279e Mon Sep 17 00:00:00 2001 From: William Dutton Date: Tue, 10 Dec 2024 11:16:36 +1000 Subject: [PATCH 15/41] publish via pyproject.toml instead of setup.py for version details --- .github/workflows/publish.yml | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 7b6c16c3..26225538 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -31,16 +31,40 @@ on: jobs: + + validateVersion: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Validate tag version + if: ${{ startsWith(github.ref, 'refs/tags') }} + run: | + PYTHON_VERSION=$(grep -E '\bversion\s?=\s?"[^"]+"' pyproject.toml | awk -F '"' '{print $2}') + echo "Tag version is [${{ steps.version.outputs.version }}], Python version is [$PYTHON_VERSION]" + if [ "${{ steps.version.outputs.version }}" != "$PYTHON_VERSION" ]; then + echo "Version mismatch; tag version is [${{ steps.version.outputs.version }}] but Python version is [$PYTHON_VERSION]" >> $GITHUB_STEP_SUMMARY + exit 1 + fi + lint: + needs: validateVersion if: github.repository == 'ckan/ckanext-xloader' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: '3.10' + - name: Install requirements run: pip install flake8 pycodestyle + - name: Check syntax run: flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics --extend-exclude ckan @@ -148,16 +172,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Validate tag version - if: ${{ startsWith(github.ref, 'refs/tags') }} - run: | - PYTHON_VERSION=$(grep -E "\bversion='[^']+'" setup.py | awk -F "'" '{print $2}') - echo "Tag version is [${{ steps.version.outputs.version }}], Python version is [$PYTHON_VERSION]" - if [ "${{ steps.version.outputs.version }}" != "$PYTHON_VERSION" ]; then - echo "Version mismatch; tag version is [${{ steps.version.outputs.version }}] but Python version is [$PYTHON_VERSION]" >> $GITHUB_STEP_SUMMARY - exit 1 - fi - - name: Build package ${{ steps.version.outputs.reponame }} @ ${{ steps.version.outputs.version }} run: | pip install build From 8d8cf3432ae8e609e14bd38e9ccf4d49d49cad32 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Tue, 10 Dec 2024 11:22:56 +1000 Subject: [PATCH 16/41] fix: publish via pyproject.toml instead of setup.py for version details --- .github/workflows/publish.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 26225538..df99051b 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -44,10 +44,11 @@ jobs: - name: Validate tag version if: ${{ startsWith(github.ref, 'refs/tags') }} run: | + TAG_VALUE=${GITHUB_REF/refs\/tags\//} PYTHON_VERSION=$(grep -E '\bversion\s?=\s?"[^"]+"' pyproject.toml | awk -F '"' '{print $2}') echo "Tag version is [${{ steps.version.outputs.version }}], Python version is [$PYTHON_VERSION]" - if [ "${{ steps.version.outputs.version }}" != "$PYTHON_VERSION" ]; then - echo "Version mismatch; tag version is [${{ steps.version.outputs.version }}] but Python version is [$PYTHON_VERSION]" >> $GITHUB_STEP_SUMMARY + if [ "$TAG_VALUE" != "$PYTHON_VERSION" ]; then + echo "Version mismatch; tag version is [$TAG_VALUE] but Python version is [$PYTHON_VERSION]" >> $GITHUB_STEP_SUMMARY exit 1 fi From dbc7a3c870a0a7c1c115dcfdde0c4a2a89867565 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Tue, 10 Dec 2024 11:23:59 +1000 Subject: [PATCH 17/41] fix: publish via pyproject.toml instead of setup.py for version details --- .github/workflows/publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index df99051b..be0f5a20 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -46,7 +46,7 @@ jobs: run: | TAG_VALUE=${GITHUB_REF/refs\/tags\//} PYTHON_VERSION=$(grep -E '\bversion\s?=\s?"[^"]+"' pyproject.toml | awk -F '"' '{print $2}') - echo "Tag version is [${{ steps.version.outputs.version }}], Python version is [$PYTHON_VERSION]" + echo "Tag version is [$TAG_VALUE], Python version is [$PYTHON_VERSION]" if [ "$TAG_VALUE" != "$PYTHON_VERSION" ]; then echo "Version mismatch; tag version is [$TAG_VALUE] but Python version is [$PYTHON_VERSION]" >> $GITHUB_STEP_SUMMARY exit 1 From c5a0a2f914664f58ebd61c42808a190d6c8a1102 Mon Sep 17 00:00:00 2001 From: Ian Ward Date: Mon, 27 Jan 2025 16:57:24 -0500 Subject: [PATCH 18/41] Avoid loading R/W datastore resources Fix for compatibility with CKAN Table Designer feature --- ckanext/xloader/jobs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index aa23fb86..aac2413a 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -19,7 +19,7 @@ import sqlalchemy as sa from ckan import model -from ckan.plugins.toolkit import get_action, asbool, enqueue_job, ObjectNotFound, config +from ckan.plugins.toolkit import get_action, asbool, enqueue_job, ObjectNotFound, config, h from . import db, loader from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError @@ -178,8 +178,8 @@ def xloader_data_into_datastore_(input, job_dict, logger): logger.info('Express Load starting: %s', resource_ckan_url) # check if the resource url_type is a datastore - if resource.get('url_type') == 'datastore': - logger.info('Ignoring resource - url_type=datastore - dump files are ' + if resource.get('url_type') in h.datastore_rw_resource_url_types(): + logger.info('Ignoring resource - R/W DataStore resources are ' 'managed with the Datastore API') return From 8eda56d4d64db888d5eb3aa7e3a0337dc374f6bb Mon Sep 17 00:00:00 2001 From: Ian Ward Date: Mon, 27 Jan 2025 21:16:42 -0500 Subject: [PATCH 19/41] compatibility with ckan 2.10 and older from @duttonw --- ckanext/xloader/jobs.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index aac2413a..321ec238 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -178,7 +178,13 @@ def xloader_data_into_datastore_(input, job_dict, logger): logger.info('Express Load starting: %s', resource_ckan_url) # check if the resource url_type is a datastore - if resource.get('url_type') in h.datastore_rw_resource_url_types(): + if hasattr(h, "datastore_rw_resource_url_types"): + datastore_rw_resource_url_types = h.datastore_rw_resource_url_types() + else: + #fallback for 2.10.x or older. + datastore_rw_resource_url_types = ['datastore'] + + if resource.get('url_type') in datastore_rw_resource_url_types: logger.info('Ignoring resource - R/W DataStore resources are ' 'managed with the Datastore API') return From 954ec7513cbc98d1101d60d7ab77097a4a7834b4 Mon Sep 17 00:00:00 2001 From: Paul Walsh Date: Mon, 18 Nov 2024 10:23:58 +0200 Subject: [PATCH 20/41] Proof of concept for xloader site url --- ckanext/xloader/action.py | 23 ++++++++++++----------- ckanext/xloader/command.py | 11 +++++------ ckanext/xloader/config_declaration.yaml | 8 ++++++-- ckanext/xloader/plugin.py | 12 +++++------- ckanext/xloader/tests/test_jobs.py | 23 ++++++++++++----------- 5 files changed, 40 insertions(+), 37 deletions(-) diff --git a/ckanext/xloader/action.py b/ckanext/xloader/action.py index 0ce1aaed..4810f51c 100644 --- a/ckanext/xloader/action.py +++ b/ckanext/xloader/action.py @@ -147,17 +147,18 @@ def xloader_submit(context, data_dict): qualified=True ) data = { - 'api_key': utils.get_xloader_user_apitoken(), - 'job_type': 'xloader_to_datastore', - 'result_url': callback_url, - 'metadata': { - 'ignore_hash': data_dict.get('ignore_hash', False), - 'ckan_url': config['ckan.site_url'], - 'resource_id': res_id, - 'set_url_type': data_dict.get('set_url_type', False), - 'task_created': task['last_updated'], - 'original_url': resource_dict.get('url'), - } + "api_key": utils.get_xloader_user_apitoken(), + "job_type": "xloader_to_datastore", + "result_url": callback_url, + "metadata": { + "ignore_hash": data_dict.get("ignore_hash", False), + "ckan_url": config.get("ckanext.xloader.site_url") + or config["ckan.site_url"], + "resource_id": res_id, + "set_url_type": data_dict.get("set_url_type", False), + "task_created": task["last_updated"], + "original_url": resource_dict.get("url"), + }, } if custom_queue != rq_jobs.DEFAULT_QUEUE_NAME: # Don't automatically retry if it's a custom run diff --git a/ckanext/xloader/command.py b/ckanext/xloader/command.py index 4c0f2d2f..91a39acf 100644 --- a/ckanext/xloader/command.py +++ b/ckanext/xloader/command.py @@ -114,12 +114,11 @@ def _submit_resource(self, resource, user, indent=0, sync=False, queue=None): 'ignore_hash': True, } if sync: - data_dict['ckan_url'] = tk.config.get('ckan.site_url') - input_dict = { - 'metadata': data_dict, - 'api_key': 'TODO' - } - logger = logging.getLogger('ckanext.xloader.cli') + data_dict["ckan_url"] = tk.config.get( + "ckanext.xloader.site_url" + ) or tk.config.get("ckan.site_url") + input_dict = {"metadata": data_dict, "api_key": "TODO"} + logger = logging.getLogger("ckanext.xloader.cli") xloader_data_into_datastore_(input_dict, None, logger) else: if queue: diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index 6d6bf21b..42489889 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -2,6 +2,12 @@ version: 1 groups: - annotation: ckanext-xloader settings options: + - key: ckanext.xloader.site_url + default: + description: | + Provide an alternate site URL for the xloader_submit action. + This is useful, for example, when the site is running within a docker network. + required: false - key: ckanext.xloader.jobs_db.uri default: sqlite:////tmp/xloader_jobs.db description: | @@ -171,5 +177,3 @@ groups: they will also display "complete", "active", "inactive", and "unknown". type: bool required: false - - diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index 5c0084d9..9d5ca9a0 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -76,13 +76,11 @@ def configure(self, config_): else: self.ignore_hash = False - for config_option in ("ckan.site_url",): - if not config_.get(config_option): - raise Exception( - "Config option `{0}` must be set to use ckanext-xloader.".format( - config_option - ) - ) + site_url_configs = ("ckan.site_url", "ckanext.xloader.site_url") + if not any(site_url_configs): + raise Exception( + f"One of config options {site_url_configs} must be set to use ckanext-xloader." + ) # IPipeValidation diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py index c67f4fe6..8ad86a7e 100644 --- a/ckanext/xloader/tests/test_jobs.py +++ b/ckanext/xloader/tests/test_jobs.py @@ -71,17 +71,18 @@ def data(create_with_upload, apikey): "api.action", ver=3, logic_function="xloader_hook", qualified=True ) return { - 'api_key': apikey, - 'job_type': 'xloader_to_datastore', - 'result_url': callback_url, - 'metadata': { - 'ignore_hash': True, - 'ckan_url': toolkit.config.get('ckan.site_url'), - 'resource_id': resource["id"], - 'set_url_type': False, - 'task_created': datetime.utcnow().isoformat(), - 'original_url': resource["url"], - } + "api_key": apikey, + "job_type": "xloader_to_datastore", + "result_url": callback_url, + "metadata": { + "ignore_hash": True, + "ckan_url": toolkit.config.get("ckanext.xloader.site_url") + or toolkit.config.get("ckan.site_url"), + "resource_id": resource["id"], + "set_url_type": False, + "task_created": datetime.utcnow().isoformat(), + "original_url": resource["url"], + }, } From 73779ddd850ee62a91fc89f6329be1cb09ffafd4 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 14 Jan 2025 10:35:13 +0100 Subject: [PATCH 21/41] Updates to config_declaration.yaml, jobs.py --- ckanext/xloader/config_declaration.yaml | 4 +++- ckanext/xloader/jobs.py | 22 +++++++++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index 42489889..63235e22 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -3,10 +3,12 @@ groups: - annotation: ckanext-xloader settings options: - key: ckanext.xloader.site_url - default: + example: http://ckan-dev:5000 + default: http://ckan-dev:5000 description: | Provide an alternate site URL for the xloader_submit action. This is useful, for example, when the site is running within a docker network. + validators: configured_default("ckan.site_url",None) required: false - key: ckanext.xloader.jobs_db.uri default: sqlite:////tmp/xloader_jobs.db diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 321ec238..0b959d9a 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -12,12 +12,14 @@ import sys from psycopg2 import errors -from six.moves.urllib.parse import urlsplit +from six.moves.urllib.parse import urlsplit, urlparse, urlunparse import requests from rq import get_current_job from rq.timeouts import JobTimeoutException import sqlalchemy as sa +from urllib.parse import urljoin, urlunsplit + from ckan import model from ckan.plugins.toolkit import get_action, asbool, enqueue_job, ObjectNotFound, config, h @@ -80,9 +82,15 @@ def xloader_data_into_datastore(input): # First flag that this task is running, to indicate the job is not # stillborn, for when xloader_submit is deciding whether another job would # be a duplicate or not + + callback_url = config.get('ckanext.xloader.site_url') or config.get('ckan.site_url') + callback_url = urljoin( + callback_url.rstrip('/'), '/api/3/action/xloader_hook') + result_url = callback_url + job_dict = dict(metadata=input['metadata'], status='running') - callback_xloader_hook(result_url=input['result_url'], + callback_xloader_hook(result_url=result_url, api_key=input['api_key'], job_dict=job_dict) @@ -145,7 +153,7 @@ def xloader_data_into_datastore(input): errored = True finally: # job_dict is defined in xloader_hook's docstring - is_saved_ok = callback_xloader_hook(result_url=input['result_url'], + is_saved_ok = callback_xloader_hook(result_url=result_url, api_key=input['api_key'], job_dict=job_dict) errored = errored or not is_saved_ok @@ -305,6 +313,14 @@ def _download_resource_data(resource, data, api_key, logger): 'Only http, https, and ftp resources may be fetched.' ) + resource_uri = urlunsplit(('', '', url_parts.path, url_parts.query, url_parts.fragment)) + callback_url = config.get('ckanext.xloader.site_url') or config.get('ckan.site_url') + callback_url = urljoin( + callback_url.rstrip('/'), resource_uri) + + url = callback_url + url_parts = urlsplit(url) # reparse the url after the callback_url is set + # fetch the resource data logger.info('Fetching from: {0}'.format(url)) tmp_file = get_tmp_file(url) From 32884b981cfb465fa00e09df853eff75296fdbba Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 15 Jan 2025 11:07:25 +0100 Subject: [PATCH 22/41] Changes for config_declaration.yaml and jobs.py as per PR --- ckanext/xloader/config_declaration.yaml | 2 +- ckanext/xloader/jobs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index 63235e22..c304c949 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -4,7 +4,7 @@ groups: options: - key: ckanext.xloader.site_url example: http://ckan-dev:5000 - default: http://ckan-dev:5000 + default: description: | Provide an alternate site URL for the xloader_submit action. This is useful, for example, when the site is running within a docker network. diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 0b959d9a..25ac3d5a 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -12,7 +12,7 @@ import sys from psycopg2 import errors -from six.moves.urllib.parse import urlsplit, urlparse, urlunparse +from six.moves.urllib.parse import urlsplit import requests from rq import get_current_job from rq.timeouts import JobTimeoutException From b69b9c56229ebb2aa7b9b2c6930d94bcf1f027e5 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 16 Jan 2025 11:18:26 +0100 Subject: [PATCH 23/41] Update jobs.py missed a callback_xloader_hook call, need to update result_url here too --- ckanext/xloader/jobs.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 25ac3d5a..715eb429 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -220,7 +220,11 @@ def direct_load(): set_datastore_active(data, resource, logger) if 'result_url' in input: job_dict['status'] = 'running_but_viewable' - callback_xloader_hook(result_url=input['result_url'], + callback_url = config.get('ckanext.xloader.site_url') or config.get('ckan.site_url') + callback_url = urljoin( + callback_url.rstrip('/'), '/api/3/action/xloader_hook') + result_url = callback_url + callback_xloader_hook(result_url=result_url, api_key=api_key, job_dict=job_dict) logger.info('Data now available to users: %s', resource_ckan_url) From b93ac149f6662386db5045756ca65f931a1a11a7 Mon Sep 17 00:00:00 2001 From: Brett Date: Sun, 19 Jan 2025 15:21:53 +0100 Subject: [PATCH 24/41] Updates to jobs.py and utils.py hopefully getting closer --- ckanext/xloader/jobs.py | 19 ++++++++----------- ckanext/xloader/utils.py | 22 ++++++++++++++++++++++ 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 715eb429..7ddb2913 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -25,7 +25,7 @@ from . import db, loader from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError -from .utils import datastore_resource_exists, set_resource_metadata +from .utils import datastore_resource_exists, set_resource_metadata, get_ckan_url try: from ckan.lib.api_token import get_user_from_token @@ -83,14 +83,13 @@ def xloader_data_into_datastore(input): # stillborn, for when xloader_submit is deciding whether another job would # be a duplicate or not - callback_url = config.get('ckanext.xloader.site_url') or config.get('ckan.site_url') + callback_url = get_ckan_url() callback_url = urljoin( callback_url.rstrip('/'), '/api/3/action/xloader_hook') - result_url = callback_url job_dict = dict(metadata=input['metadata'], status='running') - callback_xloader_hook(result_url=result_url, + callback_xloader_hook(result_url=callback_url, api_key=input['api_key'], job_dict=job_dict) @@ -153,7 +152,7 @@ def xloader_data_into_datastore(input): errored = True finally: # job_dict is defined in xloader_hook's docstring - is_saved_ok = callback_xloader_hook(result_url=result_url, + is_saved_ok = callback_xloader_hook(result_url=callback_url, api_key=input['api_key'], job_dict=job_dict) errored = errored or not is_saved_ok @@ -220,11 +219,10 @@ def direct_load(): set_datastore_active(data, resource, logger) if 'result_url' in input: job_dict['status'] = 'running_but_viewable' - callback_url = config.get('ckanext.xloader.site_url') or config.get('ckan.site_url') + callback_url = get_ckan_url() callback_url = urljoin( callback_url.rstrip('/'), '/api/3/action/xloader_hook') - result_url = callback_url - callback_xloader_hook(result_url=result_url, + callback_xloader_hook(result_url=callback_url, api_key=api_key, job_dict=job_dict) logger.info('Data now available to users: %s', resource_ckan_url) @@ -318,11 +316,10 @@ def _download_resource_data(resource, data, api_key, logger): ) resource_uri = urlunsplit(('', '', url_parts.path, url_parts.query, url_parts.fragment)) - callback_url = config.get('ckanext.xloader.site_url') or config.get('ckan.site_url') - callback_url = urljoin( + callback_url = get_ckan_url() + url = urljoin( callback_url.rstrip('/'), resource_uri) - url = callback_url url_parts = urlsplit(url) # reparse the url after the callback_url is set # fetch the resource data diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index 4c90c84d..b0fb7def 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -175,6 +175,28 @@ def get_xloader_user_apitoken(): return site_user["apikey"] +def get_ckan_url(): + """ Returns the CKAN URL. + + ckan may be behind a proxy, or more likely, within a docker network. + This method returns the URL set in the config file for the CKAN instance. + Containers within the same network ie: XLoader will be able to communicate with CKAN using this URL. + """ + ckan_url = config.get('ckanext.xloader.site_url', None) + if ckan_url: + return ckan_url + + # Fall back to mandatory ckan.site_url + ckan_url = config.get('ckan.site_url') + if not ckan_url: + raise ValueError( + "The ckan.site_url configuration option is required but not set. " + "Please set this value in your CKAN configuration file." + ) + + return ckan_url + + def set_resource_metadata(update_dict): ''' Set appropriate datastore_active flag on CKAN resource. From 944ce7822649ea75e198b5a348893088a1efd4d6 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 28 Jan 2025 13:00:15 +0100 Subject: [PATCH 25/41] Updates to jobs.py, utils.py and tests in test_jobs.py --- ckanext/xloader/jobs.py | 32 +++++--------- ckanext/xloader/tests/test_jobs.py | 60 +++++++++++++++++++++++++ ckanext/xloader/utils.py | 70 ++++++++++++++++++++++-------- 3 files changed, 124 insertions(+), 38 deletions(-) diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 7ddb2913..bde4b5f0 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -25,7 +25,7 @@ from . import db, loader from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError -from .utils import datastore_resource_exists, set_resource_metadata, get_ckan_url +from .utils import datastore_resource_exists, set_resource_metadata, modify_resource_url, modify_ckan_url try: from ckan.lib.api_token import get_user_from_token @@ -83,13 +83,10 @@ def xloader_data_into_datastore(input): # stillborn, for when xloader_submit is deciding whether another job would # be a duplicate or not - callback_url = get_ckan_url() - callback_url = urljoin( - callback_url.rstrip('/'), '/api/3/action/xloader_hook') - job_dict = dict(metadata=input['metadata'], status='running') - callback_xloader_hook(result_url=callback_url, + + callback_xloader_hook(result_url=input['result_url'], api_key=input['api_key'], job_dict=job_dict) @@ -152,7 +149,7 @@ def xloader_data_into_datastore(input): errored = True finally: # job_dict is defined in xloader_hook's docstring - is_saved_ok = callback_xloader_hook(result_url=callback_url, + is_saved_ok = callback_xloader_hook(result_url=input['result_url'], api_key=input['api_key'], job_dict=job_dict) errored = errored or not is_saved_ok @@ -219,10 +216,7 @@ def direct_load(): set_datastore_active(data, resource, logger) if 'result_url' in input: job_dict['status'] = 'running_but_viewable' - callback_url = get_ckan_url() - callback_url = urljoin( - callback_url.rstrip('/'), '/api/3/action/xloader_hook') - callback_xloader_hook(result_url=callback_url, + callback_xloader_hook(result_url=input['result_url'], api_key=api_key, job_dict=job_dict) logger.info('Data now available to users: %s', resource_ckan_url) @@ -306,22 +300,17 @@ def _download_resource_data(resource, data, api_key, logger): data['datastore_contains_all_records_of_source_file'] = False which will be saved to the resource later on. ''' - # check scheme + url = resource.get('url') - url_parts = urlsplit(url) + url = modify_resource_url(url) + # check scheme + url_parts = urlsplit(url) scheme = url_parts.scheme if scheme not in ('http', 'https', 'ftp'): raise JobError( 'Only http, https, and ftp resources may be fetched.' ) - resource_uri = urlunsplit(('', '', url_parts.path, url_parts.query, url_parts.fragment)) - callback_url = get_ckan_url() - url = urljoin( - callback_url.rstrip('/'), resource_uri) - - url_parts = urlsplit(url) # reparse the url after the callback_url is set - # fetch the resource data logger.info('Fetching from: {0}'.format(url)) tmp_file = get_tmp_file(url) @@ -481,7 +470,8 @@ def callback_xloader_hook(result_url, api_key, job_dict): else: header, key = 'Authorization', api_key headers[header] = key - + + result_url = modify_ckan_url(result_url, job_dict['metadata']['ckan_url']) try: result = requests.post( result_url, diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py index 8ad86a7e..671a6837 100644 --- a/ckanext/xloader/tests/test_jobs.py +++ b/ckanext/xloader/tests/test_jobs.py @@ -104,6 +104,66 @@ def test_xloader_data_into_datastore(self, cli, data): resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) assert resource["datastore_contains_all_records_of_source_file"] + def test_download_resource_data_with_ckanext_xloader_site_url(self, cli, data): + # Set the ckanext.xloader.site_url in the config + with mock.patch.dict(toolkit.config, {'ckanext.xloader.site_url': 'http://xloader-site-url'}): + data['metadata']['original_url'] = 'http://xloader-site-url/resource.csv' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout + + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + + def test_download_resource_data_with_ckan_site_url(self, cli, data): + # Set the ckan.site_url in the config + with mock.patch.dict(toolkit.config, {'ckan.site_url': 'http://ckan-site-url'}): + data['metadata']['original_url'] = 'http://ckan-site-url/resource.csv' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout + + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + + def test_download_resource_data_with_different_original_url(self, cli, data): + # Set the ckan.site_url in the config + with mock.patch.dict(toolkit.config, {'ckan.site_url': 'http://ckan-site-url'}): + data['metadata']['original_url'] = 'http://external-site-url/resource.csv' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout + + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + + def test_callback_xloader_hook_with_ckanext_xloader_site_url(self, cli, data): + # Set the ckanext.xloader.site_url in the config + with mock.patch.dict(toolkit.config, {'ckanext.xloader.site_url': 'http://xloader-site-url'}): + data['result_url'] = 'http://xloader-site-url/api/3/action/xloader_hook' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout + + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + + def test_callback_xloader_hook_with_ckan_site_url(self, cli, data): + # Set the ckan.site_url in the config + with mock.patch.dict(toolkit.config, {'ckan.site_url': 'http://ckan-site-url'}): + data['result_url'] = 'http://ckan-site-url/api/3/action/xloader_hook' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout + + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + def test_xloader_ignore_hash(self, cli, data): self.enqueue(jobs.xloader_data_into_datastore, [data]) with mock.patch("ckanext.xloader.jobs.get_response", get_response): diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index b0fb7def..eacd4a1c 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -2,6 +2,7 @@ import json import datetime +import re from six import text_type as str, binary_type @@ -20,6 +21,8 @@ log = getLogger(__name__) +from urllib.parse import urljoin, urlunsplit, urlparse + # resource.formats accepted by ckanext-xloader. Must be lowercase here. DEFAULT_FORMATS = [ "csv", @@ -175,26 +178,59 @@ def get_xloader_user_apitoken(): return site_user["apikey"] -def get_ckan_url(): - """ Returns the CKAN URL. +def modify_ckan_url(result_url: str, ckan_url: str) -> str: + """ Modifies a URL based on CKAN site URL comparison. - ckan may be behind a proxy, or more likely, within a docker network. - This method returns the URL set in the config file for the CKAN instance. - Containers within the same network ie: XLoader will be able to communicate with CKAN using this URL. + This function compares the base URL of a given result URL against a CKAN site URL. + If they differ, the result URL is modified to use the CKAN site URL as its base + while preserving the original path. + + Args: + result_url (str): The original URL to potentially modify + ckan_url (str): The base CKAN site URL to compare against + Returns: + str: The modified URL if base URLs differ, otherwise returns original URL unchanged """ - ckan_url = config.get('ckanext.xloader.site_url', None) - if ckan_url: - return ckan_url - - # Fall back to mandatory ckan.site_url - ckan_url = config.get('ckan.site_url') - if not ckan_url: - raise ValueError( - "The ckan.site_url configuration option is required but not set. " - "Please set this value in your CKAN configuration file." - ) + parsed_url = urlparse(result_url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + path_url = parsed_url.path + + if base_url != ckan_url: + result_url = urljoin(ckan_url, path_url) + + return result_url + + +def modify_resource_url(orig_ckan_url: str) -> str: + """Returns a potentially modified CKAN URL. + + This function takes a CKAN URL and potentially modifies its base URL while preserving the path, + query parameters, and fragments. The modification occurs only if two conditions are met: + 1. The base URL of the input matches the configured CKAN site URL + 2. An xloader_site_url is configured in the settings - return ckan_url + Args: + orig_ckan_url (str): The original CKAN URL to potentially modify + Returns: + str: Either the modified URL with new base URL from xloader_site_url, + or the original URL if conditions aren't met + """ + xloader_site_url = config.get('ckanext.xloader.site_url') + ckan_site_url = config.get('ckan.site_url') + + parsed_url = urlparse(orig_ckan_url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + + # If the base URL matches the CKAN site URL and xloader_site_url is set, modify the URL + if base_url == ckan_site_url and xloader_site_url: + modified_ckan_url = urljoin(xloader_site_url, parsed_url.path) + if parsed_url.query: + modified_ckan_url += f"?{parsed_url.query}" + if parsed_url.fragment: + modified_ckan_url += f"#{parsed_url.fragment}" + return modified_ckan_url + + return orig_ckan_url def set_resource_metadata(update_dict): From c8714ce6d6feff29c2cddb091b1e0353c3e21d63 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 29 Jan 2025 15:48:31 +0100 Subject: [PATCH 26/41] updates to action.py and utils.py --- ckanext/xloader/action.py | 2 +- ckanext/xloader/utils.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ckanext/xloader/action.py b/ckanext/xloader/action.py index 4810f51c..b7be5412 100644 --- a/ckanext/xloader/action.py +++ b/ckanext/xloader/action.py @@ -153,7 +153,7 @@ def xloader_submit(context, data_dict): "metadata": { "ignore_hash": data_dict.get("ignore_hash", False), "ckan_url": config.get("ckanext.xloader.site_url") - or config["ckan.site_url"], + or config["ckan.site_url"], "resource_id": res_id, "set_url_type": data_dict.get("set_url_type", False), "task_created": task["last_updated"], diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index eacd4a1c..4d23ef73 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -193,9 +193,8 @@ def modify_ckan_url(result_url: str, ckan_url: str) -> str: """ parsed_url = urlparse(result_url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - path_url = parsed_url.path - if base_url != ckan_url: + path_url = parsed_url.path result_url = urljoin(ckan_url, path_url) return result_url From f12a988ea181b6250f0e245c249967a08e6b7893 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Wed, 5 Feb 2025 07:22:57 +1000 Subject: [PATCH 27/41] fix: add more unit tests --- ckanext/xloader/jobs.py | 4 +-- ckanext/xloader/tests/test_utils.py | 47 +++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 ckanext/xloader/tests/test_utils.py diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index bde4b5f0..88710b23 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -300,9 +300,7 @@ def _download_resource_data(resource, data, api_key, logger): data['datastore_contains_all_records_of_source_file'] = False which will be saved to the resource later on. ''' - - url = resource.get('url') - url = modify_resource_url(url) + url = modify_resource_url(resource.get('url')) # check scheme url_parts = urlsplit(url) scheme = url_parts.scheme diff --git a/ckanext/xloader/tests/test_utils.py b/ckanext/xloader/tests/test_utils.py new file mode 100644 index 00000000..283c9ca2 --- /dev/null +++ b/ckanext/xloader/tests/test_utils.py @@ -0,0 +1,47 @@ +import pytest +from unittest.mock import patch + +from ckanext.xloader import utils + +@pytest.mark.parametrize("result_url, ckan_url, expected", [ + ("https://example.com/resource/123", "https://ckan.example.org", "https://ckan.example.org/resource/123"), + ("https://ckan.example.org/resource/123", "https://ckan.example.org", "https://ckan.example.org/resource/123"), + ("http://old-ckan.com/resource/456", "http://new-ckan.com", "http://new-ckan.com/resource/456"), + ("https://sub.example.com/path", "https://ckan.example.com", "https://ckan.example.com/path"), + ("ftp://fileserver.com/file", "https://ckan.example.com", "ftp://fileserver.com/file"), + ("https://ckan.example.org/resource/789", "https://xloader.example.org", "https://xloader.example.org/resource/789"), + ("https://ckan.example.org/dataset/data", "https://xloader.example.org", "https://xloader.example.org/dataset/data"), + ("https://ckan.example.org/resource/123?foo=bar", "https://xloader.example.org", "https://xloader.example.org/resource/123?foo=bar"), + ("https://ckan.example.org/dataset/456#section", "https://xloader.example.org", "https://xloader.example.org/dataset/456#section"), + ("https://ckan.example.org/resource/123?param=value&other=123", "https://xloader.example.org", "https://xloader.example.org/resource/123?param=value&other=123"), + ("https://ckan.example.org/resource/partial#fragment", "https://xloader.example.org", "https://xloader.example.org/resource/partial#fragment"), + ("https://ckan.example.org/path/to/data?key=value#section", "https://xloader.example.org", "https://xloader.example.org/path/to/data?key=value#section"), +]) +def test_modify_ckan_url(result_url, ckan_url, expected): + assert utils.modify_ckan_url(result_url, ckan_url) == expected + + +def test_modify_ckan_url_no_change(): + url = "https://ckan.example.com/dataset" + assert utils.modify_ckan_url(url, "https://ckan.example.com") == url + + +@pytest.mark.parametrize("orig_ckan_url, ckan_site_url, xloader_site_url, expected", [ + ("https://ckan.example.org/resource/789", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/resource/789"), + ("https://ckan.example.org/dataset/data", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/dataset/data"), + ("https://ckan.example.org/resource/123?foo=bar", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/resource/123?foo=bar"), + ("https://ckan.example.org/dataset/456#section", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/dataset/456#section"), + ("https://other-site.com/resource/999", "https://ckan.example.org", "https://xloader.example.org", "https://other-site.com/resource/999"), + ("https://ckan.example.org/resource/123?param=value&other=123", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/resource/123?param=value&other=123"), + ("https://ckan.example.org/resource/partial#fragment", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/resource/partial#fragment"), + ("https://ckan.example.org/path/to/data?key=value#section", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/path/to/data?key=value#section"), +]) +def test_modify_resource_url(orig_ckan_url, ckan_site_url, xloader_site_url, expected): + with patch.dict("your_module.toolkit.config", {"ckan.site_url": ckan_site_url, "ckanext.xloader.site_url": xloader_site_url}): + assert utils.modify_resource_url(orig_ckan_url) == expected + + +def test_modify_resource_url_no_xloader_site(): + url = "https://ckan.example.org/dataset" + with patch.dict("your_module.toolkit.config", {"ckan.site_url": "https://ckan.example.org", "ckanext.xloader.site_url": None}): + assert utils.modify_resource_url(url) == url From c040a7812272a944840c49efde3fbca35c87e321 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Mon, 3 Mar 2025 11:03:02 +1000 Subject: [PATCH 28/41] fix: Streamline url replacement logic, introduce regex for ignore base url replacement --- ckanext/xloader/action.py | 23 ++++--- ckanext/xloader/command.py | 13 ++-- ckanext/xloader/config_declaration.yaml | 9 +++ ckanext/xloader/jobs.py | 16 ++--- ckanext/xloader/plugin.py | 6 -- ckanext/xloader/tests/test_jobs.py | 3 +- ckanext/xloader/tests/test_utils.py | 59 ++++++++++------- ckanext/xloader/utils.py | 86 ++++++++++++++----------- 8 files changed, 118 insertions(+), 97 deletions(-) diff --git a/ckanext/xloader/action.py b/ckanext/xloader/action.py index b7be5412..0ce1aaed 100644 --- a/ckanext/xloader/action.py +++ b/ckanext/xloader/action.py @@ -147,18 +147,17 @@ def xloader_submit(context, data_dict): qualified=True ) data = { - "api_key": utils.get_xloader_user_apitoken(), - "job_type": "xloader_to_datastore", - "result_url": callback_url, - "metadata": { - "ignore_hash": data_dict.get("ignore_hash", False), - "ckan_url": config.get("ckanext.xloader.site_url") - or config["ckan.site_url"], - "resource_id": res_id, - "set_url_type": data_dict.get("set_url_type", False), - "task_created": task["last_updated"], - "original_url": resource_dict.get("url"), - }, + 'api_key': utils.get_xloader_user_apitoken(), + 'job_type': 'xloader_to_datastore', + 'result_url': callback_url, + 'metadata': { + 'ignore_hash': data_dict.get('ignore_hash', False), + 'ckan_url': config['ckan.site_url'], + 'resource_id': res_id, + 'set_url_type': data_dict.get('set_url_type', False), + 'task_created': task['last_updated'], + 'original_url': resource_dict.get('url'), + } } if custom_queue != rq_jobs.DEFAULT_QUEUE_NAME: # Don't automatically retry if it's a custom run diff --git a/ckanext/xloader/command.py b/ckanext/xloader/command.py index 91a39acf..9937c910 100644 --- a/ckanext/xloader/command.py +++ b/ckanext/xloader/command.py @@ -5,7 +5,7 @@ import ckan.plugins.toolkit as tk from ckanext.xloader.jobs import xloader_data_into_datastore_ -from ckanext.xloader.utils import XLoaderFormats +from ckanext.xloader.utils import XLoaderFormats, get_xloader_user_apitoken class XloaderCmd: @@ -114,11 +114,12 @@ def _submit_resource(self, resource, user, indent=0, sync=False, queue=None): 'ignore_hash': True, } if sync: - data_dict["ckan_url"] = tk.config.get( - "ckanext.xloader.site_url" - ) or tk.config.get("ckan.site_url") - input_dict = {"metadata": data_dict, "api_key": "TODO"} - logger = logging.getLogger("ckanext.xloader.cli") + data_dict['ckan_url'] = tk.config.get('ckan.site_url') + input_dict = { + 'metadata': data_dict, + 'api_key': get_xloader_user_apitoken() + } + logger = logging.getLogger('ckanext.xloader.cli') xloader_data_into_datastore_(input_dict, None, logger) else: if queue: diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index c304c949..a03a5dfb 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -8,8 +8,17 @@ groups: description: | Provide an alternate site URL for the xloader_submit action. This is useful, for example, when the site is running within a docker network. + Note: This setting will not alter path. i.e ckan.root_path validators: configured_default("ckan.site_url",None) required: false + - key: ckanext.xloader.site_url_ignore_path_regex + example: "(/PathToS3HostOriginIWantToGoDirectTo|/anotherPath)" + default: + description: | + Provide the ability to ignore paths which can't be mapped to alternative site URL for resource access. + This is useful, for example, when the site is running within a docker network and the cdn front door has + Blob storage mapped to another path on the same domain. + required: false - key: ckanext.xloader.jobs_db.uri default: sqlite:////tmp/xloader_jobs.db description: | diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 88710b23..762b926b 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -18,14 +18,12 @@ from rq.timeouts import JobTimeoutException import sqlalchemy as sa -from urllib.parse import urljoin, urlunsplit - from ckan import model from ckan.plugins.toolkit import get_action, asbool, enqueue_job, ObjectNotFound, config, h from . import db, loader from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError -from .utils import datastore_resource_exists, set_resource_metadata, modify_resource_url, modify_ckan_url +from .utils import datastore_resource_exists, set_resource_metadata, modify_input_url try: from ckan.lib.api_token import get_user_from_token @@ -82,10 +80,8 @@ def xloader_data_into_datastore(input): # First flag that this task is running, to indicate the job is not # stillborn, for when xloader_submit is deciding whether another job would # be a duplicate or not - job_dict = dict(metadata=input['metadata'], status='running') - callback_xloader_hook(result_url=input['result_url'], api_key=input['api_key'], job_dict=job_dict) @@ -300,9 +296,10 @@ def _download_resource_data(resource, data, api_key, logger): data['datastore_contains_all_records_of_source_file'] = False which will be saved to the resource later on. ''' - url = modify_resource_url(resource.get('url')) + # update base url (for possible local loopback) + url = modify_input_url(resource.get('url')) # check scheme - url_parts = urlsplit(url) + url_parts = urlsplit(url) scheme = url_parts.scheme if scheme not in ('http', 'https', 'ftp'): raise JobError( @@ -468,11 +465,10 @@ def callback_xloader_hook(result_url, api_key, job_dict): else: header, key = 'Authorization', api_key headers[header] = key - - result_url = modify_ckan_url(result_url, job_dict['metadata']['ckan_url']) + try: result = requests.post( - result_url, + modify_input_url(result_url), # modify with local config data=json.dumps(job_dict, cls=DatetimeJsonEncoder), verify=SSL_VERIFY, headers=headers) diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index 9d5ca9a0..9b89ae9e 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -76,12 +76,6 @@ def configure(self, config_): else: self.ignore_hash = False - site_url_configs = ("ckan.site_url", "ckanext.xloader.site_url") - if not any(site_url_configs): - raise Exception( - f"One of config options {site_url_configs} must be set to use ckanext-xloader." - ) - # IPipeValidation def receive_validation_report(self, validation_report): diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py index 671a6837..1d92f842 100644 --- a/ckanext/xloader/tests/test_jobs.py +++ b/ckanext/xloader/tests/test_jobs.py @@ -76,8 +76,7 @@ def data(create_with_upload, apikey): "result_url": callback_url, "metadata": { "ignore_hash": True, - "ckan_url": toolkit.config.get("ckanext.xloader.site_url") - or toolkit.config.get("ckan.site_url"), + "ckan_url": toolkit.config.get("ckanext.xloader.site_url", toolkit.config.get("ckan.site_url")), "resource_id": resource["id"], "set_url_type": False, "task_created": datetime.utcnow().isoformat(), diff --git a/ckanext/xloader/tests/test_utils.py b/ckanext/xloader/tests/test_utils.py index 283c9ca2..384b3e17 100644 --- a/ckanext/xloader/tests/test_utils.py +++ b/ckanext/xloader/tests/test_utils.py @@ -1,14 +1,17 @@ import pytest from unittest.mock import patch - +from ckan.plugins import toolkit from ckanext.xloader import utils + @pytest.mark.parametrize("result_url, ckan_url, expected", [ ("https://example.com/resource/123", "https://ckan.example.org", "https://ckan.example.org/resource/123"), + ("https://example.com/resource/123", "http://127.0.0.1:3001", "http://127.0.0.1:3001/resource/123"), + ("https://example.com/resource/123", "http://127.0.0.1:3001/pathnotadded", "http://127.0.0.1:3001/resource/123"), ("https://ckan.example.org/resource/123", "https://ckan.example.org", "https://ckan.example.org/resource/123"), ("http://old-ckan.com/resource/456", "http://new-ckan.com", "http://new-ckan.com/resource/456"), ("https://sub.example.com/path", "https://ckan.example.com", "https://ckan.example.com/path"), - ("ftp://fileserver.com/file", "https://ckan.example.com", "ftp://fileserver.com/file"), + ("ftp://fileserver.com/file", "https://ckan.example.com", "ftp://fileserver.com/file"), ##should never happen ("https://ckan.example.org/resource/789", "https://xloader.example.org", "https://xloader.example.org/resource/789"), ("https://ckan.example.org/dataset/data", "https://xloader.example.org", "https://xloader.example.org/dataset/data"), ("https://ckan.example.org/resource/123?foo=bar", "https://xloader.example.org", "https://xloader.example.org/resource/123?foo=bar"), @@ -17,31 +20,43 @@ ("https://ckan.example.org/resource/partial#fragment", "https://xloader.example.org", "https://xloader.example.org/resource/partial#fragment"), ("https://ckan.example.org/path/to/data?key=value#section", "https://xloader.example.org", "https://xloader.example.org/path/to/data?key=value#section"), ]) -def test_modify_ckan_url(result_url, ckan_url, expected): - assert utils.modify_ckan_url(result_url, ckan_url) == expected +def test_private_modify_url(result_url, ckan_url, expected): + assert utils._modify_url(result_url, ckan_url) == expected def test_modify_ckan_url_no_change(): url = "https://ckan.example.com/dataset" - assert utils.modify_ckan_url(url, "https://ckan.example.com") == url - - -@pytest.mark.parametrize("orig_ckan_url, ckan_site_url, xloader_site_url, expected", [ - ("https://ckan.example.org/resource/789", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/resource/789"), - ("https://ckan.example.org/dataset/data", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/dataset/data"), - ("https://ckan.example.org/resource/123?foo=bar", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/resource/123?foo=bar"), - ("https://ckan.example.org/dataset/456#section", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/dataset/456#section"), - ("https://other-site.com/resource/999", "https://ckan.example.org", "https://xloader.example.org", "https://other-site.com/resource/999"), - ("https://ckan.example.org/resource/123?param=value&other=123", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/resource/123?param=value&other=123"), - ("https://ckan.example.org/resource/partial#fragment", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/resource/partial#fragment"), - ("https://ckan.example.org/path/to/data?key=value#section", "https://ckan.example.org", "https://xloader.example.org", "https://xloader.example.org/path/to/data?key=value#section"), + assert utils._modify_url(url, "https://ckan.example.com") == url + + +@pytest.mark.parametrize("input_url, ckan_site_url, xloader_site_url, is_altered, expected", [ + ("https://ckan.example.org/resource/789", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/resource/789"), + ("https://ckan.example.org/resource/789", "https://ckan.example.org", "http://127.0.0.1:3012", True, "http://127.0.0.1:3012/resource/789"), + ("https://ckan.example.org/dataset/data", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/dataset/data"), + ("https://ckan.example.org/resource/123?foo=bar", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/resource/123?foo=bar"), + ("https://ckan.example.org/dataset/456#section", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/dataset/456#section"), + ("https://other-site.com/resource/999", "https://ckan.example.org", "https://xloader.example.org", False, ""), + ("https://ckan.example.org/resource/123?param=value&other=123", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/resource/123?param=value&other=123"), + ("https://ckan.example.org/resource/partial#fragment", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/resource/partial#fragment"), + ("https://ckan.example.org/path/to/data?key=value#section", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/path/to/data?key=value#section"), + ("https://ckan.example.org/path/to/data?key=value#section", "https://ckan.example.org", "http://localhost:3000", True, "http://localhost:3000/path/to/data?key=value#section"), + ("https://ckan.example.org/blackListedPathToS3HostOrigin?key=value#section", "https://ckan.example.org", "https://xloader.example.org", False, ""), + ("ftp://ckan.example.org/dataset/456#section", "https://ckan.example.org", "https://xloader.example.org", False, ""), ]) -def test_modify_resource_url(orig_ckan_url, ckan_site_url, xloader_site_url, expected): - with patch.dict("your_module.toolkit.config", {"ckan.site_url": ckan_site_url, "ckanext.xloader.site_url": xloader_site_url}): - assert utils.modify_resource_url(orig_ckan_url) == expected +def test_modify_input_url(input_url, ckan_site_url, xloader_site_url, is_altered, expected): + with patch.dict(toolkit.config, + {"ckan.site_url": ckan_site_url, + "ckanext.xloader.site_url": xloader_site_url, + "ckanext.xloader.site_url_ignore_path_regex": "(/blackListedPathToS3HostOrigin|/anotherpath)"}): + response = utils.modify_input_url(input_url) + if is_altered: + assert response == expected + else: + assert response == input_url + -def test_modify_resource_url_no_xloader_site(): +def test_modify_input_url_no_xloader_site(): url = "https://ckan.example.org/dataset" - with patch.dict("your_module.toolkit.config", {"ckan.site_url": "https://ckan.example.org", "ckanext.xloader.site_url": None}): - assert utils.modify_resource_url(url) == url + with patch.dict(toolkit.config, {"ckan.site_url": "https://ckan.example.org", "ckanext.xloader.site_url": None}): + assert utils.modify_input_url(url) == url diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index 4d23ef73..fad9d80c 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -21,7 +21,7 @@ log = getLogger(__name__) -from urllib.parse import urljoin, urlunsplit, urlparse +from urllib.parse import urlunparse, urlparse # resource.formats accepted by ckanext-xloader. Must be lowercase here. DEFAULT_FORMATS = [ @@ -178,58 +178,66 @@ def get_xloader_user_apitoken(): return site_user["apikey"] -def modify_ckan_url(result_url: str, ckan_url: str) -> str: - """ Modifies a URL based on CKAN site URL comparison. +def _modify_url(input_url: str, base_url: str) -> str: + """ Modifies the input URL with base_url provided. - This function compares the base URL of a given result URL against a CKAN site URL. - If they differ, the result URL is modified to use the CKAN site URL as its base - while preserving the original path. - Args: - result_url (str): The original URL to potentially modify - ckan_url (str): The base CKAN site URL to compare against + input_url (str): The original URL to potentially modify + base_url (str): The base URL to compare/replace against Returns: - str: The modified URL if base URLs differ, otherwise returns original URL unchanged + str: The modified URL with replaced scheme and netloc """ - parsed_url = urlparse(result_url) - base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - if base_url != ckan_url: - path_url = parsed_url.path - result_url = urljoin(ckan_url, path_url) - - return result_url - - -def modify_resource_url(orig_ckan_url: str) -> str: + parsed_input_url = urlparse(input_url) + parsed_base_url = urlparse(base_url) + # Do not modify non-HTTP(S) URLs (e.g., ftp://) + if parsed_input_url.scheme not in ("http", "https"): + return input_url + # replace scheme: "http/https" and netloc:"//:@:/" + new_url = urlunparse( + (parsed_base_url.scheme, + parsed_base_url.netloc, + parsed_input_url.path, + parsed_input_url.params, + parsed_input_url.query, + parsed_input_url.fragment)) + return new_url + + +def modify_input_url(input_url: str) -> str: """Returns a potentially modified CKAN URL. - This function takes a CKAN URL and potentially modifies its base URL while preserving the path, - query parameters, and fragments. The modification occurs only if two conditions are met: - 1. The base URL of the input matches the configured CKAN site URL - 2. An xloader_site_url is configured in the settings - + This function takes a possible CKAN URL and potentially modifies its base URL while preserving the path, + query parameters, and fragments. The modification occurs only if three conditions are met: + 1. The base URL of the input matches the configured CKAN site URL (ckan.site_url). + 2. A `ckanext.xloader.site_url` is configured in the settings. + 3. A `ckanext.xloader.site_url_ignore_path_regex` if configured in the settings and does not match. + Args: - orig_ckan_url (str): The original CKAN URL to potentially modify + input_url (str): The original CKAN URL to potentially modify Returns: - str: Either the modified URL with new base URL from xloader_site_url, + str: Either the modified URL with new base URL from xloader_site_url, or the original URL if conditions aren't met """ + xloader_site_url = config.get('ckanext.xloader.site_url') - ckan_site_url = config.get('ckan.site_url') + if not xloader_site_url: + return input_url + + parsed_input_url = urlparse(input_url) + input_base_url = f"{parsed_input_url.scheme}://{parsed_input_url.netloc}" + parsed_ckan_site_url = urlparse(config.get('ckan.site_url')) + ckan_base_url = f"{parsed_ckan_site_url.scheme}://{parsed_ckan_site_url.netloc}" - parsed_url = urlparse(orig_ckan_url) - base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + xloader_ignore_regex = config.get('ckanext.xloader.site_url_ignore_path_regex') - # If the base URL matches the CKAN site URL and xloader_site_url is set, modify the URL - if base_url == ckan_site_url and xloader_site_url: - modified_ckan_url = urljoin(xloader_site_url, parsed_url.path) - if parsed_url.query: - modified_ckan_url += f"?{parsed_url.query}" - if parsed_url.fragment: - modified_ckan_url += f"#{parsed_url.fragment}" - return modified_ckan_url + #Don't alter non-matching base url's. + if input_base_url != ckan_base_url: + return input_url + #And not any urls on the ignore regex + elif xloader_ignore_regex and re.search(xloader_ignore_regex, input_url): + return input_url - return orig_ckan_url + return _modify_url(input_url, xloader_site_url) def set_resource_metadata(update_dict): From 993cd006ba3641d6a09c4bc51d020e0e6dfee171 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Mon, 3 Mar 2025 11:26:06 +1000 Subject: [PATCH 29/41] fix: revert to base wiring, as the url change occurs on runtime system which picks up job (which could have different ckan.ini config) --- ckanext/xloader/tests/test_jobs.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py index 1d92f842..33386fbc 100644 --- a/ckanext/xloader/tests/test_jobs.py +++ b/ckanext/xloader/tests/test_jobs.py @@ -71,17 +71,17 @@ def data(create_with_upload, apikey): "api.action", ver=3, logic_function="xloader_hook", qualified=True ) return { - "api_key": apikey, - "job_type": "xloader_to_datastore", - "result_url": callback_url, - "metadata": { - "ignore_hash": True, - "ckan_url": toolkit.config.get("ckanext.xloader.site_url", toolkit.config.get("ckan.site_url")), - "resource_id": resource["id"], - "set_url_type": False, - "task_created": datetime.utcnow().isoformat(), - "original_url": resource["url"], - }, + 'api_key': apikey, + 'job_type': 'xloader_to_datastore', + 'result_url': callback_url, + 'metadata': { + 'ignore_hash': True, + 'ckan_url': toolkit.config.get('ckan.site_url'), + 'resource_id': resource["id"], + 'set_url_type': False, + 'task_created': datetime.utcnow().isoformat(), + 'original_url': resource["url"], + } } From cbfc55ad43409ab51626fbc4e0058da99a979bcb Mon Sep 17 00:00:00 2001 From: William Dutton Date: Mon, 3 Mar 2025 11:49:26 +1000 Subject: [PATCH 30/41] fix: remove validator on xloader.site_url as its not required --- ckanext/xloader/config_declaration.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index a03a5dfb..6fab25ff 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -9,7 +9,6 @@ groups: Provide an alternate site URL for the xloader_submit action. This is useful, for example, when the site is running within a docker network. Note: This setting will not alter path. i.e ckan.root_path - validators: configured_default("ckan.site_url",None) required: false - key: ckanext.xloader.site_url_ignore_path_regex example: "(/PathToS3HostOriginIWantToGoDirectTo|/anotherPath)" From d42bf9948565fdac1a7321946719a7ce5e9e3eda Mon Sep 17 00:00:00 2001 From: William Dutton Date: Mon, 3 Mar 2025 11:57:43 +1000 Subject: [PATCH 31/41] chore: improve test coverage of "" and None inputs --- ckanext/xloader/tests/test_utils.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ckanext/xloader/tests/test_utils.py b/ckanext/xloader/tests/test_utils.py index 384b3e17..52886851 100644 --- a/ckanext/xloader/tests/test_utils.py +++ b/ckanext/xloader/tests/test_utils.py @@ -3,6 +3,10 @@ from ckan.plugins import toolkit from ckanext.xloader import utils +def test_private_modify_url_no_change(): + url = "https://ckan.example.com/dataset" + assert utils._modify_url(url, "https://ckan.example.com") == url + @pytest.mark.parametrize("result_url, ckan_url, expected", [ ("https://example.com/resource/123", "https://ckan.example.org", "https://ckan.example.org/resource/123"), @@ -19,16 +23,15 @@ ("https://ckan.example.org/resource/123?param=value&other=123", "https://xloader.example.org", "https://xloader.example.org/resource/123?param=value&other=123"), ("https://ckan.example.org/resource/partial#fragment", "https://xloader.example.org", "https://xloader.example.org/resource/partial#fragment"), ("https://ckan.example.org/path/to/data?key=value#section", "https://xloader.example.org", "https://xloader.example.org/path/to/data?key=value#section"), + ("", "", ""), + ("", "http://127.0.0.1:5000", ""), + (None, None, None), + (None, "http://127.0.0.1:5000", None), ]) def test_private_modify_url(result_url, ckan_url, expected): assert utils._modify_url(result_url, ckan_url) == expected -def test_modify_ckan_url_no_change(): - url = "https://ckan.example.com/dataset" - assert utils._modify_url(url, "https://ckan.example.com") == url - - @pytest.mark.parametrize("input_url, ckan_site_url, xloader_site_url, is_altered, expected", [ ("https://ckan.example.org/resource/789", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/resource/789"), ("https://ckan.example.org/resource/789", "https://ckan.example.org", "http://127.0.0.1:3012", True, "http://127.0.0.1:3012/resource/789"), @@ -42,6 +45,11 @@ def test_modify_ckan_url_no_change(): ("https://ckan.example.org/path/to/data?key=value#section", "https://ckan.example.org", "http://localhost:3000", True, "http://localhost:3000/path/to/data?key=value#section"), ("https://ckan.example.org/blackListedPathToS3HostOrigin?key=value#section", "https://ckan.example.org", "https://xloader.example.org", False, ""), ("ftp://ckan.example.org/dataset/456#section", "https://ckan.example.org", "https://xloader.example.org", False, ""), + ("https://ckan.example.org/dataset/456#section", "https://ckan.example.org", "", False, ""), + ("", "http://127.0.0.1:5000", None, False, ""), + ("", "http://127.0.0.1:5000", "", False, ""), + (None, "http://127.0.0.1:5000", None, False, ""), + (None, "http://127.0.0.1:5000", "", False, ""), ]) def test_modify_input_url(input_url, ckan_site_url, xloader_site_url, is_altered, expected): with patch.dict(toolkit.config, From ad5ea42a231923b50bd1b4f9a0cb34af23fccc37 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Mon, 3 Mar 2025 13:27:54 +1000 Subject: [PATCH 32/41] chore: cleanup 2.9 legacy items - provide better error messaging on incorrect api_key setup - user creation for testing to mimic real life 2.10+ environment chore: use cleaner pytest for setting variables --- .github/workflows/publish.yml | 76 ++------------------ .github/workflows/test.yml | 19 +++-- README.md | 4 +- ckanext/xloader/action.py | 3 +- ckanext/xloader/config_declaration.yaml | 6 +- ckanext/xloader/jobs.py | 20 ++---- ckanext/xloader/plugin.py | 8 +-- ckanext/xloader/schema.py | 6 +- ckanext/xloader/tests/test_action.py | 23 ++++-- ckanext/xloader/tests/test_jobs.py | 95 +++++++++++-------------- ckanext/xloader/utils.py | 7 +- 11 files changed, 91 insertions(+), 176 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index be0f5a20..75966354 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -34,6 +34,7 @@ jobs: validateVersion: runs-on: ubuntu-latest + if: github.repository == 'ckan/ckanext-xloader' steps: - uses: actions/checkout@v4 @@ -52,79 +53,10 @@ jobs: exit 1 fi - lint: - needs: validateVersion - if: github.repository == 'ckan/ckanext-xloader' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install requirements - run: pip install flake8 pycodestyle - - - name: Check syntax - run: flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics --extend-exclude ckan - test: - needs: lint - strategy: - matrix: - include: #ckan-image see https://github.com/ckan/ckan-docker-base, ckan-version controls other image tags - - ckan-version: "2.11" - ckan-image: "2.11-py3.10" - - ckan-version: "2.10" - ckan-image: "2.10-py3.10" - #- ckan-version: "master" Publish does not care about master - # ckan-image: "master" - fail-fast: false - - name: CKAN ${{ matrix.ckan-version }} - runs-on: ubuntu-latest - container: - image: ckan/ckan-dev:${{ matrix.ckan-image }} - options: --user root - services: - solr: - image: ckan/ckan-solr:${{ matrix.ckan-version }}-solr9 - postgres: - image: ckan/ckan-postgres-dev:${{ matrix.ckan-version }} - env: - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - POSTGRES_DB: postgres - ports: - - 5432:5432 - options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 - redis: - image: redis:3 - env: - CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test - CKAN_DATASTORE_WRITE_URL: postgresql://datastore_write:pass@postgres/datastore_test - CKAN_DATASTORE_READ_URL: postgresql://datastore_read:pass@postgres/datastore_test - CKAN_SOLR_URL: http://solr:8983/solr/ckan - CKAN_REDIS_URL: redis://redis:6379/1 - - steps: - - uses: actions/checkout@v4 - - if: ${{ matrix.ckan-version == 2.9 }} - run: pip install "setuptools>=44.1.0,<71" - - name: Install requirements - run: | - pip install -r requirements.txt - pip install -r dev-requirements.txt - pip install -e . - pip install -U requests[security] - # Replace default path to CKAN core config file with the one on the container - sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini - - name: Setup extension (CKAN >= 2.9) - run: | - ckan -c test.ini db init - - name: Run tests - run: pytest --ckan-ini=test.ini --cov=ckanext.xloader --disable-warnings ckanext/xloader/tests + needs: validateVersion + name: Test + uses: ./.github/workflows/test.yml # Call the reusable workflow publishSkipped: if: github.repository != 'ckan/ckanext-xloader' diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 487e5838..3fe3b742 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -35,7 +35,7 @@ jobs: experimental: true # master is unstable, good to know if we are compatible or not fail-fast: false - name: CKAN ${{ matrix.ckan-version }} + name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} CKAN ${{ matrix.ckan-version }} runs-on: ubuntu-latest container: image: ckan/ckan-dev:${{ matrix.ckan-image }} @@ -65,12 +65,7 @@ jobs: - uses: actions/checkout@v4 continue-on-error: ${{ matrix.experimental }} - - name: Pin setuptools for ckan 2.9 only - if: ${{ matrix.ckan-version == 2.9 }} - run: pip install "setuptools>=44.1.0,<71" - continue-on-error: ${{ matrix.experimental }} - - - name: Install requirements + - name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} Install requirements continue-on-error: ${{ matrix.experimental }} run: | pip install -r requirements.txt @@ -80,16 +75,20 @@ jobs: # Replace default path to CKAN core config file with the one on the container sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini - - name: Setup extension + - name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} Setup extension continue-on-error: ${{ matrix.experimental }} run: | ckan -c test.ini db init + ckan -c test.ini user add ckan_admin email=ckan_admin@localhost password="AbCdEf12345!@#%" + ckan -c test.ini sysadmin add ckan_admin + ckan config-tool test.ini "ckanext.xloader.api_token=$(ckan -c test.ini user token add ckan_admin xloader | tail -n 1 | tr -d '\t')" + ckan -c test.ini user list - - name: Run tests + - name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} Run tests continue-on-error: ${{ matrix.experimental }} run: pytest --ckan-ini=test.ini --cov=ckanext.xloader --disable-warnings ckanext/xloader/tests --junit-xml=/tmp/artifacts/junit/results.xml - - name: Test Summary + - name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} Test Summary uses: test-summary/action@v2 continue-on-error: ${{ matrix.experimental }} with: diff --git a/README.md b/README.md index c9b36383..ccd74116 100644 --- a/README.md +++ b/README.md @@ -151,6 +151,7 @@ To install XLoader: execute jobs against the server: ckanext.xloader.api_token = + ckan config-tool test.ini "ckanext.xloader.api_token=$(ckan -c test.ini user token add ckan_admin xloader | tail -n 1 | tr -d '\t')" 6. If it is a production server, you'll want to store jobs info in a more robust database than the default sqlite file. It can happily @@ -220,8 +221,7 @@ ckanext.xloader.api_token = eyJ0eXAiOiJKV1QiLCJh.eyJqdGkiOiJ0M2VNUFlQWFg0VU.8QgV Default value: none -Uses a specific API token for the xloader_submit action instead of the -apikey of the site_user. It's mandatory starting from CKAN 2.10. You can get one +It's mandatory starting from CKAN 2.10. You can get one running the command `ckan user token add {USER_NAME} xloader -q` diff --git a/ckanext/xloader/action.py b/ckanext/xloader/action.py index 0ce1aaed..108d7cc9 100644 --- a/ckanext/xloader/action.py +++ b/ckanext/xloader/action.py @@ -47,6 +47,7 @@ def xloader_submit(context, data_dict): :rtype: bool ''' p.toolkit.check_access('xloader_submit', context, data_dict) + api_key = utils.get_xloader_user_apitoken() custom_queue = data_dict.pop('queue', rq_jobs.DEFAULT_QUEUE_NAME) schema = context.get('schema', ckanext.xloader.schema.xloader_submit_schema()) data_dict, errors = _validate(data_dict, schema, context) @@ -147,7 +148,7 @@ def xloader_submit(context, data_dict): qualified=True ) data = { - 'api_key': utils.get_xloader_user_apitoken(), + 'api_key': api_key, 'job_type': 'xloader_to_datastore', 'result_url': callback_url, 'metadata': { diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index 6fab25ff..3327c84b 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -30,9 +30,9 @@ groups: example: eyJ0eXAiOiJKV1QiLCJh.eyJqdGkiOiJ0M2VNUFlQWFg0VU.8QgV8em4RA description: | Uses a specific API token for the xloader_submit action instead of the - apikey of the site_user. Will be mandatory when dropping support for - CKAN 2.9. - required: false + apikey of the site_user. + default: 'NOT_SET' + required: true - key: ckanext.xloader.formats example: csv application/csv xls application/vnd.ms-excel description: | diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 762b926b..7eb6c477 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -25,10 +25,8 @@ from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError from .utils import datastore_resource_exists, set_resource_metadata, modify_input_url -try: - from ckan.lib.api_token import get_user_from_token -except ImportError: - get_user_from_token = None + +from ckan.lib.api_token import get_user_from_token log = logging.getLogger(__name__) @@ -511,19 +509,9 @@ def update_resource(resource, patch_only=False): def _get_user_from_key(api_key_or_token): """ Gets the user using the API Token or API Key. - - This method provides backwards compatibility for CKAN 2.9 that - supported both methods and previous CKAN versions supporting - only API Keys. """ - user = None - if get_user_from_token: - user = get_user_from_token(api_key_or_token) - if not user: - user = model.Session.query(model.User).filter_by( - apikey=api_key_or_token - ).first() - return user + return get_user_from_token(api_key_or_token) + def get_resource_and_dataset(resource_id, api_key): diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index 9b89ae9e..20fa2651 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -17,13 +17,7 @@ except ImportError: HAS_IPIPE_VALIDATION = False -try: - config_declarations = toolkit.blanket.config_declarations -except AttributeError: - # CKAN 2.9 does not have config_declarations. - # Remove when dropping support. - def config_declarations(cls): - return cls +config_declarations = toolkit.blanket.config_declarations if toolkit.check_ckan_version(min_version='2.11'): from ckanext.datastore.interfaces import IDataDictionaryForm diff --git a/ckanext/xloader/schema.py b/ckanext/xloader/schema.py index 47ae65a5..dcedfd5f 100644 --- a/ckanext/xloader/schema.py +++ b/ckanext/xloader/schema.py @@ -17,11 +17,7 @@ int_validator = get_validator('int_validator') OneOf = get_validator('OneOf') ignore_not_sysadmin = get_validator('ignore_not_sysadmin') - -if p.toolkit.check_ckan_version('2.9'): - unicode_safe = get_validator('unicode_safe') -else: - unicode_safe = str +unicode_safe = get_validator('unicode_safe') def xloader_submit_schema(): diff --git a/ckanext/xloader/tests/test_action.py b/ckanext/xloader/tests/test_action.py index 8b0e2729..4497ce98 100644 --- a/ckanext/xloader/tests/test_action.py +++ b/ckanext/xloader/tests/test_action.py @@ -1,4 +1,5 @@ import pytest +from ckan.plugins import toolkit try: from unittest import mock except ImportError: @@ -117,10 +118,24 @@ def test_status(self): assert status["status"] == "pending" - def test_xloader_user_api_token_defaults_to_site_user_apikey(self): - api_token = get_xloader_user_apitoken() - site_user = helpers.call_action("get_site_user") - assert api_token == site_user["apikey"] + + def test_xloader_user_api_token_from_config(self): + sysadmin = factories.SysadminWithToken() + apikey = sysadmin["token"] + with mock.patch.dict(toolkit.config, {'ckanext.xloader.api_token': apikey}): + api_token = get_xloader_user_apitoken() + assert api_token == apikey + + @pytest.mark.ckan_config("ckanext.xloader.api_token", "NOT_SET") + def test_xloader_user_api_token_from_config_should_throw_exceptio_when_not_set(self): + + hasNotThrownException = True + try: + get_xloader_user_apitoken() + except Exception: + hasNotThrownException = False + + assert not hasNotThrownException @pytest.mark.ckan_config("ckanext.xloader.api_token", "random-api-token") def test_xloader_user_api_token(self): diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py index 33386fbc..afdfa7c7 100644 --- a/ckanext/xloader/tests/test_jobs.py +++ b/ckanext/xloader/tests/test_jobs.py @@ -48,13 +48,7 @@ def _get_temp_files(dir='/tmp'): @pytest.fixture def apikey(): - if toolkit.check_ckan_version(min_version="2.10"): - sysadmin = factories.SysadminWithToken() - else: - # To provide support with CKAN 2.9 - sysadmin = factories.Sysadmin() - sysadmin["token"] = get_xloader_user_apitoken() - + sysadmin = factories.SysadminWithToken() return sysadmin["token"] @@ -103,65 +97,62 @@ def test_xloader_data_into_datastore(self, cli, data): resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) assert resource["datastore_contains_all_records_of_source_file"] + # Set the ckanext.xloader.site_url in the config + @pytest.mark.ckan_config("ckanext.xloader.site_url", 'http://xloader-site-url') def test_download_resource_data_with_ckanext_xloader_site_url(self, cli, data): - # Set the ckanext.xloader.site_url in the config - with mock.patch.dict(toolkit.config, {'ckanext.xloader.site_url': 'http://xloader-site-url'}): - data['metadata']['original_url'] = 'http://xloader-site-url/resource.csv' - self.enqueue(jobs.xloader_data_into_datastore, [data]) - with mock.patch("ckanext.xloader.jobs.get_response", get_response): - stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output - assert "Express Load completed" in stdout - resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) - assert resource["datastore_contains_all_records_of_source_file"] + data['metadata']['original_url'] = 'http://xloader-site-url/resource.csv' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout + + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + @pytest.mark.ckan_config("ckanext.site_url", 'http://ckan-site-url') def test_download_resource_data_with_ckan_site_url(self, cli, data): - # Set the ckan.site_url in the config - with mock.patch.dict(toolkit.config, {'ckan.site_url': 'http://ckan-site-url'}): - data['metadata']['original_url'] = 'http://ckan-site-url/resource.csv' - self.enqueue(jobs.xloader_data_into_datastore, [data]) - with mock.patch("ckanext.xloader.jobs.get_response", get_response): - stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output - assert "Express Load completed" in stdout + data['metadata']['original_url'] = 'http://ckan-site-url/resource.csv' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout - resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) - assert resource["datastore_contains_all_records_of_source_file"] + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + @pytest.mark.ckan_config("ckanext.site_url", 'http://ckan-site-url') def test_download_resource_data_with_different_original_url(self, cli, data): - # Set the ckan.site_url in the config - with mock.patch.dict(toolkit.config, {'ckan.site_url': 'http://ckan-site-url'}): - data['metadata']['original_url'] = 'http://external-site-url/resource.csv' - self.enqueue(jobs.xloader_data_into_datastore, [data]) - with mock.patch("ckanext.xloader.jobs.get_response", get_response): - stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output - assert "Express Load completed" in stdout + data['metadata']['original_url'] = 'http://external-site-url/resource.csv' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout - resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) - assert resource["datastore_contains_all_records_of_source_file"] + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + @pytest.mark.ckan_config("ckanext.xloader.site_url", 'http://xloader-site-url') def test_callback_xloader_hook_with_ckanext_xloader_site_url(self, cli, data): - # Set the ckanext.xloader.site_url in the config - with mock.patch.dict(toolkit.config, {'ckanext.xloader.site_url': 'http://xloader-site-url'}): - data['result_url'] = 'http://xloader-site-url/api/3/action/xloader_hook' - self.enqueue(jobs.xloader_data_into_datastore, [data]) - with mock.patch("ckanext.xloader.jobs.get_response", get_response): - stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output - assert "Express Load completed" in stdout + data['result_url'] = 'http://xloader-site-url/api/3/action/xloader_hook' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout - resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) - assert resource["datastore_contains_all_records_of_source_file"] + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] + @pytest.mark.ckan_config("ckanext.site_url", 'http://ckan-site-url') def test_callback_xloader_hook_with_ckan_site_url(self, cli, data): - # Set the ckan.site_url in the config - with mock.patch.dict(toolkit.config, {'ckan.site_url': 'http://ckan-site-url'}): - data['result_url'] = 'http://ckan-site-url/api/3/action/xloader_hook' - self.enqueue(jobs.xloader_data_into_datastore, [data]) - with mock.patch("ckanext.xloader.jobs.get_response", get_response): - stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output - assert "Express Load completed" in stdout + data['result_url'] = 'http://ckan-site-url/api/3/action/xloader_hook' + self.enqueue(jobs.xloader_data_into_datastore, [data]) + with mock.patch("ckanext.xloader.jobs.get_response", get_response): + stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output + assert "Express Load completed" in stdout - resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) - assert resource["datastore_contains_all_records_of_source_file"] + resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"]) + assert resource["datastore_contains_all_records_of_source_file"] def test_xloader_ignore_hash(self, cli, data): self.enqueue(jobs.xloader_data_into_datastore, [data]) diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index fad9d80c..0089052f 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -170,12 +170,11 @@ def get_xloader_user_apitoken(): method returns the api_token set in the config file and defaults to the site_user. """ - api_token = p.toolkit.config.get('ckanext.xloader.api_token', None) - if api_token: + api_token = p.toolkit.config.get('ckanext.xloader.api_token') + if api_token and api_token != 'NOT_SET': return api_token + raise p.toolkit.ValidationError({u'ckanext.xloader.api_token': u'NOT_SET, please provide valid api token'}) - site_user = p.toolkit.get_action('get_site_user')({'ignore_auth': True}, {}) - return site_user["apikey"] def _modify_url(input_url: str, base_url: str) -> str: From bc7d701de0a904e20f8d746a2c92d996830b0b69 Mon Sep 17 00:00:00 2001 From: Yan Rudenko Date: Mon, 3 Mar 2025 20:09:04 +0100 Subject: [PATCH 33/41] Support apitoken_header_name CKAN Config --- ckanext/xloader/jobs.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 321ec238..75b31ce6 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -52,6 +52,7 @@ # Retries can only occur in cases where the datastore entry exists, # so use the standard timeout RETRIED_JOB_TIMEOUT = config.get('ckanext.xloader.job_timeout', '3600') +APITOKEN_HEADER_NAME = config.get('apitoken_header_name', 'Authorization') # input = { @@ -316,7 +317,7 @@ def _download_resource_data(resource, data, api_key, logger): if resource.get('url_type') == 'upload': # If this is an uploaded file to CKAN, authenticate the request, # otherwise we won't get file from private resources - headers['Authorization'] = api_key + headers[APITOKEN_HEADER_NAME] = api_key # Add a constantly changing parameter to bypass URL caching. # If we're running XLoader, then either the resource has @@ -462,7 +463,7 @@ def callback_xloader_hook(result_url, api_key, job_dict): if ':' in api_key: header, key = api_key.split(':') else: - header, key = 'Authorization', api_key + header, key = APITOKEN_HEADER_NAME, api_key headers[header] = key try: From 5d79a462ec0282617f4c45fbe785f57dd21476e5 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Tue, 4 Mar 2025 09:04:23 +1000 Subject: [PATCH 34/41] fix: #244 include static webassets on packaging --- MANIFEST.in | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index 852a43d9..ab418699 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,3 +4,6 @@ include LICENSE include README.md include ckanext/xloader/config_declaration.yaml recursive-include ckanext/xloader/templates *.html +recursive-include ckanext/xloader/webassets *.css +recursive-include ckanext/xloader/webassets *.yml +recursive-include ckanext/xloader/webassets *.js \ No newline at end of file From 2864197699d875b610b4baa996c7bd8195380b81 Mon Sep 17 00:00:00 2001 From: William Dutton Date: Tue, 4 Mar 2025 13:04:04 +1000 Subject: [PATCH 35/41] chore: increment to 2.0.1 --- CHANGELOG | 9 +++++++++ pyproject.toml | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index faa8c2b2..30d92178 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,15 @@ CHANGELOG ========= See: https://github.com/ckan/ckanext-xloader/releases if this file has drifted. +2.0.1 2025-03-04 +================ + +## Fix + +* #244 Static webassets not included in package +* #245 support apitoken_header_name in 2.11.x. +* #241 loading R/W datasource resources via api (not hardcoded) + 2.0.0 2024-12-10 ================ diff --git a/pyproject.toml b/pyproject.toml index ce8f9eec..757c4af3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "ckanext-xloader" -version = "2.0.0" +version = "2.0.1" description = "Express Loader - quickly load data into CKAN DataStore" classifiers = [ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", From abb0545ab33209599f0b59a5973f7211003f99d3 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Tue, 4 Mar 2025 13:59:27 +1000 Subject: [PATCH 36/41] fix Flake8 complaints - Fix whitespace, indentation, unused variables, duplicate function - Refactor to extract identical function definitions --- .flake8 | 2 ++ ckanext/xloader/jobs.py | 8 +++--- ckanext/xloader/loader.py | 42 +++++++++++----------------- ckanext/xloader/plugin.py | 6 ++-- ckanext/xloader/tests/test_jobs.py | 3 ++ ckanext/xloader/tests/test_loader.py | 17 ----------- ckanext/xloader/tests/test_plugin.py | 6 ++-- 7 files changed, 32 insertions(+), 52 deletions(-) diff --git a/.flake8 b/.flake8 index 32068ca7..973dec8d 100644 --- a/.flake8 +++ b/.flake8 @@ -17,4 +17,6 @@ max-line-length=127 # List ignore rules one per line. ignore = + C901 + E501 W503 diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index 75b31ce6..5013627a 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -182,9 +182,9 @@ def xloader_data_into_datastore_(input, job_dict, logger): if hasattr(h, "datastore_rw_resource_url_types"): datastore_rw_resource_url_types = h.datastore_rw_resource_url_types() else: - #fallback for 2.10.x or older. + # fallback for 2.10.x or older. datastore_rw_resource_url_types = ['datastore'] - + if resource.get('url_type') in datastore_rw_resource_url_types: logger.info('Ignoring resource - R/W DataStore resources are ' 'managed with the Datastore API') @@ -267,7 +267,7 @@ def tabulator_load(): logger.warning('Load using COPY failed: %s', e) logger.info('Trying again with tabulator') tabulator_load() - except JobTimeoutException as e: + except JobTimeoutException: try: tmp_file.close() except FileNotFoundError: @@ -393,7 +393,7 @@ def _download_resource_data(resource, data, api_key, logger): raise HTTPError( message=err_message, status_code=None, request_url=url, response=None) - except JobTimeoutException as e: + except JobTimeoutException: tmp_file.close() logger.warning('Job timed out after %ss', RETRIED_JOB_TIMEOUT) raise JobError('Job timed out after {}s'.format(RETRIED_JOB_TIMEOUT)) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index bb9dda48..f45b579c 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -204,7 +204,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): if f['id'] in existing_info: f['info'] = existing_info[f['id']] f['strip_extra_white'] = existing_info[f['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[f['id']] \ - else existing_fields_by_headers[f['id']].get('strip_extra_white', True) + else existing_fields_by_headers[f['id']].get('strip_extra_white', True) ''' Delete or truncate existing datastore table before proceeding, @@ -222,39 +222,32 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): fields = [ {'id': header_name, 'type': 'text', - 'strip_extra_white': True,} + 'strip_extra_white': True} for header_name in headers] logger.info('Fields: %s', fields) + def _make_whitespace_stripping_iter(super_iter): + def strip_white_space_iter(): + for row in super_iter(): + if len(row) == len(fields): + for _index, _cell in enumerate(row): + # only strip white space if strip_extra_white is True + if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): + row[_index] = _cell.strip() + yield row + return strip_white_space_iter + save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter} try: with UnknownEncodingStream(csv_filepath, file_format, decoding_result, skip_rows=skip_rows) as stream: - super_iter = stream.iter - def strip_white_space_iter(): - for row in super_iter(): - if len(row) == len(fields): - for _index, _cell in enumerate(row): - # only strip white space if strip_extra_white is True - if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): - row[_index] = _cell.strip() - yield row - stream.iter = strip_white_space_iter + stream.iter = _make_whitespace_stripping_iter(stream.iter) stream.save(**save_args) except (EncodingError, UnicodeDecodeError): with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING, skip_rows=skip_rows) as stream: - super_iter = stream.iter - def strip_white_space_iter(): - for row in super_iter(): - if len(row) == len(fields): - for _index, _cell in enumerate(row): - # only strip white space if strip_extra_white is True - if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): - row[_index] = _cell.strip() - yield row - stream.iter = strip_white_space_iter + stream.iter = _make_whitespace_stripping_iter(stream.iter) stream.save(**save_args) csv_filepath = f_write.name @@ -284,9 +277,8 @@ def strip_white_space_iter(): raise LoaderError('Could not create the database table: {}' .format(e)) - - # datstore_active is switched on by datastore_create - TODO temporarily - # disable it until the load is complete + # datastore_active is switched on by datastore_create + # TODO temporarily disable it until the load is complete with engine.begin() as conn: _disable_fulltext_trigger(conn, resource_id) diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index 5c0084d9..75999127 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -91,9 +91,9 @@ def receive_validation_report(self, validation_report): res_dict = toolkit.get_action('resource_show')({'ignore_auth': True}, {'id': validation_report.get('resource_id')}) if (toolkit.asbool(toolkit.config.get('ckanext.xloader.validation.enforce_schema', True)) - or res_dict.get('schema', None)) and validation_report.get('status') != 'success': - # A schema is present, or required to be present - return + or res_dict.get('schema', None)) and validation_report.get('status') != 'success': + # A schema is present, or required to be present + return # if validation is running in async mode, it is running from the redis workers. # thus we need to do sync=True to have Xloader put the job at the front of the queue. sync = toolkit.asbool(toolkit.config.get(u'ckanext.validation.run_on_update_async', True)) diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py index c67f4fe6..0105789d 100644 --- a/ckanext/xloader/tests/test_jobs.py +++ b/ckanext/xloader/tests/test_jobs.py @@ -27,6 +27,7 @@ def get_response(download_url, headers): resp.headers = headers return resp + def get_large_response(download_url, headers): """Mock jobs.get_response() method to fake a large file.""" resp = Response() @@ -34,6 +35,7 @@ def get_large_response(download_url, headers): resp.headers = {'content-length': 2000000000} return resp + def get_large_data_response(download_url, headers): """Mock jobs.get_response() method.""" resp = Response() @@ -42,6 +44,7 @@ def get_large_data_response(download_url, headers): resp.headers = headers return resp + def _get_temp_files(dir='/tmp'): return [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))] diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 7ff73446..09f87887 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -1177,23 +1177,6 @@ def test_simple_large_file(self, Session): u"text", ] - def test_simple_large_file(self, Session): - csv_filepath = get_sample_filepath("simple-large.csv") - resource = factories.Resource() - resource_id = resource['id'] - loader.load_table( - csv_filepath, - resource_id=resource_id, - mimetype="text/csv", - logger=logger, - ) - assert self._get_column_types(Session, resource_id) == [ - u"int4", - u"tsvector", - u"numeric", - u"text", - ] - def test_with_mixed_types(self, Session): csv_filepath = get_sample_filepath("mixed_numeric_string_sample.csv") resource = factories.Resource() diff --git a/ckanext/xloader/tests/test_plugin.py b/ckanext/xloader/tests/test_plugin.py index f22dafbd..f6a0590f 100644 --- a/ckanext/xloader/tests/test_plugin.py +++ b/ckanext/xloader/tests/test_plugin.py @@ -81,7 +81,7 @@ def test_require_validation(self, monkeypatch): # TODO: test IPipeValidation assert not func.called # because of the validation_status not being `success` - func.called = None # reset + func.called = None # reset helpers.call_action( "resource_update", @@ -118,7 +118,7 @@ def test_enforce_validation_schema(self, monkeypatch): # TODO: test IPipeValidation assert not func.called # because of the schema being empty - func.called = None # reset + func.called = None # reset helpers.call_action( "resource_update", @@ -132,7 +132,7 @@ def test_enforce_validation_schema(self, monkeypatch): # TODO: test IPipeValidation assert not func.called # because of the validation_status not being `success` and there is a schema - func.called = None # reset + func.called = None # reset helpers.call_action( "resource_update", From 0605f6e6817964f81b931a48de17e0b68d65076b Mon Sep 17 00:00:00 2001 From: william dutton Date: Tue, 11 Mar 2025 13:55:48 +1000 Subject: [PATCH 37/41] chore: increment new version 2.1.0 including internal hostname for loader to use --- README.md | 23 +++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ccd74116..5e091308 100644 --- a/README.md +++ b/README.md @@ -437,6 +437,29 @@ Default value: `True` Controls whether or not a resource requires a Validation Schema to be present from the ckanext-validation plugin to be XLoadered. +#### ckanext.xloader.site_url +Provide an alternate site URL for the xloader_submit action. +This is useful, for example, when the site is running within a docker network. + +Note: This setting will not alter path. i.e ckan.root_path + +Example: + +``` +ckanext.xloader.site_url = http://ckan-dev:5000 +``` + +##### ckanext.xloader.site_url_ignore_path_regex +Provide the ability to ignore paths which can't be mapped to alternative site URL for resource access. +This is useful, for example, when the site is running within a docker network and the cdn front door has +Blob storage mapped to another path on the same domain. + +Example: + +``` +ckanext.xloader.site_url_ignore_path_regex = "(/PathToS3HostOriginIWantToGoDirectTo|/anotherPath)" +``` + ## Data Dictionary Fields #### strip_extra_white diff --git a/pyproject.toml b/pyproject.toml index 757c4af3..11521ef0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "ckanext-xloader" -version = "2.0.1" +version = "2.1.0" description = "Express Loader - quickly load data into CKAN DataStore" classifiers = [ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", From c1670106b0774a1340b8b976ba14c0509fc871a3 Mon Sep 17 00:00:00 2001 From: william dutton Date: Tue, 11 Mar 2025 14:09:57 +1000 Subject: [PATCH 38/41] fix: must include on.workflow_call to allow publish to call it --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3fe3b742..ed509e4e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,6 +5,7 @@ on: pull_request: branches: - master + workflow_call: jobs: lint: From b4555bb909a7cbc381f0359e0417c6df315f6bc6 Mon Sep 17 00:00:00 2001 From: william dutton Date: Tue, 11 Mar 2025 14:40:05 +1000 Subject: [PATCH 39/41] chore: uplift redis container from v3 to v7 as rq 2.0.0 drops below v4 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ed509e4e..76a03860 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -54,7 +54,7 @@ jobs: - 5432:5432 options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 redis: - image: redis:3 + image: redis:7 env: CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test CKAN_DATASTORE_WRITE_URL: postgresql://datastore_write:pass@postgres/datastore_test From 2eeb137ab47e30d42ebe121f61b5c31e57914b97 Mon Sep 17 00:00:00 2001 From: Chava Goldshtein Date: Thu, 31 Jul 2025 15:49:39 +0300 Subject: [PATCH 40/41] perf: implement chunked fulltext indexing for large datasets --- ckanext/xloader/loader.py | 102 +++++++++++++++++++++++++++++--------- 1 file changed, 79 insertions(+), 23 deletions(-) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index f45b579c..eff5deb4 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -659,30 +659,86 @@ def _enable_fulltext_trigger(connection, resource_id): .format(table=identifier(resource_id, True)))) -def _populate_fulltext(connection, resource_id, fields): - '''Populates the _full_text column. i.e. the same as datastore_run_triggers - but it runs in 1/9 of the time. - - The downside is that it reimplements the code that calculates the text to - index, breaking DRY. And its annoying to pass in the column names. - - fields: list of dicts giving the each column's 'id' (name) and 'type' - (text/numeric/timestamp) +def _get_rows_count_of_resource(connection, table): + count_query = ''' SELECT count(_id) from {table} '''.format(table=table) + results = connection.execute(count_query) + rows_count = int(results.first()[0]) + return rows_count + + +def _populate_fulltext(connection, resource_id, fields, logger): + '''Populates the _full_text column for full-text search functionality. + + This function creates a PostgreSQL tsvector (text search vector) for each row + by concatenating all non-system columns. It's equivalent to datastore_run_triggers + but runs approximately 9x faster by using direct SQL updates. + + To handle very large datasets (e.g., 4GB+ files with millions of rows), the update + operation is partitioned into chunks to prevent: + - Database statement timeouts + - Memory exhaustion + - Lock contention that could block other operations + - Transaction log overflow + + The chunking mechanism processes rows in batches based on their _id values, + with chunk size configurable via 'ckanext.xloader.search_update_chunks' + (default: 100,000 rows per chunk). + + Args: + connection: Database connection object + resource_id (str): The datastore table identifier + fields (list): List of dicts with column 'id' (name) and 'type' + (text/numeric/timestamp) + logger: Logger instance for progress tracking + + Note: + This reimplements CKAN's text indexing logic for performance, + breaking DRY principle but providing significant speed improvements. ''' - stmt = sa.update(sa.table(resource_id, sa.column("_full_text"))).values( - _full_text=sa.text("to_tsvector({})".format( - " || ' ' || ".join( - 'coalesce({}, \'\')'.format( - identifier(field['id']) - + ('::text' if field['type'] != 'text' else '') - ) - for field in fields - if not field['id'].startswith('_') - ) - )) - ) - - connection.execute(stmt) + try: + # Get total row count to determine chunking strategy + rows_count = _get_rows_count_of_resource(connection, identifier(resource_id)) + except Exception as e: + rows_count = '' + logger.info("Failed to get resource rows count: {} ".format(str(e))) + raise + + if rows_count: + # Configure chunk size - prevents timeouts and memory issues on large datasets + # Default 100,000 rows per chunk balances performance vs. resource usage + chunks = int(config.get('ckanext.xloader.search_update_chunks', 100000)) + + # Process table in chunks using _id range queries + # This approach ensures consistent chunk sizes and allows resuming if interrupted + for start in range(0, rows_count, chunks): + try: + # Build SQL to update _full_text column with concatenated searchable content + sql = \ + ''' + UPDATE {table} + SET _full_text = to_tsvector({cols}) WHERE _id BETWEEN {first} and {end}; + '''.format( + table=identifier(resource_id), + # Concatenate all user columns (excluding system columns starting with '_') + # coalesce() handles NULL values by converting them to empty strings + cols=" || ' ' || ".join( + 'coalesce({}, \'\')'.format( + identifier(field['id']) + + ('::text' if field['type'] != 'text' else '') # Cast non-text types + ) + for field in fields + if not field['id'].startswith('_') # Skip system columns like _id, _full_text + ), + first=start, + end=start + chunks + ) + connection.execute(sql) + logger.info("Indexed rows {first} to {end} of {total}".format( + first=start, end=min(start + chunks, rows_count), total=rows_count)) + except Exception as e: + # Log chunk-specific errors but continue processing remaining chunks + logger.error("Failed to index rows {first}-{end}: {error}".format( + first=start, end=start + chunks, error=str(e))) def calculate_record_count(resource_id, logger): From 6a9fecffb1b26bca36ca07ccd51f45063e48578e Mon Sep 17 00:00:00 2001 From: Chava Goldshtein Date: Sun, 10 Aug 2025 09:59:46 +0300 Subject: [PATCH 41/41] fix logger error and add search_update_chunks variable to config_declaration file --- ckanext/xloader/config_declaration.yaml | 11 +++++++++++ ckanext/xloader/loader.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index 3327c84b..4b529839 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -187,3 +187,14 @@ groups: they will also display "complete", "active", "inactive", and "unknown". type: bool required: false + - key: ckanext.xloader.search_update_chunks + default: 100000 + example: True + description: | + The number of rows to process in each batch when populating the full-text + search index. Chunked processing prevents database timeouts and memory + exhaustion when indexing very large datasets (4GB+ files with millions of rows). + Smaller values reduce memory usage but increase processing time. Larger values + improve performance but may cause timeouts on very large tables. + type: int + required: false diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index eff5deb4..23f9ea09 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -341,7 +341,7 @@ def strip_white_space_iter(): logger.info('Creating search index...') with engine.begin() as conn: - _populate_fulltext(conn, resource_id, fields=fields) + _populate_fulltext(conn, resource_id, fields=fields, logger=logger) logger.info('...search index created') return fields