From d14296da03bc0634894bda7be69a78e502204236 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 5 May 2026 15:28:59 +0100 Subject: [PATCH 1/9] test: make dataset expiry test check all fields --- tests/integration/test_dataset_expiry.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/integration/test_dataset_expiry.py b/tests/integration/test_dataset_expiry.py index d90bb8c..d847eb1 100644 --- a/tests/integration/test_dataset_expiry.py +++ b/tests/integration/test_dataset_expiry.py @@ -36,6 +36,11 @@ def test_dataset_expiry_after_72_hours_failed_downloads(get_and_clear_up_context dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")] assert len(datasets_in_bds) == 1 + + assert dataset["last_known_good_dataset_cached_dataset_xml_url"] is None + assert dataset["last_known_good_dataset_cached_dataset_xml_etag"] is None + assert dataset["last_known_good_dataset_cached_dataset_zip_url"] is None + assert dataset["last_known_good_dataset_cached_dataset_zip_etag"] is None assert dataset["last_known_good_dataset_downloaded"] is None assert dataset["last_known_good_dataset_hash"] is None assert dataset["last_known_good_dataset_hash_excluding_generated_timestamp"] is None From 312ed1cf3b7801cc088a17f57a8b4a4e161feeaf Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 5 May 2026 15:30:11 +0100 Subject: [PATCH 2/9] fix: ensure cache URL/ETag fields blanked on expiry This commit ensures that, when the cached copy of a dataset expires due to failure to download the dataset over the specified period of time, the cached URL and ETag fields are blanked out. 
Resolves https://github.com/IATI/bulk-data-service/issues/137 --- src/bulk_data_service/dataset_remover.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/bulk_data_service/dataset_remover.py b/src/bulk_data_service/dataset_remover.py index 3137183..c605321 100644 --- a/src/bulk_data_service/dataset_remover.py +++ b/src/bulk_data_service/dataset_remover.py @@ -77,6 +77,10 @@ def remove_download_for_expired_dataset( "last good download from Bulk Data Service".format(bds_dataset["id"], max_hours) ) + bds_dataset["last_known_good_dataset_cached_dataset_xml_url"] = None + bds_dataset["last_known_good_dataset_cached_dataset_xml_etag"] = None + bds_dataset["last_known_good_dataset_cached_dataset_zip_url"] = None + bds_dataset["last_known_good_dataset_cached_dataset_zip_etag"] = None bds_dataset["last_known_good_dataset_downloaded"] = None bds_dataset["last_known_good_dataset_hash"] = None bds_dataset["last_known_good_dataset_hash_excluding_generated_timestamp"] = None From d6eb1711c2a0781c2b54af935420bf763c3f5dd6 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:37:50 +0100 Subject: [PATCH 3/9] test: checks *_error_occurred flags created false This commit adds two tests which check that the creation of a dataset record sets the most_recent_*_attempt_error_occurred flags to false, and that after a successful check for a dataset the most_recent_head_attempt_error_occurred flag is false (the get flag was already checked). 
--- tests/integration/test_dataset_add.py | 2 ++ tests/unit/test_dataset_registration.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/tests/integration/test_dataset_add.py b/tests/integration/test_dataset_add.py index c17226f..bbf47e4 100644 --- a/tests/integration/test_dataset_add.py +++ b/tests/integration/test_dataset_add.py @@ -101,6 +101,8 @@ def test_add_downloadable_dataset_for_various_encodings( check_most_recent_http_attempt_for_success("get", datasets_in_bds[dataset_id]) + assert datasets_in_bds[dataset_id]["most_recent_head_attempt_error_occurred"] is False + check_last_known_good_dataset_values_are_set(datasets_in_bds[dataset_id]) check_dataset_fields( diff --git a/tests/unit/test_dataset_registration.py b/tests/unit/test_dataset_registration.py index be1ae91..00c9d77 100644 --- a/tests/unit/test_dataset_registration.py +++ b/tests/unit/test_dataset_registration.py @@ -5,6 +5,7 @@ import pytest +from bulk_data_service.dataset import create_empty_dataset from dataset_registration.iati_registry_ckan import clean_datasets_metadata, convert_datasets_metadata @@ -42,6 +43,12 @@ def test_incomplete_necessary_data_from_ckan(field_blanker, attribute_value): assert(len(ckan_datasets) == 0) +def test_create_empty_dataset_error_occurred_defaults_to_false(): + ds = create_empty_dataset() + assert ds["most_recent_head_attempt_error_occurred"] is False + assert ds["most_recent_get_attempt_error_occurred"] is False + + @pytest.mark.parametrize("resources_value", [None, [], {"url": None}]) def test_missing_url_from_ckan(resources_value): From 65f75a7056b1ba5e416ac7e85f0c87cfe650a38a Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:41:18 +0100 Subject: [PATCH 4/9] fix: adds db migrations to set error flag columns This DB migration both sets a default of false for the error flags and sets all existing null values to false (if there are any existing values with null, it is because the relevant check 
hasn't been done over the given time frame, so no error has occurred). --- db-migrations/20260505_01_7kh1j.rollback.sql | 5 +++++ db-migrations/20260505_01_7kh1j.sql | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 db-migrations/20260505_01_7kh1j.rollback.sql create mode 100644 db-migrations/20260505_01_7kh1j.sql diff --git a/db-migrations/20260505_01_7kh1j.rollback.sql b/db-migrations/20260505_01_7kh1j.rollback.sql new file mode 100644 index 0000000..73e7585 --- /dev/null +++ b/db-migrations/20260505_01_7kh1j.rollback.sql @@ -0,0 +1,5 @@ +alter table iati_datasets + alter column most_recent_head_attempt_error_occurred drop default; + +alter table iati_datasets + alter column most_recent_get_attempt_error_occurred drop default; diff --git a/db-migrations/20260505_01_7kh1j.sql b/db-migrations/20260505_01_7kh1j.sql new file mode 100644 index 0000000..1e731e4 --- /dev/null +++ b/db-migrations/20260505_01_7kh1j.sql @@ -0,0 +1,16 @@ +-- +-- depends: 20250827_01_Dt6Ow + +alter table iati_datasets + alter column most_recent_head_attempt_error_occurred set default false; + +alter table iati_datasets + alter column most_recent_get_attempt_error_occurred set default false; + +update iati_datasets + set most_recent_head_attempt_error_occurred = false + where most_recent_head_attempt_error_occurred is null; + +update iati_datasets + set most_recent_get_attempt_error_occurred = false + where most_recent_get_attempt_error_occurred is null; From 7584f2d3477361d3db7108e206a14f82cd60f514 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:43:00 +0100 Subject: [PATCH 5/9] fix: create new datasets with error flags = false This commit changes the dataset creation code so that a new dataset object has its error occurred flags set to false. Resolves #136. 
--- src/bulk_data_service/dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bulk_data_service/dataset.py b/src/bulk_data_service/dataset.py index d28370b..d4ac228 100644 --- a/src/bulk_data_service/dataset.py +++ b/src/bulk_data_service/dataset.py @@ -43,9 +43,11 @@ def create_empty_dataset() -> dict[str, Any]: empty_ds = { k: None for k in DATASET_REGISTRATION_FIELDS + DATASET_NON_REGISTRATION_FIELDS - } # type: dict[str, str | None] + } # type: dict[str, str | bool | None] empty_ds["most_recent_get_attempt_error_details"] = make_http_attempt_error_details() + empty_ds["most_recent_get_attempt_error_occurred"] = False empty_ds["most_recent_head_attempt_error_details"] = make_http_attempt_error_details() + empty_ds["most_recent_head_attempt_error_occurred"] = False return empty_ds From 70e3c74aacc951edaea9c2ddd9b257ecf84e7fe0 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:50:16 +0100 Subject: [PATCH 6/9] feat: update IATI Design System to 4.9.0 --- web/404.html | 2 +- web/index-template.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/web/404.html b/web/404.html index 494ec48..d1f2bba 100644 --- a/web/404.html +++ b/web/404.html @@ -4,7 +4,7 @@ IATI Bulk Data Service - + diff --git a/web/index-template.html b/web/index-template.html index 874d784..5ca98a1 100644 --- a/web/index-template.html +++ b/web/index-template.html @@ -4,7 +4,7 @@ IATI Bulk Data Service - + From 9c92c2c408362288e707c8791a50d996e1a5f485 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:50:31 +0100 Subject: [PATCH 7/9] docs: improve README --- README.md | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 11a87e3..6da6280 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ The `.env` file is used when running things locally to store environment 
variabl Running the app successfully requires a Postgres database and a connection to an Azure blob storage account. There is a docker compose setup which can be used to start an instance of each service locally, that can be run with: -``` +```bash docker compose up -d ``` @@ -69,18 +69,35 @@ The example `.env` file (`.env-example`) is configured to use the above docker c Once the docker compose setup is running, you can run the dataset updater part of the app with (this will download the datasets and upload them to Azurite): -``` +```bash dotenv run python src/iati_bulk_data_service.py -- --operation checker --single-run --run-for-n-datasets=50 ``` You can run the zipper operation with: -``` +```bash dotenv run python src/iati_bulk_data_service.py -- --operation zipper --single-run ``` It will store the ZIP files in the directory defined in the `ZIP_WORKING_DIR` environment variable. +The full range of command line arguments is listed below: + +``` +usage: iati_bulk_data_service.py [-h] --operation {checker,zipper,registry-changes-processor} [--single-run] [--run-for-n-datasets RUN_FOR_N_DATASETS] [--run-for-single-reporting-org RUN_FOR_SINGLE_REPORTING_ORG] [--skip-safety] + +options: + -h, --help show this help message and exit + --operation {checker,zipper,registry-changes-processor} + Operation to run: checker, zipper, registry-changes-processor + --single-run Perform a single run, then exit + --run-for-n-datasets RUN_FOR_N_DATASETS + Run on the first N datasets from registration service (useful for testing) + --run-for-single-reporting-org RUN_FOR_SINGLE_REPORTING_ORG + Run only for the datasets belonging to the specified reporting org short name (useful for testing) + --skip-safety Skip safety checks during the run (useful for testing) +``` + To shutdown the docker compose setup, use (the Azure Service Bus emulator appears to be a bit sensitive to Ctrl-C shutdowns, so always best to shutdown with `docker compose down`): From 
a71b39f82c7854196c39810c7067a61848a4afa3 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:51:27 +0100 Subject: [PATCH 8/9] docs: update CHANGELOG --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a464c7..c89b3fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ### Removed +## [1.4.5] - 2026-05-06 + +### Changed + +- Updated IATI Design System to 4.9.0 + +### Fixed + +- Bug where the dataset's cached URLs and ETags were not being blanked after dataset expiry. (Resolves #137) +- Bug where `most_recent_head_attempt.error_occurred` was being set to `null` instead of `false`. (Resolves #136) + ## [1.4.4] - 2026-04-22 ### Added From 3d2e52afc12590fbbe0c91936db528e3faae408d Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:51:35 +0100 Subject: [PATCH 9/9] build: bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b7ab0e5..4a485dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "bulk-data-service" -version = "1.4.4" +version = "1.4.5" requires-python = ">= 3.12.6" readme = "README.md" dependencies = [