From d14296da03bc0634894bda7be69a78e502204236 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 5 May 2026 15:28:59 +0100 Subject: [PATCH 01/18] test: make dataset expiry test check all fields --- tests/integration/test_dataset_expiry.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/integration/test_dataset_expiry.py b/tests/integration/test_dataset_expiry.py index d90bb8c..d847eb1 100644 --- a/tests/integration/test_dataset_expiry.py +++ b/tests/integration/test_dataset_expiry.py @@ -36,6 +36,11 @@ def test_dataset_expiry_after_72_hours_failed_downloads(get_and_clear_up_context dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")] assert len(datasets_in_bds) == 1 + + assert dataset["last_known_good_dataset_cached_dataset_xml_url"] is None + assert dataset["last_known_good_dataset_cached_dataset_xml_etag"] is None + assert dataset["last_known_good_dataset_cached_dataset_zip_url"] is None + assert dataset["last_known_good_dataset_cached_dataset_zip_etag"] is None assert dataset["last_known_good_dataset_downloaded"] is None assert dataset["last_known_good_dataset_hash"] is None assert dataset["last_known_good_dataset_hash_excluding_generated_timestamp"] is None From 312ed1cf3b7801cc088a17f57a8b4a4e161feeaf Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 5 May 2026 15:30:11 +0100 Subject: [PATCH 02/18] fix: ensure cache URL/ETag fields blanked expiry This commit ensures that when the cached copy of a dataset expires due to failure to download the dataset over the specified period of time that the cached URL and ETag fields are blanked out. 
Resolves https://github.com/IATI/bulk-data-service/issues/137 --- src/bulk_data_service/dataset_remover.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/bulk_data_service/dataset_remover.py b/src/bulk_data_service/dataset_remover.py index 3137183..c605321 100644 --- a/src/bulk_data_service/dataset_remover.py +++ b/src/bulk_data_service/dataset_remover.py @@ -77,6 +77,10 @@ def remove_download_for_expired_dataset( "last good download from Bulk Data Service".format(bds_dataset["id"], max_hours) ) + bds_dataset["last_known_good_dataset_cached_dataset_xml_url"] = None + bds_dataset["last_known_good_dataset_cached_dataset_xml_etag"] = None + bds_dataset["last_known_good_dataset_cached_dataset_zip_url"] = None + bds_dataset["last_known_good_dataset_cached_dataset_zip_etag"] = None bds_dataset["last_known_good_dataset_downloaded"] = None bds_dataset["last_known_good_dataset_hash"] = None bds_dataset["last_known_good_dataset_hash_excluding_generated_timestamp"] = None From d6eb1711c2a0781c2b54af935420bf763c3f5dd6 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:37:50 +0100 Subject: [PATCH 03/18] test: checks *_error_occurred flags created false This commit adds two tests which checks that the creation of a dataset record sets the most_recent_*_attempt_error_occurred flags to false, and that after a successful check for a dataset the most_recent_head_attempt_error_occurred flag is false (the get flag was already checked). 
--- tests/integration/test_dataset_add.py | 2 ++ tests/unit/test_dataset_registration.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/tests/integration/test_dataset_add.py b/tests/integration/test_dataset_add.py index c17226f..bbf47e4 100644 --- a/tests/integration/test_dataset_add.py +++ b/tests/integration/test_dataset_add.py @@ -101,6 +101,8 @@ def test_add_downloadable_dataset_for_various_encodings( check_most_recent_http_attempt_for_success("get", datasets_in_bds[dataset_id]) + assert datasets_in_bds[dataset_id]["most_recent_head_attempt_error_occurred"] is False + check_last_known_good_dataset_values_are_set(datasets_in_bds[dataset_id]) check_dataset_fields( diff --git a/tests/unit/test_dataset_registration.py b/tests/unit/test_dataset_registration.py index be1ae91..00c9d77 100644 --- a/tests/unit/test_dataset_registration.py +++ b/tests/unit/test_dataset_registration.py @@ -5,6 +5,7 @@ import pytest +from bulk_data_service.dataset import create_empty_dataset from dataset_registration.iati_registry_ckan import clean_datasets_metadata, convert_datasets_metadata @@ -42,6 +43,12 @@ def test_incomplete_necessary_data_from_ckan(field_blanker, attribute_value): assert(len(ckan_datasets) == 0) +def test_create_empty_dataset_error_occurred_defaults_to_false(): + ds = create_empty_dataset() + assert ds["most_recent_head_attempt_error_occurred"] is False + assert ds["most_recent_get_attempt_error_occurred"] is False + + @pytest.mark.parametrize("resources_value", [None, [], {"url": None}]) def test_missing_url_from_ckan(resources_value): From 65f75a7056b1ba5e416ac7e85f0c87cfe650a38a Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:41:18 +0100 Subject: [PATCH 04/18] fix: adds db migrations to set error flag columns This DB migration both sets a default of false for the error flags and sets all existing null values to false (if there are any existing values with null, it is because the relevant check 
hasn't been done over given the time frame, so no error has occurred). --- db-migrations/20260505_01_7kh1j.rollback.sql | 5 +++++ db-migrations/20260505_01_7kh1j.sql | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 db-migrations/20260505_01_7kh1j.rollback.sql create mode 100644 db-migrations/20260505_01_7kh1j.sql diff --git a/db-migrations/20260505_01_7kh1j.rollback.sql b/db-migrations/20260505_01_7kh1j.rollback.sql new file mode 100644 index 0000000..73e7585 --- /dev/null +++ b/db-migrations/20260505_01_7kh1j.rollback.sql @@ -0,0 +1,5 @@ +alter table iati_datasets + alter column most_recent_head_attempt_error_occurred drop default; + +alter table iati_datasets + alter column most_recent_get_attempt_error_occurred drop default; diff --git a/db-migrations/20260505_01_7kh1j.sql b/db-migrations/20260505_01_7kh1j.sql new file mode 100644 index 0000000..1e731e4 --- /dev/null +++ b/db-migrations/20260505_01_7kh1j.sql @@ -0,0 +1,16 @@ +-- +-- depends: 20250827_01_Dt6Ow + +alter table iati_datasets + alter column most_recent_head_attempt_error_occurred set default false; + +alter table iati_datasets + alter column most_recent_get_attempt_error_occurred set default false; + +update iati_datasets + set most_recent_head_attempt_error_occurred = false + where most_recent_head_attempt_error_occurred is null; + +update iati_datasets + set most_recent_get_attempt_error_occurred = false + where most_recent_get_attempt_error_occurred is null; From 7584f2d3477361d3db7108e206a14f82cd60f514 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:43:00 +0100 Subject: [PATCH 05/18] fix: create new datasets with error flags = false This commit changes the dataset creation code so that a new dataset object has its error occurred flags set to false. Resolves #136. 
--- src/bulk_data_service/dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bulk_data_service/dataset.py b/src/bulk_data_service/dataset.py index d28370b..d4ac228 100644 --- a/src/bulk_data_service/dataset.py +++ b/src/bulk_data_service/dataset.py @@ -43,9 +43,11 @@ def create_empty_dataset() -> dict[str, Any]: empty_ds = { k: None for k in DATASET_REGISTRATION_FIELDS + DATASET_NON_REGISTRATION_FIELDS - } # type: dict[str, str | None] + } # type: dict[str, str | bool | None] empty_ds["most_recent_get_attempt_error_details"] = make_http_attempt_error_details() + empty_ds["most_recent_get_attempt_error_occurred"] = False empty_ds["most_recent_head_attempt_error_details"] = make_http_attempt_error_details() + empty_ds["most_recent_head_attempt_error_occurred"] = False return empty_ds From 70e3c74aacc951edaea9c2ddd9b257ecf84e7fe0 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:50:16 +0100 Subject: [PATCH 06/18] feat: update IATI Design System to 4.9.0 --- web/404.html | 2 +- web/index-template.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/web/404.html b/web/404.html index 494ec48..d1f2bba 100644 --- a/web/404.html +++ b/web/404.html @@ -4,7 +4,7 @@ IATI Bulk Data Service - + diff --git a/web/index-template.html b/web/index-template.html index 874d784..5ca98a1 100644 --- a/web/index-template.html +++ b/web/index-template.html @@ -4,7 +4,7 @@ IATI Bulk Data Service - + From 9c92c2c408362288e707c8791a50d996e1a5f485 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:50:31 +0100 Subject: [PATCH 07/18] docs: improve README --- README.md | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 11a87e3..6da6280 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ The `.env` file is used when running things locally to store environment 
variabl Running the app successfully requires a Postgres database and a connection to an Azure blob storage account. There is a docker compose setup which can be used to start an instance of each service locally, that can be run with: -``` +```bash docker compose up -d ``` @@ -69,18 +69,35 @@ The example `.env` file (`.env-example`) is configured to use the above docker c Once the docker compose setup is running, you can run the dataset updater part of the app with (this will download the datasets and upload them to Azurite): -``` +```bash dotenv run python src/iati_bulk_data_service.py -- --operation checker --single-run --run-for-n-datasets=50 ``` You can run the zipper operation with: -``` +```bash dotenv run python src/iati_bulk_data_service.py -- --operation zipper --single-run ``` It will store the ZIP files in the directory defined in the `ZIP_WORKING_DIR` environment variable. +The full range of command line arguments is listed below: + +``` +usage: iati_bulk_data_service.py [-h] --operation {checker,zipper,registry-changes-processor} [--single-run] [--run-for-n-datasets RUN_FOR_N_DATASETS] [--run-for-single-reporting-org RUN_FOR_SINGLE_REPORTING_ORG] [--skip-safety] + +options: + -h, --help show this help message and exit + --operation {checker,zipper,registry-changes-processor} + Operation to run: checker, downloader, registry-changes-processor + --single-run Perform a single run, then exit + --run-for-n-datasets RUN_FOR_N_DATASETS + Run on the first N datasets from registration service (useful for testing) + --run-for-single-reporting-org RUN_FOR_SINGLE_REPORTING_ORG + Run only for the datasets belonging to the specified reporting org short name (useful for testing) + --skip-safety Skip safety checks during the run (useful for testing) +``` + To shutdown the docker compose setup, use (the Azure Service Bus emulator appears to be a bit sensitive to Ctrl-C shutdowns, so always best to shutdown with `docker compose down`): From 
a71b39f82c7854196c39810c7067a61848a4afa3 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:51:27 +0100 Subject: [PATCH 08/18] docs: update CHANGELOG --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a464c7..c89b3fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ### Removed +## [1.4.5] - 2026-05-06 + +### Changed + +- Updated IATI Design System to 4.9.0 + +### Fixed + +- Bug where the dataset's cached URLs were not being blanked after dataset expiry. (Resolves #137) +- Bug where `most_recent_head_attempt.error_occurred` was being set to `null` instead of `false`. (Resolves #136). + ## [1.4.4] - 2026-04-22 ### Added From 3d2e52afc12590fbbe0c91936db528e3faae408d Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Wed, 6 May 2026 09:51:35 +0100 Subject: [PATCH 09/18] build: bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b7ab0e5..4a485dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "bulk-data-service" -version = "1.4.4" +version = "1.4.5" requires-python = ">= 3.12.6" readme = "README.md" dependencies = [ From 741d82b83bc650986b3b2ce2d40ae4f12e4be3bd Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Mon, 11 May 2026 13:33:40 +0100 Subject: [PATCH 10/18] ci: deploy to a dedicated vnet with subnet This alters the deploy to deploy to a dedicated Azure vnet and subnet, which have a dedicated IP attached. These are created outside the CI/CD pipeline, so the IP remains fixed for as long as possible. 
--- .github/workflows/build-and-deploy-job.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-deploy-job.yml b/.github/workflows/build-and-deploy-job.yml index dd0cc05..f16022d 100644 --- a/.github/workflows/build-and-deploy-job.yml +++ b/.github/workflows/build-and-deploy-job.yml @@ -32,6 +32,7 @@ jobs: echo "CONTAINER_INSTANCE_BASE_NAME=aci-${APP_NAME}" >> ${GITHUB_ENV} echo "RESOURCE_GROUP_BASE_NAME=rg-${APP_NAME}" >> ${GITHUB_ENV} echo "STORAGE_ACCOUNT_NAME=sa${APP_NAME//-/}$TARGET_ENVIRONMENT" >> ${GITHUB_ENV} + echo "APP_NAME=${APP_NAME}" >> ${GITHUB_ENV} - name: 'Print calculated environment variables' run: | @@ -39,7 +40,7 @@ jobs: echo $CONTAINER_INSTANCE_BASE_NAME echo $RESOURCE_GROUP_BASE_NAME echo $STORAGE_ACCOUNT_NAME - + echo $APP_NAME - name: 'Checkout GitHub Action' uses: actions/checkout@v4 @@ -137,6 +138,8 @@ jobs: az -v az container create --debug \ --resource-group "${{ env.RESOURCE_GROUP_BASE_NAME }}-${{ env.TARGET_ENVIRONMENT }}" \ + --vnet "${{ env.APP_NAME }}-${{ env.TARGET_ENVIRONMENT }}-vnet" \ + --subnet "${{ env.APP_NAME }}-${{ env.TARGET_ENVIRONMENT }}-subnet" \ --file ./azure-deployment/azure-resource-manager-deployment-manifest.yml - name: 'Re-generate the website links' From 0cbfaedc314dbe190958057758fd21ad52e9b2bd Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Mon, 11 May 2026 13:55:25 +0100 Subject: [PATCH 11/18] feat: script to create vnets for public IP --- azure-provision/create-vnets-public-ips.sh | 45 ++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100755 azure-provision/create-vnets-public-ips.sh diff --git a/azure-provision/create-vnets-public-ips.sh b/azure-provision/create-vnets-public-ips.sh new file mode 100755 index 0000000..70212d0 --- /dev/null +++ b/azure-provision/create-vnets-public-ips.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +set -o errexit # abort on nonzero exitstatus +set -o nounset # abort on 
unbound variable +set -o pipefail # don't hide errors within pipes + +# This script creates the virtual networks, subnets and public IPs for the bulk data service. + +RESOURCE_GROUP_NAME="rg-bulk-data-service-vnets" +LOCATION="uksouth" + +az group create --name "$RESOURCE_GROUP_NAME" --location "$LOCATION" + +for ENV in dev prod; do + az network vnet create --resource-group "$RESOURCE_GROUP_NAME" \ + --name "bulk-data-service-${ENV}-vnet" \ + --address-prefix 10.0.0.0/16 \ + --subnet-name "bulk-data-service-${ENV}-subnet" \ + --subnet-prefix 10.0.1.0/24 + + az network vnet subnet update --resource-group "$RESOURCE_GROUP_NAME" \ + --vnet-name "bulk-data-service-${ENV}-vnet" \ + --name "bulk-data-service-${ENV}-subnet" \ + --delegation Microsoft.ContainerInstance/containerGroups + + az network public-ip create \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --name "bulk-data-service-${ENV}-public-ip" \ + --sku Standard \ + --allocation-method Static \ + --location "$LOCATION" + + az network nat gateway create \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --name "bulk-data-service-${ENV}-nat-gateway" \ + --location "$LOCATION" \ + --public-ip-addresses "bulk-data-service-${ENV}-public-ip" \ + --idle-timeout 10 + + az network vnet subnet update \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --vnet-name "bulk-data-service-${ENV}-vnet" \ + --name "bulk-data-service-${ENV}-subnet" \ + --nat-gateway "bulk-data-service-${ENV}-nat-gateway" +done From 680329cfa4c2f8dc0cf4070e09c634d3bf8cb7af Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Mon, 11 May 2026 15:41:01 +0100 Subject: [PATCH 12/18] fix(ci): add missing env var for MQ --- .github/workflows/build-and-deploy-job.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-and-deploy-job.yml b/.github/workflows/build-and-deploy-job.yml index f16022d..40897da 100644 --- a/.github/workflows/build-and-deploy-job.yml +++ 
b/.github/workflows/build-and-deploy-job.yml @@ -106,6 +106,7 @@ jobs: LOG_WORKSPACE_KEY: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'LOG_WORKSPACE_KEY')] }} # Variables which configure the app + AZURE_SERVICE_BUS_DATASET_CHECK_RESULTS_TOPIC_NAME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_DATASET_CHECK_RESULTS_TOPIC_NAME')] }} AZURE_SERVICE_BUS_REGISTRY_SUB_NAME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_REGISTRY_SUB_NAME')] }} AZURE_SERVICE_BUS_REGISTRY_TOPIC_NAME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_REGISTRY_TOPIC_NAME')] }} AZURE_SERVICE_BUS_WAIT_TIME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_WAIT_TIME')] }} From 180eb3bc576ea298685c7c37104c2010d998062f Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Mon, 11 May 2026 14:21:58 +0100 Subject: [PATCH 13/18] build: bump version number --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4a485dd..867b89a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "bulk-data-service" -version = "1.4.5" +version = "1.4.6" requires-python = ">= 3.12.6" readme = "README.md" dependencies = [ From cdad6ad98db3681a119b7ece821a491aebd27e8d Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Mon, 11 May 2026 13:55:57 +0100 Subject: [PATCH 14/18] docs: update README, CHANGELOG - vnet setup Also adds a missing env var for the MQ setup. --- CHANGELOG.md | 10 ++++++++++ README.md | 12 ++++++++++++ 2 files changed, 22 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c89b3fe..41df74e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). 
### Removed +## [1.4.6] - 2026-05-11 + +### Changed + +- Updated deploy to use dedicated vnet & subnet + +### Fixed + +- Added env var for the MQ topic name to the GitHub workflow. + ## [1.4.5] - 2026-05-06 ### Changed diff --git a/README.md b/README.md index 6da6280..fedeecb 100644 --- a/README.md +++ b/README.md @@ -239,6 +239,8 @@ pytest-watcher . ### Initial Provisioning +#### Bulk Data Service App + You can create an Azure-based instance of Bulk Data Service using the `azure-create-resources.sh` script. It must be run from the root of the repository, and it requires (i) the environment variable `BDS_DB_ADMIN_PASSWORD` to be set with the password for the database, and (ii) a single parameter which is the name of the environment/instance. For instance, the following command will create a dev instance: ```bash @@ -249,6 +251,16 @@ This will create a resource group on Azure called `rg-bulk-data-service-dev`, an At the end of its run, the `azure-create-resources.sh` script will print out various secrets which need to be added to Github Actions. +**NOTE**: This is only really useful for temporary deployment or initial setup; once you're setup with CI/CD, the GitHub action does all this. + +#### Bulk Data Service Network and Public IP + +The Bulk Data Service is deployed to a dedicated vnet with subnet and attached NAT Gateway which has a public IP. To ensure the IP remains, these are not destroyed and re-created on every release (like the Azure Container Instances are). To create the networks and public IPs for dev and production, run: + +```bash +./azure-provision/create-vnets-public-ips.sh +``` + ### Deployment - Versioning The app version is set in `pyproject.toml`, and this is read by the app to use in the `User-Agent` header. When making a new release, set the version here to the appropriate value. Then, when releasing the app using the normal IATI Python app deployment process, choose the tag name to match the version chosen. 
From 35738fe6515273008702f7eb270774e5c15ff91a Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 12 May 2026 10:09:33 +0100 Subject: [PATCH 15/18] fix: alters public ip deploy to use full names The deploy to a public ip requires full names for the vnet and subnet because they are on a different resource group. This has required bringing in the AZURE_SUBSCRIPTION_ID from the GitHub org secrets. --- .github/workflows/build-and-deploy-job.yml | 10 +++++++--- .../azure-resource-manager-deployment-template.yml | 5 +++-- azure-deployment/generate-manifest-from-template.sh | 9 +++++---- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build-and-deploy-job.yml b/.github/workflows/build-and-deploy-job.yml index 40897da..1553731 100644 --- a/.github/workflows/build-and-deploy-job.yml +++ b/.github/workflows/build-and-deploy-job.yml @@ -1,7 +1,7 @@ --- name: Generic build and deploy (called by other workflows) -on: # yamllint disable-line rule:truthy +on: # yamllint disable-line rule:truthy workflow_call: inputs: APP_NAME: @@ -25,6 +25,8 @@ jobs: ACR_USERNAME: ${{ secrets.ACR_USERNAME }} ACR_PASSWORD: ${{ secrets.ACR_PASSWORD }} + AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + steps: - name: 'Generate/build derived environment variables' run: | @@ -33,6 +35,7 @@ jobs: echo "RESOURCE_GROUP_BASE_NAME=rg-${APP_NAME}" >> ${GITHUB_ENV} echo "STORAGE_ACCOUNT_NAME=sa${APP_NAME//-/}$TARGET_ENVIRONMENT" >> ${GITHUB_ENV} echo "APP_NAME=${APP_NAME}" >> ${GITHUB_ENV} + echo "AZURE_SUBSCRIPTION_ID=${AZURE_SUBSCRIPTION_ID}" >> ${GITHUB_ENV} - name: 'Print calculated environment variables' run: | @@ -41,6 +44,7 @@ jobs: echo $RESOURCE_GROUP_BASE_NAME echo $STORAGE_ACCOUNT_NAME echo $APP_NAME + echo $AZURE_SUBSCRIPTION_ID - name: 'Checkout GitHub Action' uses: actions/checkout@v4 @@ -139,8 +143,8 @@ jobs: az -v az container create --debug \ --resource-group "${{ env.RESOURCE_GROUP_BASE_NAME }}-${{ 
env.TARGET_ENVIRONMENT }}" \ - --vnet "${{ env.APP_NAME }}-${{ env.TARGET_ENVIRONMENT }}-vnet" \ - --subnet "${{ env.APP_NAME }}-${{ env.TARGET_ENVIRONMENT }}-subnet" \ + --vnet "/subscriptions/${{ env.AZURE_SUBSCRIPTION_ID }}/resourceGroups/rg-${{ env.APP_NAME }}-vnets/providers/Microsoft.Network/virtualNetworks/${{ env.APP_NAME }}-${{ env.TARGET_ENVIRONMENT }}-vnet" \ + --subnet "/subscriptions/${{ env.AZURE_SUBSCRIPTION_ID }}/resourceGroups/rg-${{ env.APP_NAME }}-vnets/providers/Microsoft.Network/virtualNetworks/${{ env.APP_NAME }}-${{ env.TARGET_ENVIRONMENT }}-vnet/subnets/${{ env.APP_NAME }}-${{ env.TARGET_ENVIRONMENT }}-subnet" \ --file ./azure-deployment/azure-resource-manager-deployment-manifest.yml - name: 'Re-generate the website links' diff --git a/azure-deployment/azure-resource-manager-deployment-template.yml b/azure-deployment/azure-resource-manager-deployment-template.yml index 6583bff..af9624c 100644 --- a/azure-deployment/azure-resource-manager-deployment-template.yml +++ b/azure-deployment/azure-resource-manager-deployment-template.yml @@ -105,8 +105,9 @@ properties: # Properties of container group requests: cpu: 1.0 memoryInGB: 0.5 + subnetIds: + - id: "/subscriptions/#AZURE_SUBSCRIPTION_ID#/resourceGroups/rg-#APP_NAME#-vnets/providers/Microsoft.Network/virtualNetworks/#APP_NAME#-#TARGET_ENVIRONMENT#-vnet/subnets/#APP_NAME#-#TARGET_ENVIRONMENT#-subnet" ipAddress: - type: "public" - dnsNameLabel: "#APP_NAME#-#TARGET_ENVIRONMENT#" + type: "private" ports: - port: 9158 diff --git a/azure-deployment/generate-manifest-from-template.sh b/azure-deployment/generate-manifest-from-template.sh index fa97660..c828d68 100755 --- a/azure-deployment/generate-manifest-from-template.sh +++ b/azure-deployment/generate-manifest-from-template.sh @@ -6,9 +6,9 @@ # by the generic 'build-and-deploy' Github action if [ "$LOCAL_DEPLOY" == "true" ]; then - echo "Deploying from local environment..." 
- source ./azure-deployment/manual-azure-deploy-secrets.env - source ./azure-deployment/manual-azure-deploy-variables.env + echo "Deploying from local environment..." + source ./azure-deployment/manual-azure-deploy-secrets.env + source ./azure-deployment/manual-azure-deploy-variables.env fi # Copy the template to the manifest @@ -21,6 +21,8 @@ sed -i "s^#APP_NAME#^$APP_NAME^g" ./azure-deployment/azure-resource-manager-depl sed -i "s^#TARGET_ENVIRONMENT#^$TARGET_ENVIRONMENT^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i "s^#DOCKER_IMAGE_TAG#^$DOCKER_IMAGE_TAG^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml +sed -i ''s^#AZURE_SUBSCRIPTION_ID#^$AZURE_SUBSCRIPTION_ID^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml + sed -i ''s^#ACR_LOGIN_SERVER#^$ACR_LOGIN_SERVER^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#ACR_USERNAME#^$ACR_USERNAME^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#ACR_PASSWORD#^$ACR_PASSWORD^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml @@ -38,7 +40,6 @@ sed -i ''s^#DB_NAME#^$DB_NAME^g'' ./azure-deployment/azure-resource-manager-depl sed -i ''s^#DB_SSL_MODE#^$DB_SSL_MODE^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#DB_CONNECTION_TIMEOUT#^$DB_CONNECTION_TIMEOUT^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml - # Variables which configure the behaviour of the Bulk Data Service sed -i ''s^#DATA_REGISTRATION#^$DATA_REGISTRATION^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml From c43dc6cc027472850f36e83948741c69d100bdc8 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 12 May 2026 11:14:46 +0100 Subject: [PATCH 16/18] feat: manual deploy scripts This script builds and deploys to Azure from a local machine. 
It was removed from the repository once the GitHub CI/CD pipeline was set up, but I'm restoring it here because it's useful for testing significant changes to the deployment setup without having to keep re-triggering the GitHub workflow. --- .../manual-azure-deploy-from-local.sh | 91 +++++++++++++++++++ .../manual-azure-deploy-secrets-example.env | 31 +++++++ .../manual-azure-deploy-variables-example.env | 77 ++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100755 azure-deployment/manual-azure-deploy-from-local.sh create mode 100644 azure-deployment/manual-azure-deploy-secrets-example.env create mode 100644 azure-deployment/manual-azure-deploy-variables-example.env diff --git a/azure-deployment/manual-azure-deploy-from-local.sh b/azure-deployment/manual-azure-deploy-from-local.sh new file mode 100755 index 0000000..dac1d61 --- /dev/null +++ b/azure-deployment/manual-azure-deploy-from-local.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash + +# This script is for deploying the Bulk Data Service to Azure from a local machine, without using GitHub Actions. +# It is useful for testing significant changes to the Azure manifest and deployment procedure, which for dev and +# prod is normally run through GitHub Actions. + +# NOTE: You will need to fill in the AZURE_SUBSCRIPTION_ID variable below before using. + +set -uo pipefail + +if [ ! -v "1" ]; then + echo "usage: $0 TARGET_ENVIRONMENT" + echo " TARGET_ENVIRONMENT should likely be 'test', 'dev', or 'prod'" + exit 1 +fi + +if [ ! -d ".git" ]; then + echo "$0: script must be run from the root of the bulk-data-service repository" + exit 1 +fi + +if [ ! -f "./azure-deployment/manual-azure-deploy-secrets.env" ]; then + echo "$0: there must be a file 'manual-azure-deploy-secrets.env' in" + echo "'azure-deployment' containing the secrets. See the examples in manual-azure-deploy-secrets-example.env'" + exit 1 +fi + +if [ ! 
-f "./azure-deployment/manual-azure-deploy-variables.env" ]; then + echo "$0: there must be a file 'manual-azure-deploy-variables.env' in" + echo "'azure-deployment' containing the config variables. See example: manual-azure-deploy-variables-example.env'" + exit 1 +fi + +(git remote -v 2>/dev/null | grep "IATI/bulk-data-service.git" >/dev/null) || ( + echo "$0: script must be run from the root of the bulk-data-service repository" + exit 1 +) + +. ./azure-deployment/manual-azure-deploy-secrets.env + +AZURE_SUBSCRIPTION_ID=" **** FILL IN **** " + +TARGET_ENVIRONMENT=$1 + +APP_NAME=bulk-data-service + +RESOURCE_GROUP_NAME="rg-${APP_NAME}-${TARGET_ENVIRONMENT}" + +CONTAINER_GROUP_INSTANCE_NAME="aci-${APP_NAME}-${TARGET_ENVIRONMENT}" + +DOCKER_IMAGE_TAG=$(git log -n1 --format=format:"%H") + +LOCAL_DEPLOY=true + +echo "Generating Azure ARM deployment manifest from template" +. ./azure-deployment/generate-manifest-from-template.sh + +# build the docker image for the Bulk Data Service +docker build . 
-t "criati.azurecr.io/bulk-data-service-$TARGET_ENVIRONMENT:$DOCKER_IMAGE_TAG" + +# push Bulk Data Service image to Azure +docker push "criati.azurecr.io/bulk-data-service-$TARGET_ENVIRONMENT:$DOCKER_IMAGE_TAG" + +# now configure, build and push the docker image for the nginx reverse proxy + +# create password file +htpasswd -BC 10 -c -b ./azure-deployment/nginx-reverse-proxy/htpasswd prom "$PROM_NGINX_REVERSE_PROXY_PASSWORD" + +# make the image for the nginx reverse proxy (for putting HTTP basic auth on the +# prom client) +docker build ./azure-deployment/nginx-reverse-proxy -t "criati.azurecr.io/bds-prom-nginx-reverse-proxy-$TARGET_ENVIRONMENT:$DOCKER_IMAGE_TAG" + +docker push "criati.azurecr.io/bds-prom-nginx-reverse-proxy-$TARGET_ENVIRONMENT:$DOCKER_IMAGE_TAG" + +echo az container delete \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --name "$CONTAINER_GROUP_INSTANCE_NAME" +az container delete \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --name "$CONTAINER_GROUP_INSTANCE_NAME" + +echo az container create \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --vnet "/subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/rg-${APP_NAME}-vnets/providers/Microsoft.Network/virtualNetworks/${APP_NAME}-${TARGET_ENVIRONMENT}-vnet" \ + --subnet "/subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/rg-${APP_NAME}-vnets/providers/Microsoft.Network/virtualNetworks/${APP_NAME}-${TARGET_ENVIRONMENT}-vnet/subnets/${APP_NAME}-${TARGET_ENVIRONMENT}-subnet" \ + --file ./azure-deployment/azure-resource-manager-deployment-manifest.yml +az container create \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --vnet "/subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/rg-${APP_NAME}-vnets/providers/Microsoft.Network/virtualNetworks/${APP_NAME}-${TARGET_ENVIRONMENT}-vnet" \ + --subnet 
"/subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/rg-${APP_NAME}-vnets/providers/Microsoft.Network/virtualNetworks/${APP_NAME}-${TARGET_ENVIRONMENT}-vnet/subnets/${APP_NAME}-${TARGET_ENVIRONMENT}-subnet" \ + --file ./azure-deployment/azure-resource-manager-deployment-manifest.yml diff --git a/azure-deployment/manual-azure-deploy-secrets-example.env b/azure-deployment/manual-azure-deploy-secrets-example.env new file mode 100644 index 0000000..281eff6 --- /dev/null +++ b/azure-deployment/manual-azure-deploy-secrets-example.env @@ -0,0 +1,31 @@ +# This file is used when doing a manual Azure deploy from a local machine. It should +# contain the equivalent of the secrets that are stored in Github actions + +ACR_LOGIN_SERVER= +ACR_USERNAME= +ACR_PASSWORD= + +DOCKER_HUB_USERNAME= +DOCKER_HUB_TOKEN= + +AZURE_STORAGE_CONNECTION_STRING= + +AZURE_SERVICE_BUS_CONNECTION_STRING= + +LOG_WORKSPACE_ID= +LOG_WORKSPACE_KEY= + +DB_USER= +DB_PASS= +DB_HOST= +DB_PORT= +DB_NAME= +DB_SSL_MODE=require +DB_CONNECTION_TIMEOUT=30 + +PROM_NGINX_REVERSE_PROXY_PASSWORD= + +DATA_REGISTRY_SUITECRM_API_URL= +DATA_REGISTRY_SUITECRM_CLIENT_ID= +DATA_REGISTRY_SUITECRM_CLIENT_SECRET= + diff --git a/azure-deployment/manual-azure-deploy-variables-example.env b/azure-deployment/manual-azure-deploy-variables-example.env new file mode 100644 index 0000000..a37c8f9 --- /dev/null +++ b/azure-deployment/manual-azure-deploy-variables-example.env @@ -0,0 +1,77 @@ +# This file is used when doing a manual Azure deploy from a local machine. 
It should +# contain the equivalent of the variables that are stored in Github actions + +### +### Variables pertaining to where the Bulk Data Service pulls the dataset +### registration information from during a whole-sync cycle +### + +DATA_REGISTRATION=suitecrm-registry +# DATA_REGISTRATION=ckan-registry + +DATA_REGISTRY_BASE_URL="" +DATA_REGISTRY_PUBLISHER_PLAIN_LIST_URL="" +DATA_REGISTRY_PUBLISHER_METADATA_URL="" +DATA_REGISTRY_PUBLISHER_METADATA_BATCH_SIZE="" + +# This is a flag which indicates whether to verify the SSL cerfificate. It should be set to +# true for production environments, and false for local development with self-signed certs +DATA_REGISTRY_SUITECRM_SECURE=true + +### +### Variables which affect the behaviour of the app and which can be used to +### customise / optimise speed +### + +# The number of minutes to wait between each full check cycle +CHECKER_LOOP_WAIT_MINS=10 + +# The timeout value in seconds used for dataset HEAD attempts. This value is +# passed to the requests library. +DATASET_HEAD_TIMEOUT=7 + +# The timeout value in seconds used for dataset download attempts. This value is +# passed to the requests library +DATASET_GET_TIMEOUT=22 + +# The number of hours after which we force a re-download of datasets +# successfully downloaded from servers that support ETag and Last-Modified +# headers, even when the ETag and Last-Modified header have not changed +FORCE_REDOWNLOAD_AFTER_HOURS=24 + +# Number of threads used for downloading during the full cycle. Setting to 1 +# makes for easier testing locally, value of ~12 seems to produce fastest cycle +# results. 
Note this will be obviated when the BDS is refactored to use the
dependencies = [ From 3b98d698856724f012e1dcc6472b1dd99f5496a7 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 12 May 2026 11:16:43 +0100 Subject: [PATCH 18/18] docs: update CHANGELOG --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 41df74e..841cb9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ### Removed +## [1.4.7] - 2026-05-11 + +### Added + +- Added back in the manual deploy script + +### Fixed + +- Fixed Azure deploy to use full resource identifiers for dedicated vnet & subnet + ## [1.4.6] - 2026-05-11 ### Changed