diff --git a/.github/workflows/build-and-deploy-job.yml b/.github/workflows/build-and-deploy-job.yml index dd0cc05..1553731 100644 --- a/.github/workflows/build-and-deploy-job.yml +++ b/.github/workflows/build-and-deploy-job.yml @@ -1,7 +1,7 @@ --- name: Generic build and deploy (called by other workflows) -on: # yamllint disable-line rule:truthy +on: # yamllint disable-line rule:truthy workflow_call: inputs: APP_NAME: @@ -25,6 +25,8 @@ jobs: ACR_USERNAME: ${{ secrets.ACR_USERNAME }} ACR_PASSWORD: ${{ secrets.ACR_PASSWORD }} + AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + steps: - name: 'Generate/build derived environment variables' run: | @@ -32,6 +34,8 @@ jobs: echo "CONTAINER_INSTANCE_BASE_NAME=aci-${APP_NAME}" >> ${GITHUB_ENV} echo "RESOURCE_GROUP_BASE_NAME=rg-${APP_NAME}" >> ${GITHUB_ENV} echo "STORAGE_ACCOUNT_NAME=sa${APP_NAME//-/}$TARGET_ENVIRONMENT" >> ${GITHUB_ENV} + echo "APP_NAME=${APP_NAME}" >> ${GITHUB_ENV} + echo "AZURE_SUBSCRIPTION_ID=${AZURE_SUBSCRIPTION_ID}" >> ${GITHUB_ENV} - name: 'Print calculated environment variables' run: | @@ -39,7 +43,8 @@ jobs: echo $CONTAINER_INSTANCE_BASE_NAME echo $RESOURCE_GROUP_BASE_NAME echo $STORAGE_ACCOUNT_NAME - + echo $APP_NAME + echo $AZURE_SUBSCRIPTION_ID - name: 'Checkout GitHub Action' uses: actions/checkout@v4 @@ -105,6 +110,7 @@ jobs: LOG_WORKSPACE_KEY: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'LOG_WORKSPACE_KEY')] }} # Variables which configure the app + AZURE_SERVICE_BUS_DATASET_CHECK_RESULTS_TOPIC_NAME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_DATASET_CHECK_RESULTS_TOPIC_NAME')] }} AZURE_SERVICE_BUS_REGISTRY_SUB_NAME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_REGISTRY_SUB_NAME')] }} AZURE_SERVICE_BUS_REGISTRY_TOPIC_NAME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_REGISTRY_TOPIC_NAME')] }} AZURE_SERVICE_BUS_WAIT_TIME: ${{ vars[format('{0}_{1}', 
env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_WAIT_TIME')] }} @@ -137,6 +143,8 @@ jobs: az -v az container create --debug \ --resource-group "${{ env.RESOURCE_GROUP_BASE_NAME }}-${{ env.TARGET_ENVIRONMENT }}" \ + --vnet "/subscriptions/${{ env.AZURE_SUBSCRIPTION_ID }}/resourceGroups/rg-${{ env.APP_NAME }}-vnets/providers/Microsoft.Network/virtualNetworks/${{ env.APP_NAME }}-${{ env.TARGET_ENVIRONMENT }}-vnet" \ + --subnet "/subscriptions/${{ env.AZURE_SUBSCRIPTION_ID }}/resourceGroups/rg-${{ env.APP_NAME }}-vnets/providers/Microsoft.Network/virtualNetworks/${{ env.APP_NAME }}-${{ env.TARGET_ENVIRONMENT }}-vnet/subnets/${{ env.APP_NAME }}-${{ env.TARGET_ENVIRONMENT }}-subnet" \ --file ./azure-deployment/azure-resource-manager-deployment-manifest.yml - name: 'Re-generate the website links' diff --git a/.gitignore b/.gitignore index 5cd4c56..bba8cec 100644 --- a/.gitignore +++ b/.gitignore @@ -196,5 +196,7 @@ __marimo__/ /azure-deployment/azure-resource-manager-deployment-manifest.yml /azure-deployment/nginx-reverse-proxy/htpasswd +/azure-deployment/manual-azure-deploy-secrets.env +/azure-deployment/manual-azure-deploy-variables.env /web/index.html diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a464c7..841cb9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,37 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ### Removed +## [1.4.7] - 2026-05-11 + +### Added + +- Added back in the manual deploy script + +### Fixed + +- Fixed Azure deploy to use full resource identifiers for dedicated vnet & subnet + +## [1.4.6] - 2026-05-11 + +### Changed + +- Updated deploy to use dedicated vnet & subnet + +### Fixed + +- Added env var for the MQ topic name to the GitHub workflow. + +## [1.4.5] - 2026-05-06 + +### Changed + +- Updated IATI Design System to 4.9.0 + +### Fixed + +- Bug where the dataset's cached URLs were not being blanked after dataset expiry. 
(Resolves #137) +- Bug where `most_recent_head_attempt.error_occurred` was being set to `null` instead of `false`. (Resolves #136). + ## [1.4.4] - 2026-04-22 ### Added diff --git a/README.md b/README.md index 11a87e3..fedeecb 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ The `.env` file is used when running things locally to store environment variabl Running the app successfully requires a Postgres database and a connection to an Azure blob storage account. There is a docker compose setup which can be used to start an instance of each service locally, that can be run with: -``` +```bash docker compose up -d ``` @@ -69,18 +69,35 @@ The example `.env` file (`.env-example`) is configured to use the above docker c Once the docker compose setup is running, you can run the dataset updater part of the app with (this will download the datasets and upload them to Azurite): -``` +```bash dotenv run python src/iati_bulk_data_service.py -- --operation checker --single-run --run-for-n-datasets=50 ``` You can run the zipper operation with: -``` +```bash dotenv run python src/iati_bulk_data_service.py -- --operation zipper --single-run ``` It will store the ZIP files in the directory defined in the `ZIP_WORKING_DIR` environment variable. 
+The full range of command line arguments is listed below: + +``` +usage: iati_bulk_data_service.py [-h] --operation {checker,zipper,registry-changes-processor} [--single-run] [--run-for-n-datasets RUN_FOR_N_DATASETS] [--run-for-single-reporting-org RUN_FOR_SINGLE_REPORTING_ORG] [--skip-safety] + +options: + -h, --help show this help message and exit + --operation {checker,zipper,registry-changes-processor} + Operation to run: checker, downloader, registry-changes-processor + --single-run Perform a single run, then exit + --run-for-n-datasets RUN_FOR_N_DATASETS + Run on the first N datasets from registration service (useful for testing) + --run-for-single-reporting-org RUN_FOR_SINGLE_REPORTING_ORG + Run only for the datasets belonging to the specified reporting org short name (useful for testing) + --skip-safety Skip safety checks during the run (useful for testing) +``` + To shutdown the docker compose setup, use (the Azure Service Bus emulator appears to be a bit sensitive to Ctrl-C shutdowns, so always best to shutdown with `docker compose down`): @@ -222,6 +239,8 @@ pytest-watcher . ### Initial Provisioning +#### Bulk Data Service App + You can create an Azure-based instance of Bulk Data Service using the `azure-create-resources.sh` script. It must be run from the root of the repository, and it requires (i) the environment variable `BDS_DB_ADMIN_PASSWORD` to be set with the password for the database, and (ii) a single parameter which is the name of the environment/instance. For instance, the following command will create a dev instance: ```bash @@ -232,6 +251,16 @@ This will create a resource group on Azure called `rg-bulk-data-service-dev`, an At the end of its run, the `azure-create-resources.sh` script will print out various secrets which need to be added to Github Actions. +**NOTE**: This is only really useful for temporary deployment or initial setup; once you're set up with CI/CD, the GitHub action does all this. 
+ +#### Bulk Data Service Network and Public IP + +The Bulk Data Service is deployed to a dedicated vnet with subnet and attached NAT Gateway which has a public IP. To ensure the IP remains, these are not destroyed and re-created on every release (like the Azure Container Instances are). To create the networks and public IPs for dev and production, run: + +```bash +./azure-provision/create-vnets-public-ips.sh +``` + ### Deployment - Versioning The app version is set in `pyproject.toml`, and this is read by the app to use in the `User-Agent` header. When making a new release, set the version here to the appropriate value. Then, when releasing the app using the normal IATI Python app deployment process, choose the tag name to match the version chosen. diff --git a/azure-deployment/azure-resource-manager-deployment-template.yml b/azure-deployment/azure-resource-manager-deployment-template.yml index 6583bff..af9624c 100644 --- a/azure-deployment/azure-resource-manager-deployment-template.yml +++ b/azure-deployment/azure-resource-manager-deployment-template.yml @@ -105,8 +105,9 @@ properties: # Properties of container group requests: cpu: 1.0 memoryInGB: 0.5 + subnetIds: + - id: "/subscriptions/#AZURE_SUBSCRIPTION_ID#/resourceGroups/rg-#APP_NAME#-vnets/providers/Microsoft.Network/virtualNetworks/#APP_NAME#-#TARGET_ENVIRONMENT#-vnet/subnets/#APP_NAME#-#TARGET_ENVIRONMENT#-subnet" ipAddress: - type: "public" - dnsNameLabel: "#APP_NAME#-#TARGET_ENVIRONMENT#" + type: "private" ports: - port: 9158 diff --git a/azure-deployment/generate-manifest-from-template.sh b/azure-deployment/generate-manifest-from-template.sh index fa97660..c828d68 100755 --- a/azure-deployment/generate-manifest-from-template.sh +++ b/azure-deployment/generate-manifest-from-template.sh @@ -6,9 +6,9 @@ # by the generic 'build-and-deploy' Github action if [ "$LOCAL_DEPLOY" == "true" ]; then - echo "Deploying from local environment..." 
- source ./azure-deployment/manual-azure-deploy-secrets.env - source ./azure-deployment/manual-azure-deploy-variables.env + echo "Deploying from local environment..." + source ./azure-deployment/manual-azure-deploy-secrets.env + source ./azure-deployment/manual-azure-deploy-variables.env fi # Copy the template to the manifest @@ -21,6 +21,8 @@ sed -i "s^#APP_NAME#^$APP_NAME^g" ./azure-deployment/azure-resource-manager-depl sed -i "s^#TARGET_ENVIRONMENT#^$TARGET_ENVIRONMENT^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i "s^#DOCKER_IMAGE_TAG#^$DOCKER_IMAGE_TAG^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml +sed -i ''s^#AZURE_SUBSCRIPTION_ID#^$AZURE_SUBSCRIPTION_ID^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml + sed -i ''s^#ACR_LOGIN_SERVER#^$ACR_LOGIN_SERVER^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#ACR_USERNAME#^$ACR_USERNAME^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#ACR_PASSWORD#^$ACR_PASSWORD^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml @@ -38,7 +40,6 @@ sed -i ''s^#DB_NAME#^$DB_NAME^g'' ./azure-deployment/azure-resource-manager-depl sed -i ''s^#DB_SSL_MODE#^$DB_SSL_MODE^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#DB_CONNECTION_TIMEOUT#^$DB_CONNECTION_TIMEOUT^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml - # Variables which configure the behaviour of the Bulk Data Service sed -i ''s^#DATA_REGISTRATION#^$DATA_REGISTRATION^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml diff --git a/azure-deployment/manual-azure-deploy-from-local.sh b/azure-deployment/manual-azure-deploy-from-local.sh new file mode 100755 index 0000000..dac1d61 --- /dev/null +++ b/azure-deployment/manual-azure-deploy-from-local.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash + +# This script is for deploying the Bulk Data Service to Azure from a 
local machine, without using GitHub Actions. +# It is useful for testing significant changes to the Azure manifest and deployment procedure, which for dev and +# prod is normally run through GitHub Actions. + +# NOTE: You will need to fill in the AZURE_SUBSCRIPTION_ID variable below before using. + +set -uo pipefail + +if [ ! -v "1" ]; then + echo "usage: $0 TARGET_ENVIRONMENT" + echo " TARGET_ENVIRONMENT should likely be 'test', 'dev', or 'prod'" + exit 1 +fi + +if [ ! -d ".git" ]; then + echo "$0: script must be run from the root of the bulk-data-service repository" + exit 1 +fi + +if [ ! -f "./azure-deployment/manual-azure-deploy-secrets.env" ]; then + echo "$0: there must be a file 'manual-azure-deploy-secrets.env' in" + echo "'azure-deployment' containing the secrets. See the examples in manual-azure-deploy-secrets-example.env'" + exit 1 +fi + +if [ ! -f "./azure-deployment/manual-azure-deploy-variables.env" ]; then + echo "$0: there must be a file 'manual-azure-deploy-variables.env' in" + echo "'azure-deployment' containing the config variables. See example: manual-azure-deploy-variables-example.env'" + exit 1 +fi + +(git remote -v 2>/dev/null | grep "IATI/bulk-data-service.git" >/dev/null) || { # brace group, not ( ): exit inside a subshell would not stop this script + echo "$0: script must be run from the root of the bulk-data-service repository" + exit 1 +} + +. ./azure-deployment/manual-azure-deploy-secrets.env + +AZURE_SUBSCRIPTION_ID=" **** FILL IN **** " + +TARGET_ENVIRONMENT=$1 + +APP_NAME=bulk-data-service + +RESOURCE_GROUP_NAME="rg-${APP_NAME}-${TARGET_ENVIRONMENT}" + +CONTAINER_GROUP_INSTANCE_NAME="aci-${APP_NAME}-${TARGET_ENVIRONMENT}" + +DOCKER_IMAGE_TAG=$(git log -n1 --format=format:"%H") + +LOCAL_DEPLOY=true + +echo "Generating Azure ARM deployment manifest from template" +. ./azure-deployment/generate-manifest-from-template.sh + +# build the docker image for the Bulk Data Service +docker build . 
-t "criati.azurecr.io/bulk-data-service-$TARGET_ENVIRONMENT:$DOCKER_IMAGE_TAG" + +# push Bulk Data Service image to Azure +docker push "criati.azurecr.io/bulk-data-service-$TARGET_ENVIRONMENT:$DOCKER_IMAGE_TAG" + +# now configure, build and push the docker image for the nginx reverse proxy + +# create password file +htpasswd -BC 10 -c -b ./azure-deployment/nginx-reverse-proxy/htpasswd prom "$PROM_NGINX_REVERSE_PROXY_PASSWORD" + +# make the image for the nginx reverse proxy (for putting HTTP basic auth on the +# prom client) +docker build ./azure-deployment/nginx-reverse-proxy -t "criati.azurecr.io/bds-prom-nginx-reverse-proxy-$TARGET_ENVIRONMENT:$DOCKER_IMAGE_TAG" + +docker push "criati.azurecr.io/bds-prom-nginx-reverse-proxy-$TARGET_ENVIRONMENT:$DOCKER_IMAGE_TAG" + +echo az container delete \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --name "$CONTAINER_GROUP_INSTANCE_NAME" +az container delete \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --name "$CONTAINER_GROUP_INSTANCE_NAME" + +echo az container create \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --vnet "/subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/rg-${APP_NAME}-vnets/providers/Microsoft.Network/virtualNetworks/${APP_NAME}-${TARGET_ENVIRONMENT}-vnet" \ + --subnet "/subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/rg-${APP_NAME}-vnets/providers/Microsoft.Network/virtualNetworks/${APP_NAME}-${TARGET_ENVIRONMENT}-vnet/subnets/${APP_NAME}-${TARGET_ENVIRONMENT}-subnet" \ + --file ./azure-deployment/azure-resource-manager-deployment-manifest.yml +az container create \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --vnet "/subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/rg-${APP_NAME}-vnets/providers/Microsoft.Network/virtualNetworks/${APP_NAME}-${TARGET_ENVIRONMENT}-vnet" \ + --subnet 
"/subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/rg-${APP_NAME}-vnets/providers/Microsoft.Network/virtualNetworks/${APP_NAME}-${TARGET_ENVIRONMENT}-vnet/subnets/${APP_NAME}-${TARGET_ENVIRONMENT}-subnet" \ + --file ./azure-deployment/azure-resource-manager-deployment-manifest.yml diff --git a/azure-deployment/manual-azure-deploy-secrets-example.env b/azure-deployment/manual-azure-deploy-secrets-example.env new file mode 100644 index 0000000..281eff6 --- /dev/null +++ b/azure-deployment/manual-azure-deploy-secrets-example.env @@ -0,0 +1,31 @@ +# This file is used when doing a manual Azure deploy from a local machine. It should +# contain the equivalent of the secrets that are stored in Github actions + +ACR_LOGIN_SERVER= +ACR_USERNAME= +ACR_PASSWORD= + +DOCKER_HUB_USERNAME= +DOCKER_HUB_TOKEN= + +AZURE_STORAGE_CONNECTION_STRING= + +AZURE_SERVICE_BUS_CONNECTION_STRING= + +LOG_WORKSPACE_ID= +LOG_WORKSPACE_KEY= + +DB_USER= +DB_PASS= +DB_HOST= +DB_PORT= +DB_NAME= +DB_SSL_MODE=require +DB_CONNECTION_TIMEOUT=30 + +PROM_NGINX_REVERSE_PROXY_PASSWORD= + +DATA_REGISTRY_SUITECRM_API_URL= +DATA_REGISTRY_SUITECRM_CLIENT_ID= +DATA_REGISTRY_SUITECRM_CLIENT_SECRET= + diff --git a/azure-deployment/manual-azure-deploy-variables-example.env b/azure-deployment/manual-azure-deploy-variables-example.env new file mode 100644 index 0000000..a37c8f9 --- /dev/null +++ b/azure-deployment/manual-azure-deploy-variables-example.env @@ -0,0 +1,77 @@ +# This file is used when doing a manual Azure deploy from a local machine. 
It should +# contain the equivalent of the variables that are stored in Github actions + +### +### Variables pertaining to where the Bulk Data Service pulls the dataset +### registration information from during a whole-sync cycle +### + +DATA_REGISTRATION=suitecrm-registry +# DATA_REGISTRATION=ckan-registry + +DATA_REGISTRY_BASE_URL="" +DATA_REGISTRY_PUBLISHER_PLAIN_LIST_URL="" +DATA_REGISTRY_PUBLISHER_METADATA_URL="" +DATA_REGISTRY_PUBLISHER_METADATA_BATCH_SIZE="" + +# This is a flag which indicates whether to verify the SSL certificate. It should be set to +# true for production environments, and false for local development with self-signed certs +DATA_REGISTRY_SUITECRM_SECURE=true + +### +### Variables which affect the behaviour of the app and which can be used to +### customise / optimise speed +### + +# The number of minutes to wait between each full check cycle +CHECKER_LOOP_WAIT_MINS=10 + +# The timeout value in seconds used for dataset HEAD attempts. This value is +# passed to the requests library. +DATASET_HEAD_TIMEOUT=7 + +# The timeout value in seconds used for dataset download attempts. This value is +# passed to the requests library +DATASET_GET_TIMEOUT=22 + +# The number of hours after which we force a re-download of datasets +# successfully downloaded from servers that support ETag and Last-Modified +# headers, even when the ETag and Last-Modified header have not changed +FORCE_REDOWNLOAD_AFTER_HOURS=24 + +# Number of threads used for downloading during the full cycle. Setting to 1 +# makes for easier testing locally, value of ~12 seems to produce fastest cycle +# results. 
Note this will be obviated when the BDS is refactored to use the +# Data Downloader +NUMBER_DOWNLOADER_THREADS=10 + +# The number of hours after which we re-download the dataset from servers that +# do not support `HEAD` requests +REDOWNLOAD_FROM_NON_HEAD_SERVERS_AFTER_HOURS=12 + +# The number of hours after which a failing download is removed +REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72 + +# Whether to send DATASET_CHECK_RESULT messages to the IATI MQ Service (for use +# by the Dashboard) +SEND_DATASET_CHECK_RESULT_MESSAGES=no + +### +### Variables for internal configuration of application +### + +AZURE_STORAGE_BLOB_CONTAINER_NAME="\$web" + +# Azure Service Bus Server for IATI MQ + +# Local Service Bus Emulator +AZURE_SERVICE_BUS_CONNECTION_STRING= +AZURE_SERVICE_BUS_REGISTRY_TOPIC_NAME= +AZURE_SERVICE_BUS_REGISTRY_SUB_NAME= +AZURE_SERVICE_BUS_DATASET_CHECK_RESULTS_TOPIC_NAME= +AZURE_SERVICE_BUS_WAIT_TIME=0.1 + +WEB_BASE_URL='https://dev-bulk-data.iatistandard.org' + +ZIP_WORKING_DIR=/tmp/bulk-data-service-zip + diff --git a/azure-provision/create-vnets-public-ips.sh b/azure-provision/create-vnets-public-ips.sh new file mode 100755 index 0000000..70212d0 --- /dev/null +++ b/azure-provision/create-vnets-public-ips.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +set -o errexit # abort on nonzero exitstatus +set -o nounset # abort on unbound variable +set -o pipefail # don't hide errors within pipes + +# This script creates the virtual networks, subnets and public IPs for the bulk data service. 
+ +RESOURCE_GROUP_NAME="rg-bulk-data-service-vnets" +LOCATION="uksouth" + +az group create --name "$RESOURCE_GROUP_NAME" --location "$LOCATION" + +for ENV in dev prod; do + az network vnet create --resource-group "$RESOURCE_GROUP_NAME" \ + --name "bulk-data-service-${ENV}-vnet" \ + --address-prefix 10.0.0.0/16 \ + --subnet-name "bulk-data-service-${ENV}-subnet" \ + --subnet-prefix 10.0.1.0/24 + + az network vnet subnet update --resource-group "$RESOURCE_GROUP_NAME" \ + --vnet-name "bulk-data-service-${ENV}-vnet" \ + --name "bulk-data-service-${ENV}-subnet" \ + --delegation Microsoft.ContainerInstance/containerGroups + + az network public-ip create \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --name "bulk-data-service-${ENV}-public-ip" \ + --sku Standard \ + --allocation-method Static \ + --location "$LOCATION" + + az network nat gateway create \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --name "bulk-data-service-${ENV}-nat-gateway" \ + --location "$LOCATION" \ + --public-ip-addresses "bulk-data-service-${ENV}-public-ip" \ + --idle-timeout 10 + + az network vnet subnet update \ + --resource-group "$RESOURCE_GROUP_NAME" \ + --vnet-name "bulk-data-service-${ENV}-vnet" \ + --name "bulk-data-service-${ENV}-subnet" \ + --nat-gateway "bulk-data-service-${ENV}-nat-gateway" +done diff --git a/db-migrations/20260505_01_7kh1j.rollback.sql b/db-migrations/20260505_01_7kh1j.rollback.sql new file mode 100644 index 0000000..73e7585 --- /dev/null +++ b/db-migrations/20260505_01_7kh1j.rollback.sql @@ -0,0 +1,5 @@ +alter table iati_datasets + alter column most_recent_head_attempt_error_occurred drop default; + +alter table iati_datasets + alter column most_recent_get_attempt_error_occurred drop default; diff --git a/db-migrations/20260505_01_7kh1j.sql b/db-migrations/20260505_01_7kh1j.sql new file mode 100644 index 0000000..1e731e4 --- /dev/null +++ b/db-migrations/20260505_01_7kh1j.sql @@ -0,0 +1,16 @@ +-- +-- depends: 20250827_01_Dt6Ow + +alter table iati_datasets + alter 
column most_recent_head_attempt_error_occurred set default false; + +alter table iati_datasets + alter column most_recent_get_attempt_error_occurred set default false; + +update iati_datasets + set most_recent_head_attempt_error_occurred = false + where most_recent_head_attempt_error_occurred is null; + +update iati_datasets + set most_recent_get_attempt_error_occurred = false + where most_recent_get_attempt_error_occurred is null; diff --git a/pyproject.toml b/pyproject.toml index b7ab0e5..86280e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "bulk-data-service" -version = "1.4.4" +version = "1.4.7" requires-python = ">= 3.12.6" readme = "README.md" dependencies = [ diff --git a/src/bulk_data_service/dataset.py b/src/bulk_data_service/dataset.py index d28370b..d4ac228 100644 --- a/src/bulk_data_service/dataset.py +++ b/src/bulk_data_service/dataset.py @@ -43,9 +43,11 @@ def create_empty_dataset() -> dict[str, Any]: empty_ds = { k: None for k in DATASET_REGISTRATION_FIELDS + DATASET_NON_REGISTRATION_FIELDS - } # type: dict[str, str | None] + } # type: dict[str, str | bool | None] empty_ds["most_recent_get_attempt_error_details"] = make_http_attempt_error_details() + empty_ds["most_recent_get_attempt_error_occurred"] = False empty_ds["most_recent_head_attempt_error_details"] = make_http_attempt_error_details() + empty_ds["most_recent_head_attempt_error_occurred"] = False return empty_ds diff --git a/src/bulk_data_service/dataset_remover.py b/src/bulk_data_service/dataset_remover.py index 3137183..c605321 100644 --- a/src/bulk_data_service/dataset_remover.py +++ b/src/bulk_data_service/dataset_remover.py @@ -77,6 +77,10 @@ def remove_download_for_expired_dataset( "last good download from Bulk Data Service".format(bds_dataset["id"], max_hours) ) + bds_dataset["last_known_good_dataset_cached_dataset_xml_url"] = None + bds_dataset["last_known_good_dataset_cached_dataset_xml_etag"] = None + 
bds_dataset["last_known_good_dataset_cached_dataset_zip_url"] = None + bds_dataset["last_known_good_dataset_cached_dataset_zip_etag"] = None bds_dataset["last_known_good_dataset_downloaded"] = None bds_dataset["last_known_good_dataset_hash"] = None bds_dataset["last_known_good_dataset_hash_excluding_generated_timestamp"] = None diff --git a/tests/integration/test_dataset_add.py b/tests/integration/test_dataset_add.py index c17226f..bbf47e4 100644 --- a/tests/integration/test_dataset_add.py +++ b/tests/integration/test_dataset_add.py @@ -101,6 +101,8 @@ def test_add_downloadable_dataset_for_various_encodings( check_most_recent_http_attempt_for_success("get", datasets_in_bds[dataset_id]) + assert datasets_in_bds[dataset_id]["most_recent_head_attempt_error_occurred"] is False + check_last_known_good_dataset_values_are_set(datasets_in_bds[dataset_id]) check_dataset_fields( diff --git a/tests/integration/test_dataset_expiry.py b/tests/integration/test_dataset_expiry.py index d90bb8c..d847eb1 100644 --- a/tests/integration/test_dataset_expiry.py +++ b/tests/integration/test_dataset_expiry.py @@ -36,6 +36,11 @@ def test_dataset_expiry_after_72_hours_failed_downloads(get_and_clear_up_context dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")] assert len(datasets_in_bds) == 1 + + assert dataset["last_known_good_dataset_cached_dataset_xml_url"] is None + assert dataset["last_known_good_dataset_cached_dataset_xml_etag"] is None + assert dataset["last_known_good_dataset_cached_dataset_zip_url"] is None + assert dataset["last_known_good_dataset_cached_dataset_zip_etag"] is None assert dataset["last_known_good_dataset_downloaded"] is None assert dataset["last_known_good_dataset_hash"] is None assert dataset["last_known_good_dataset_hash_excluding_generated_timestamp"] is None diff --git a/tests/unit/test_dataset_registration.py b/tests/unit/test_dataset_registration.py index be1ae91..00c9d77 100644 --- a/tests/unit/test_dataset_registration.py +++ 
b/tests/unit/test_dataset_registration.py @@ -5,6 +5,7 @@ import pytest +from bulk_data_service.dataset import create_empty_dataset from dataset_registration.iati_registry_ckan import clean_datasets_metadata, convert_datasets_metadata @@ -42,6 +43,12 @@ def test_incomplete_necessary_data_from_ckan(field_blanker, attribute_value): assert(len(ckan_datasets) == 0) +def test_create_empty_dataset_error_occurred_defaults_to_false(): + ds = create_empty_dataset() + assert ds["most_recent_head_attempt_error_occurred"] is False + assert ds["most_recent_get_attempt_error_occurred"] is False + + @pytest.mark.parametrize("resources_value", [None, [], {"url": None}]) def test_missing_url_from_ckan(resources_value): diff --git a/web/404.html b/web/404.html index 494ec48..d1f2bba 100644 --- a/web/404.html +++ b/web/404.html @@ -4,7 +4,7 @@