From 7507692ec11d8af7b4ddc6afbf685e23ed28d299 Mon Sep 17 00:00:00 2001 From: zwright Date: Mon, 23 Mar 2026 13:48:16 -0400 Subject: [PATCH 1/2] KPMP-6566: try chunks --- data_management/services/dlu_filesystem.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/data_management/services/dlu_filesystem.py b/data_management/services/dlu_filesystem.py index f1b9d44..84b9305 100644 --- a/data_management/services/dlu_filesystem.py +++ b/data_management/services/dlu_filesystem.py @@ -15,15 +15,19 @@ def calculate_checksum(file_path: str): - if os.path.isdir(file_path): return "0" + if os.path.getsize(file_path) == 0: - # This is apparently the md5 returned for an empty file return 'd41d8cd98f00b204e9800998ecf8427e' - elif ".zarr" not in file_path: - with open(file_path) as f, mmap(f.fileno(), 0, access=ACCESS_READ) as f: - return md5(f).hexdigest() + + if ".zarr" not in file_path: + hash_md5 = hashlib.md5() + with open(file_path, "rb") as f: + # Read in 1MB chunks to keep RAM usage low + for chunk in iter(lambda: f.read(1024 * 1024), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() else: return compute_zarr_checksum(yield_files_local(file_path)).md5 From a4cef739fba1a4d39bc9a88035648497bff2c923 Mon Sep 17 00:00:00 2001 From: zwright Date: Wed, 25 Mar 2026 15:25:09 -0400 Subject: [PATCH 2/2] KPMP-6566: increase timeout --- data_management/Dockerfile | 2 +- data_management/services/dlu_filesystem.py | 14 +++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/data_management/Dockerfile b/data_management/Dockerfile index 1fe48b1..cb6adef 100644 --- a/data_management/Dockerfile +++ b/data_management/Dockerfile @@ -31,5 +31,5 @@ COPY app.py ./ COPY process_bulk_uploads.py ./ COPY services/ ./services -ENTRYPOINT ["gunicorn", "-b", ":5000", "app:app", "-t", "600"] +ENTRYPOINT ["gunicorn", "-b", ":5000", "app:app", "-t", "1200"] diff --git a/data_management/services/dlu_filesystem.py b/data_management/services/dlu_filesystem.py index 84b9305..f1b9d44 100644 --- a/data_management/services/dlu_filesystem.py +++ b/data_management/services/dlu_filesystem.py @@ -15,19 +15,15 @@ def calculate_checksum(file_path: str): + if os.path.isdir(file_path): return "0" - if os.path.getsize(file_path) == 0: + # This is apparently the md5 returned for an empty file return 'd41d8cd98f00b204e9800998ecf8427e' - - if ".zarr" not in file_path: - hash_md5 = hashlib.md5() - with open(file_path, "rb") as f: - # Read in 1MB chunks to keep RAM usage low - for chunk in iter(lambda: f.read(1024 * 1024), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() + elif ".zarr" not in file_path: + with open(file_path) as f, mmap(f.fileno(), 0, access=ACCESS_READ) as f: + return md5(f).hexdigest() else: return compute_zarr_checksum(yield_files_local(file_path)).md5