From 3d0e0615a2265009b73bb0c0e3cf13a3b9229b13 Mon Sep 17 00:00:00 2001 From: Adrian Braemer Date: Fri, 17 Oct 2025 16:36:25 +0200 Subject: [PATCH 1/3] fix: handle paths with non-utf-8 bytes Signed-off-by: Adrian Braemer --- src/commoncode/resource.py | 2 +- tests/test_resource.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/commoncode/resource.py b/src/commoncode/resource.py index 64dbe7cc..d19e2da8 100644 --- a/src/commoncode/resource.py +++ b/src/commoncode/resource.py @@ -389,7 +389,7 @@ def _get_resource_cache_location(self, path, create_dirs=False): path = clean_path(path) # for the cached file name, we use an md5 of the path to avoid things being too long - resid = str(md5(path.encode("utf-8")).hexdigest()) + resid = str(md5(path.encode("utf-8", "surrogateescape")).hexdigest()) cache_sub_dir, cache_file_name = resid[-2:], resid parent = join(self.cache_dir, cache_sub_dir) diff --git a/tests/test_resource.py b/tests/test_resource.py index 9711b660..70bc18b0 100644 --- a/tests/test_resource.py +++ b/tests/test_resource.py @@ -694,6 +694,11 @@ def test_codebase_cache_default(self): codebase.save_resource(child) child_2 = codebase.get_resource(path=child.path) assert child_2 == child + + def test_codebase_cache_handles_non_utf8_path(self): + test_codebase = self.get_test_loc("resource/cache2") + codebase = Codebase(test_codebase) + codebase._get_resource_cache_location('resource/cache2/\udce9', create_dirs=True) def test_codebase_cache_all_in_memory(self): test_codebase = self.get_test_loc("resource/cache2") From 3b48b9bc0495fdf066f59070af1966023bb479f9 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 22 Oct 2025 17:50:29 +0530 Subject: [PATCH 2/3] Fix string quotes in test for non-UTF8 path --- tests/test_resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_resource.py b/tests/test_resource.py index 70bc18b0..93e322af 100644 --- a/tests/test_resource.py +++ b/tests/test_resource.py @@ -698,7 +698,7 @@ def test_codebase_cache_default(self): def test_codebase_cache_handles_non_utf8_path(self): test_codebase = self.get_test_loc("resource/cache2") codebase = Codebase(test_codebase) - codebase._get_resource_cache_location('resource/cache2/\udce9', create_dirs=True) + codebase._get_resource_cache_location("resource/cache2/\udce9", create_dirs=True) def test_codebase_cache_all_in_memory(self): test_codebase = self.get_test_loc("resource/cache2") From 47fb4ddf2b5abedabddd22c43bc4963ffd29bd70 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 22 Oct 2025 17:53:24 +0530 Subject: [PATCH 3/3] Fix formatting in test_resource.py --- tests/test_resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_resource.py b/tests/test_resource.py index 93e322af..6249ebb4 100644 --- a/tests/test_resource.py +++ b/tests/test_resource.py @@ -694,7 +694,7 @@ def test_codebase_cache_default(self): codebase.save_resource(child) child_2 = codebase.get_resource(path=child.path) assert child_2 == child - + def test_codebase_cache_handles_non_utf8_path(self): test_codebase = self.get_test_loc("resource/cache2") codebase = Codebase(test_codebase)