diff --git a/src/py/mat3ra/utils/__init__.py b/src/py/mat3ra/utils/__init__.py
index e6a4962..3de0935 100644
--- a/src/py/mat3ra/utils/__init__.py
+++ b/src/py/mat3ra/utils/__init__.py
@@ -1,2 +1,3 @@
+from .hash import calculate_hash_from_object, calculate_hash_from_string
 from .search import find_by_key_or_regex
 from .string import camel_to_snake, snake_to_camel
diff --git a/src/py/mat3ra/utils/hash.py b/src/py/mat3ra/utils/hash.py
new file mode 100644
index 0000000..d2e6429
--- /dev/null
+++ b/src/py/mat3ra/utils/hash.py
@@ -0,0 +1,82 @@
+import hashlib
+import json
+import warnings
+from typing import Any, Optional
+
+
+def _sort_keys_deep(obj: Any) -> Any:
+    """
+    Returns a copy of ``obj`` with the keys of every nested dict in sorted order.
+
+    Lists and tuples are traversed element by element. Tuples are returned as lists, which
+    ``json.dumps`` renders identically, so dicts nested inside tuples are normalized as well.
+    """
+    if isinstance(obj, dict):
+        return {key: _sort_keys_deep(obj[key]) for key in sorted(obj)}
+    if isinstance(obj, (list, tuple)):
+        return [_sort_keys_deep(item) for item in obj]
+    return obj
+
+
+def _get_hasher(hash_function: Optional[str]) -> Any:
+    """
+    Returns a new ``hashlib`` hash object for ``hash_function``, falling back to MD5.
+
+    A RuntimeWarning is emitted when a non-MD5 algorithm was requested but cannot be used.
+    """
+    algorithm = str(hash_function or "md5").lower()
+    try:
+        # hashlib.new() only resolves hash algorithm names (including OpenSSL-provided ones),
+        # unlike getattr(hashlib, ...), which would also match unrelated module attributes.
+        hasher = hashlib.new(algorithm)
+    except (TypeError, ValueError):
+        hasher = None
+    # Variable-length digests (SHAKE) report digest_size == 0 and need a length argument
+    # for hexdigest(), so they cannot be used through this interface.
+    if hasher is not None and hasher.digest_size > 0:
+        return hasher
+    if algorithm != "md5":
+        # stacklevel=3 skips this helper and calculate_hash_from_string in the reported location.
+        warnings.warn(
+            f"Hash function '{hash_function}' unavailable. Falling back to MD5.", RuntimeWarning, stacklevel=3
+        )
+    return hashlib.md5()
+
+
+def calculate_hash_from_string(message: str, hash_function: str = "md5") -> str:
+    """
+    Calculates hash of a given text.
+
+    Defaults to MD5 for parity with the legacy JS utilities and falls back to MD5 if the
+    requested algorithm is unavailable. Not intended for security-sensitive use or any
+    scenario where collision resistance matters.
+
+    Args:
+        message: Input text to hash. It is encoded as UTF-8 before hashing.
+        hash_function: Hash function name. Falls back to MD5 if unavailable.
+
+    Returns:
+        str: Hex digest of the hashed message.
+    """
+    hasher = _get_hasher(hash_function)
+    hasher.update(message.encode("utf-8"))
+    return hasher.hexdigest()
+
+
+def calculate_hash_from_object(obj: Any, hash_function: str = "md5") -> str:
+    """
+    Calculates hash of a given object. It must be serializable.
+
+    Keys are sorted recursively (including inside lists and tuples) before hashing to ensure
+    deterministic output.
+
+    Args:
+        obj: Serializable object to hash.
+        hash_function: Hash function name. Falls back to MD5 if unavailable.
+
+    Returns:
+        str: Hex digest of the hashed object representation.
+    """
+    sorted_obj = _sort_keys_deep(obj)
+    message = json.dumps(sorted_obj, separators=(",", ":"), ensure_ascii=False)
+    return calculate_hash_from_string(message, hash_function)
diff --git a/tests/py/unit/test_hash.py b/tests/py/unit/test_hash.py
new file mode 100644
index 0000000..616d34d
--- /dev/null
+++ b/tests/py/unit/test_hash.py
@@ -0,0 +1,66 @@
+import hashlib
+import json
+
+import pytest
+
+import mat3ra.utils.hash as hash_utils
+
+
+def test_calculate_hash_from_string_default_md5():
+    message = "hello world"
+    expected = hashlib.md5(message.encode()).hexdigest()
+    assert hash_utils.calculate_hash_from_string(message) == expected
+
+
+def test_calculate_hash_from_string_with_algorithm():
+    message = "hello world"
+    expected = hashlib.sha1(message.encode()).hexdigest()
+    assert hash_utils.calculate_hash_from_string(message, "sha1") == expected
+
+
+def test_calculate_hash_from_object_sorts_keys():
+    obj = {
+        "b": 1,
+        "a": {"d": 4, "c": 3},
+        "list": [{"y": 2, "x": 1}, 3],
+    }
+    sorted_obj = {"a": {"c": 3, "d": 4}, "b": 1, "list": [{"x": 1, "y": 2}, 3]}
+    expected_message = json.dumps(sorted_obj, separators=(",", ":"), ensure_ascii=False)
+    expected = hashlib.md5(expected_message.encode()).hexdigest()
+    assert hash_utils.calculate_hash_from_object(obj) == expected
+
+
+def test_calculate_hash_from_object_with_algorithm():
+    obj = {"b": 1, "a": 2}
+    expected_message = json.dumps({"a": 2, "b": 1}, separators=(",", ":"), ensure_ascii=False)
+    expected = hashlib.sha256(expected_message.encode()).hexdigest()
+    assert hash_utils.calculate_hash_from_object(obj, "sha256") == expected
+
+
+def test_calculate_hash_from_string_invalid_algorithm_falls_back_to_md5():
+    message = "fallback"
+    expected = hashlib.md5(message.encode()).hexdigest()
+    with pytest.warns(RuntimeWarning):
+        assert hash_utils.calculate_hash_from_string(message, "unknown") == expected
+
+
+def test_calculate_hash_from_object_invalid_algorithm_falls_back_to_md5():
+    obj = {"b": 1, "a": 2}
+    expected_message = json.dumps({"a": 2, "b": 1}, separators=(",", ":"), ensure_ascii=False)
+    expected = hashlib.md5(expected_message.encode()).hexdigest()
+    with pytest.warns(RuntimeWarning):
+        assert hash_utils.calculate_hash_from_object(obj, "unknown") == expected
+
+
+def test_calculate_hash_from_object_sorts_keys_inside_tuples():
+    obj = {"items": ({"b": 1, "a": 2},)}
+    expected_message = json.dumps({"items": [{"a": 2, "b": 1}]}, separators=(",", ":"), ensure_ascii=False)
+    expected = hashlib.md5(expected_message.encode()).hexdigest()
+    assert hash_utils.calculate_hash_from_object(obj) == expected
+
+
+def test_calculate_hash_from_string_variable_length_digest_falls_back_to_md5():
+    message = "fallback"
+    expected = hashlib.md5(message.encode()).hexdigest()
+    with pytest.warns(RuntimeWarning):
+        assert hash_utils.calculate_hash_from_string(message, "shake_128") == expected