Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/py/mat3ra/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .hash import calculate_hash_from_object, calculate_hash_from_string
from .search import find_by_key_or_regex
from .string import camel_to_snake, snake_to_camel
60 changes: 60 additions & 0 deletions src/py/mat3ra/utils/hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import hashlib
import json
import warnings
from typing import Any, Optional


def _sort_keys_deep(obj: Any) -> Any:
    """
    Returns a copy of a nested structure with dictionary keys sorted at every level.

    Dictionaries are rebuilt with their keys in sorted order, and lists and tuples are
    rebuilt element by element so that dictionaries nested inside them are sorted too.
    Any other value is returned unchanged.

    Args:
        obj: Value to normalize.

    Returns:
        Any: The normalized structure. Lists stay lists and tuples stay tuples.

    Raises:
        TypeError: If the keys of a dictionary cannot be compared with each other.
    """
    if isinstance(obj, dict):
        return {key: _sort_keys_deep(obj[key]) for key in sorted(obj)}
    if isinstance(obj, list):
        return [_sort_keys_deep(item) for item in obj]
    if isinstance(obj, tuple):
        # json.dumps writes tuples as arrays, so dictionaries inside a tuple end up in the
        # serialized message and must be key-sorted as well.
        return tuple(_sort_keys_deep(item) for item in obj)
    return obj


def _get_hasher(hash_function: Optional[str]) -> Any:
    """
    Creates a new hash object for the requested algorithm.

    The name is lower-cased before lookup. If the algorithm cannot be constructed, a
    RuntimeWarning is issued (unless MD5 itself was requested) and an MD5 hash object is
    returned instead.

    Args:
        hash_function: Name of the hash algorithm. None or an empty string selects MD5.

    Returns:
        Any: A new hashlib hash object.
    """
    algorithm = (hash_function or "md5").lower()
    try:
        # hashlib.new() resolves digest names only. Looking the name up as a module attribute
        # would also accept unrelated callables of hashlib (e.g. "__dir__") and return
        # something that is not a hash object.
        return hashlib.new(algorithm)
    except (TypeError, ValueError):
        if algorithm != "md5":
            warnings.warn(
                f"Hash function '{hash_function}' unavailable. Falling back to MD5.",
                RuntimeWarning,
                stacklevel=2,  # attribute the warning to the caller rather than to this helper
            )
        return hashlib.md5()


def calculate_hash_from_string(message: str, hash_function: str = "md5") -> str:
    """
    Computes the hex digest of a text message.

    MD5 is the default for parity with the legacy JS utilities, and it is also used as the
    fallback when the requested algorithm is unavailable. Do not use this for
    security-sensitive purposes or where collision resistance matters.

    Args:
        message: Text to hash.
        hash_function: Name of the hash function. Falls back to MD5 if unavailable.

    Returns:
        str: Hex digest of the hashed message.
    """
    digest = _get_hasher(hash_function)
    encoded_message = message.encode()
    digest.update(encoded_message)
    return digest.hexdigest()


def calculate_hash_from_object(obj: Any, hash_function: str = "md5") -> str:
    """
    Computes the hex digest of a serializable object.

    The object's keys are sorted recursively and the result is dumped to compact JSON
    (no whitespace around separators, non-ASCII characters kept as-is), so the output is
    deterministic. The JSON text is then hashed as a string.

    Args:
        obj: Serializable object to hash.
        hash_function: Name of the hash function. Falls back to MD5 if unavailable.

    Returns:
        str: Hex digest of the hashed object representation.
    """
    normalized = _sort_keys_deep(obj)
    return calculate_hash_from_string(
        json.dumps(normalized, ensure_ascii=False, separators=(",", ":")),
        hash_function,
    )
52 changes: 52 additions & 0 deletions tests/py/unit/test_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import hashlib
import json

import pytest

import mat3ra.utils.hash as hash_utils


def test_calculate_hash_from_string_default_md5():
    """Without an explicit algorithm the digest matches hashlib.md5."""
    text = "hello world"
    reference = hashlib.md5(text.encode())
    assert hash_utils.calculate_hash_from_string(text) == reference.hexdigest()


def test_calculate_hash_from_string_with_algorithm():
    """An explicitly requested algorithm (SHA-1) is the one used for the digest."""
    text = "hello world"
    reference = hashlib.sha1(text.encode())
    actual = hash_utils.calculate_hash_from_string(text, "sha1")
    assert actual == reference.hexdigest()


def test_calculate_hash_from_object_sorts_keys():
    """Keys are sorted at every nesting level, including dicts inside lists, before hashing."""
    unsorted_input = {
        "b": 1,
        "a": {"d": 4, "c": 3},
        "list": [{"y": 2, "x": 1}, 3],
    }
    key_sorted = {"a": {"c": 3, "d": 4}, "b": 1, "list": [{"x": 1, "y": 2}, 3]}
    serialized = json.dumps(key_sorted, separators=(",", ":"), ensure_ascii=False)
    reference = hashlib.md5(serialized.encode()).hexdigest()
    actual = hash_utils.calculate_hash_from_object(unsorted_input)
    assert actual == reference


def test_calculate_hash_from_object_with_algorithm():
    """An explicitly requested algorithm (SHA-256) is applied to the key-sorted JSON."""
    payload = {"b": 1, "a": 2}
    serialized = json.dumps({"a": 2, "b": 1}, separators=(",", ":"), ensure_ascii=False)
    reference = hashlib.sha256(serialized.encode()).hexdigest()
    actual = hash_utils.calculate_hash_from_object(payload, "sha256")
    assert actual == reference


def test_calculate_hash_from_string_invalid_algorithm_falls_back_to_md5():
    """An unknown algorithm name triggers a RuntimeWarning and produces the MD5 digest."""
    text = "fallback"
    reference = hashlib.md5(text.encode()).hexdigest()
    with pytest.warns(RuntimeWarning):
        actual = hash_utils.calculate_hash_from_string(text, "unknown")
        assert actual == reference


def test_calculate_hash_from_object_invalid_algorithm_falls_back_to_md5():
    """An unknown algorithm name triggers a RuntimeWarning and hashes the object with MD5."""
    payload = {"b": 1, "a": 2}
    serialized = json.dumps({"a": 2, "b": 1}, separators=(",", ":"), ensure_ascii=False)
    reference = hashlib.md5(serialized.encode()).hexdigest()
    with pytest.warns(RuntimeWarning):
        actual = hash_utils.calculate_hash_from_object(payload, "unknown")
        assert actual == reference
Loading