From e89d81f249dc2a3b2bd891c6e65e16e651405e05 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Mon, 9 Feb 2026 23:33:29 -0600 Subject: [PATCH 01/15] refactor: move from deprecated pkg_resources --- dataprofiler/labelers/base_data_labeler.py | 20 +++++++++++++------ dataprofiler/labelers/data_labelers.py | 7 +++++-- dataprofiler/labelers/data_processing.py | 8 ++++++-- .../tests/labelers/test_char_tf_load_model.py | 11 ++++++---- .../test_character_level_cnn_model.py | 5 +++-- .../tests/labelers/test_column_name_model.py | 6 ++++-- .../tests/labelers/test_data_labelers.py | 8 +++++--- .../tests/labelers/test_data_processing.py | 6 ++++-- ...st_integration_column_name_data_labeler.py | 5 +++-- .../test_integration_regex_data_labeler.py | 5 +++-- .../tests/labelers/test_regex_model.py | 5 +++-- 11 files changed, 57 insertions(+), 29 deletions(-) diff --git a/dataprofiler/labelers/base_data_labeler.py b/dataprofiler/labelers/base_data_labeler.py index 201f78998..c80754560 100644 --- a/dataprofiler/labelers/base_data_labeler.py +++ b/dataprofiler/labelers/base_data_labeler.py @@ -1,15 +1,17 @@ """Contains abstract classes from which labeler classes will inherit.""" + from __future__ import annotations +import importlib.resources import json import os import sys import warnings +from pathlib import Path from typing import cast import numpy as np import pandas as pd -import pkg_resources from dataprofiler._typing import DataArray @@ -17,7 +19,8 @@ from . import data_processing from .base_model import BaseModel -default_labeler_dir = pkg_resources.resource_filename("resources", "labelers") +with importlib.resources.as_file(importlib.resources.files("resources")) as base: + default_labeler_dir = Path(base) / "labelers" class BaseDataLabeler: @@ -246,7 +249,8 @@ def set_params(self, params: dict) -> None: self._postprocessor.set_params(**params["postprocessor"]) self.check_pipeline( - skip_postprocessor=self._postprocessor is None, error_on_mismatch=False + skip_postprocessor=self._postprocessor is None, + error_on_mismatch=False, ) def add_label(self, label: str, same_as: str = None) -> None: @@ -438,7 +442,9 @@ def get_parameter_overlap_mismatches( messages.append( "Preprocessor and postprocessor value for `{}` do not " "match. {} != {}".format( - param, preprocessor_params[param], postprocessor_params[param] + param, + preprocessor_params[param], + postprocessor_params[param], ) ) if messages: @@ -490,7 +496,8 @@ def _load_parameters(dirpath: str, load_options: dict = None) -> dict[str, dict] "The load_options preprocessor class does not " "match the required DataLabeler preprocessor." "\n {} != {}".format( - processor_class.__class__.__name__, param_processor_class + processor_class.__class__.__name__, + param_processor_class, ) ) params["preprocessor"]["class"] = load_options.get("preprocessor_class") @@ -505,7 +512,8 @@ def _load_parameters(dirpath: str, load_options: dict = None) -> dict[str, dict] raise ValueError( "The load_options postprocessor class does not match " "the required DataLabeler postprocessor.\n {} != {}".format( - processor_class.__class__.__name__, param_processor_class + processor_class.__class__.__name__, + param_processor_class, ) ) params["postprocessor"]["class"] = load_options.get("postprocessor_class") diff --git a/dataprofiler/labelers/data_labelers.py b/dataprofiler/labelers/data_labelers.py index a6d9932b7..2cbe2dccc 100644 --- a/dataprofiler/labelers/data_labelers.py +++ b/dataprofiler/labelers/data_labelers.py @@ -1,17 +1,20 @@ """Module to train and choose between structured and unstructured data labelers.""" + from __future__ import annotations +import importlib.resources import os +from pathlib import Path import pandas as pd -import pkg_resources from .. import data_readers from .base_data_labeler import BaseDataLabeler, TrainableDataLabeler from .base_model import BaseModel from .data_processing import BaseDataPostprocessor, BaseDataPreprocessor -default_labeler_dir = pkg_resources.resource_filename("resources", "labelers") +with importlib.resources.as_file(importlib.resources.files("resources")) as base: + default_labeler_dir = Path(base) / "labelers" def train_structured_labeler( diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index d53980a35..6fa439380 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -1,8 +1,10 @@ """Contains pre-built processors for data labeling/processing.""" + from __future__ import annotations import abc import copy +import importlib import inspect import json import math @@ -11,13 +13,15 @@ import types import warnings from collections import Counter +from pathlib import Path from typing import Any, Generator, Iterable, TypeVar, cast import numpy as np import numpy.typing as npt -import pkg_resources -default_labeler_dir = pkg_resources.resource_filename("resources", "labelers") +with importlib.resources.as_file(importlib.resources.files("resources")) as base: + default_labeler_dir = Path(base) / "labelers" + Processor = TypeVar("Processor", bound="BaseDataProcessor") diff --git a/dataprofiler/tests/labelers/test_char_tf_load_model.py b/dataprofiler/tests/labelers/test_char_tf_load_model.py index c6d70f740..35ad1311f 100644 --- a/dataprofiler/tests/labelers/test_char_tf_load_model.py +++ b/dataprofiler/tests/labelers/test_char_tf_load_model.py @@ -1,19 +1,20 @@ +import importlib.resources import json import os import unittest from io import StringIO +from pathlib import Path from unittest import mock import numpy as np import pandas as pd -import pkg_resources import tensorflow as tf from dataprofiler.labelers.char_load_tf_model import CharLoadTFModel _file_dir = os.path.dirname(os.path.abspath(__file__)) -_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers") - +with importlib.resources.as_file(importlib.resources.files("resources")) as base: + _resource_labeler_dir = Path(base) / "labelers" mock_model_parameters = { "model_path": "project/example/path/fake_model.h5", @@ -303,7 +304,9 @@ def test_param_validation(self, *mocks): "fake_extra_param": "fails", } model = CharLoadTFModel( - self.model_path, label_mapping=self.label_mapping, parameters=parameters + self.model_path, + label_mapping=self.label_mapping, + parameters=parameters, ) model._construct_model() self.assertDictEqual(parameters, model._parameters) diff --git a/dataprofiler/tests/labelers/test_character_level_cnn_model.py b/dataprofiler/tests/labelers/test_character_level_cnn_model.py index e120a9754..ee99809d4 100644 --- a/dataprofiler/tests/labelers/test_character_level_cnn_model.py +++ b/dataprofiler/tests/labelers/test_character_level_cnn_model.py @@ -1,12 +1,13 @@ +import importlib import json import os import unittest from io import StringIO +from pathlib import Path from unittest import mock import numpy as np import pandas as pd -import pkg_resources import tensorflow as tf from dataprofiler.labelers.character_level_cnn_model import ( @@ -15,7 +16,7 @@ ) _file_dir = os.path.dirname(os.path.abspath(__file__)) -_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers") +_resource_labeler_dir = Path(importlib.resources.files("resources")) / "labelers" mock_model_parameters = { diff --git a/dataprofiler/tests/labelers/test_column_name_model.py b/dataprofiler/tests/labelers/test_column_name_model.py index 58f90839e..ca6be0d6c 100644 --- a/dataprofiler/tests/labelers/test_column_name_model.py +++ b/dataprofiler/tests/labelers/test_column_name_model.py @@ -1,18 +1,20 @@ +import importlib import json import os import sys import unittest from io import StringIO +from pathlib import Path from unittest import mock import numpy as np -import pkg_resources import dataprofiler as dp from dataprofiler.labelers.column_name_model import ColumnNameModel _file_dir = os.path.dirname(os.path.abspath(__file__)) -_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers") +_resource_labeler_dir = Path(importlib.resources.files("resources")) / "labelers" + mock_model_parameters = { "true_positive_dict": [ diff --git a/dataprofiler/tests/labelers/test_data_labelers.py b/dataprofiler/tests/labelers/test_data_labelers.py index bbde1c506..1ac6f277a 100644 --- a/dataprofiler/tests/labelers/test_data_labelers.py +++ b/dataprofiler/tests/labelers/test_data_labelers.py @@ -149,11 +149,13 @@ def test_load_from_library(self, *mocks): @mock.patch("tensorflow.keras.models.load_model") def test_load_from_disk(self, *mocks): - import pkg_resources + import importlib + from pathlib import Path - default_labeler_dir = pkg_resources.resource_filename( - "resources", "labelers/structured_model" + default_labeler_dir = ( + Path(importlib.resources.files("resources")) / "labelers/structured_model" ) + data_labeler = dp.DataLabeler.load_from_disk(default_labeler_dir) self.assertIsInstance(data_labeler, BaseDataLabeler) diff --git a/dataprofiler/tests/labelers/test_data_processing.py b/dataprofiler/tests/labelers/test_data_processing.py index 00b4b088b..5ee50996c 100644 --- a/dataprofiler/tests/labelers/test_data_processing.py +++ b/dataprofiler/tests/labelers/test_data_processing.py @@ -1,13 +1,14 @@ +import importlib import json import os import random import re import unittest from io import StringIO +from pathlib import Path from unittest import mock import numpy as np -import pkg_resources from dataprofiler.labelers.data_processing import ( BaseDataProcessor, @@ -224,7 +225,8 @@ def test_load_from_library(self, mocked_load, *mocks): BaseDataProcessor.load_from_library("default") # assert called with proper load_processor dirpath - default_labeler_dir = pkg_resources.resource_filename("resources", "labelers") + default_labeler_dir = Path(importlib.resources.files("resources")) / "labelers" + mocked_load.assert_called_with(os.path.join(default_labeler_dir, "default")) @mock.patch("builtins.open") diff --git a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py index bcc136ae3..f20dfd99d 100644 --- a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py @@ -1,7 +1,8 @@ +import importlib import unittest +from pathlib import Path import numpy as np -import pkg_resources import dataprofiler as dp from dataprofiler.labelers.column_name_model import ColumnNameModel @@ -11,7 +12,7 @@ DirectPassPreprocessor, ) -default_labeler_dir = pkg_resources.resource_filename("resources", "labelers") +default_labeler_dir = Path(importlib.resources.files("resources")) / "labelers" class TestColumnNameDataLabeler(unittest.TestCase): diff --git a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py index 1cb753723..0c3c7bb70 100644 --- a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py @@ -1,12 +1,13 @@ +import importlib import os import unittest +from pathlib import Path import numpy as np -import pkg_resources from dataprofiler.labelers.data_labelers import BaseDataLabeler -default_labeler_dir = pkg_resources.resource_filename("resources", "labelers") +default_labeler_dir = Path(importlib.resources.files("resources")) / "labelers" class TestRegexDataLabeler(unittest.TestCase): diff --git a/dataprofiler/tests/labelers/test_regex_model.py b/dataprofiler/tests/labelers/test_regex_model.py index 6a279307e..4534572a9 100644 --- a/dataprofiler/tests/labelers/test_regex_model.py +++ b/dataprofiler/tests/labelers/test_regex_model.py @@ -1,16 +1,17 @@ +import importlib import json import os import unittest from io import StringIO +from pathlib import Path from unittest import mock import numpy as np -import pkg_resources from dataprofiler.labelers.regex_model import RegexModel _file_dir = os.path.dirname(os.path.abspath(__file__)) -_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers") +_resource_labeler_dir = Path(importlib.resources.files("resources")) / "labelers" mock_model_parameters = { From e11fe175cf31c27d0489a3f5a1bcbfe89ee58d7f Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Mon, 9 Feb 2026 23:40:12 -0600 Subject: [PATCH 02/15] fix: base str traversal --- .../tests/labelers/test_character_level_cnn_model.py | 3 ++- dataprofiler/tests/labelers/test_column_name_model.py | 3 ++- dataprofiler/tests/labelers/test_data_labelers.py | 7 ++++--- dataprofiler/tests/labelers/test_data_processing.py | 5 ++++- .../labelers/test_integration_column_name_data_labeler.py | 3 ++- .../tests/labelers/test_integration_regex_data_labeler.py | 3 ++- dataprofiler/tests/labelers/test_regex_model.py | 3 ++- 7 files changed, 18 insertions(+), 9 deletions(-) diff --git a/dataprofiler/tests/labelers/test_character_level_cnn_model.py b/dataprofiler/tests/labelers/test_character_level_cnn_model.py index ee99809d4..7a0ce5465 100644 --- a/dataprofiler/tests/labelers/test_character_level_cnn_model.py +++ b/dataprofiler/tests/labelers/test_character_level_cnn_model.py @@ -16,7 +16,8 @@ ) _file_dir = os.path.dirname(os.path.abspath(__file__)) -_resource_labeler_dir = Path(importlib.resources.files("resources")) / "labelers" +with importlib.resources.as_file(importlib.resources.files("resources")) as base: + _resource_labeler_dir = Path(base) / "labelers" mock_model_parameters = { diff --git a/dataprofiler/tests/labelers/test_column_name_model.py b/dataprofiler/tests/labelers/test_column_name_model.py index ca6be0d6c..7b3e81422 100644 --- a/dataprofiler/tests/labelers/test_column_name_model.py +++ b/dataprofiler/tests/labelers/test_column_name_model.py @@ -13,7 +13,8 @@ from dataprofiler.labelers.column_name_model import ColumnNameModel _file_dir = os.path.dirname(os.path.abspath(__file__)) -_resource_labeler_dir = Path(importlib.resources.files("resources")) / "labelers" +with importlib.resources.as_file(importlib.resources.files("resources")) as base: + _resource_labeler_dir = Path(base) / "labelers" mock_model_parameters = { diff --git a/dataprofiler/tests/labelers/test_data_labelers.py b/dataprofiler/tests/labelers/test_data_labelers.py index 1ac6f277a..bcc833434 100644 --- a/dataprofiler/tests/labelers/test_data_labelers.py +++ b/dataprofiler/tests/labelers/test_data_labelers.py @@ -152,9 +152,10 @@ def test_load_from_disk(self, *mocks): import importlib from pathlib import Path - default_labeler_dir = ( - Path(importlib.resources.files("resources")) / "labelers/structured_model" - ) + with importlib.resources.as_file( + importlib.resources.files("resources") + ) as base: + default_labeler_dir = Path(base) / "labelers/structured_model" data_labeler = dp.DataLabeler.load_from_disk(default_labeler_dir) self.assertIsInstance(data_labeler, BaseDataLabeler) diff --git a/dataprofiler/tests/labelers/test_data_processing.py b/dataprofiler/tests/labelers/test_data_processing.py index 5ee50996c..562c67dd9 100644 --- a/dataprofiler/tests/labelers/test_data_processing.py +++ b/dataprofiler/tests/labelers/test_data_processing.py @@ -225,7 +225,10 @@ def test_load_from_library(self, mocked_load, *mocks): BaseDataProcessor.load_from_library("default") # assert called with proper load_processor dirpath - default_labeler_dir = Path(importlib.resources.files("resources")) / "labelers" + with importlib.resources.as_file( + importlib.resources.files("resources") + ) as base: + default_labeler_dir = Path(base) / "labelers" mocked_load.assert_called_with(os.path.join(default_labeler_dir, "default")) diff --git a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py index f20dfd99d..df774fc50 100644 --- a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py @@ -12,7 +12,8 @@ DirectPassPreprocessor, ) -default_labeler_dir = Path(importlib.resources.files("resources")) / "labelers" +with importlib.resources.as_file(importlib.resources.files("resources")) as base: + default_labeler_dir = Path(base) / "labelers" class TestColumnNameDataLabeler(unittest.TestCase): diff --git a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py index 0c3c7bb70..c817a8a7f 100644 --- a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py @@ -7,7 +7,8 @@ from dataprofiler.labelers.data_labelers import BaseDataLabeler -default_labeler_dir = Path(importlib.resources.files("resources")) / "labelers" +with importlib.resources.as_file(importlib.resources.files("resources")) as base: + default_labeler_dir = Path(base) / "labelers" class TestRegexDataLabeler(unittest.TestCase): diff --git a/dataprofiler/tests/labelers/test_regex_model.py b/dataprofiler/tests/labelers/test_regex_model.py index 4534572a9..507483710 100644 --- a/dataprofiler/tests/labelers/test_regex_model.py +++ b/dataprofiler/tests/labelers/test_regex_model.py @@ -11,7 +11,8 @@ from dataprofiler.labelers.regex_model import RegexModel _file_dir = os.path.dirname(os.path.abspath(__file__)) -_resource_labeler_dir = Path(importlib.resources.files("resources")) / "labelers" +with importlib.resources.as_file(importlib.resources.files("resources")) as base: + _resource_labeler_dir = Path(base) / "labelers" mock_model_parameters = { From 83ccf099b99c23274aedafdf030f7f30915185cf Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 10 Feb 2026 00:45:55 -0600 Subject: [PATCH 03/15] fix: to use func --- dataprofiler/labelers/base_data_labeler.py | 7 ++---- dataprofiler/labelers/data_labelers.py | 6 ++--- dataprofiler/labelers/utils.py | 23 +++++++++++++++++++ .../tests/labelers/test_char_tf_load_model.py | 6 ++--- .../test_character_level_cnn_model.py | 6 ++--- .../tests/labelers/test_column_name_model.py | 6 ++--- .../tests/labelers/test_data_labelers.py | 8 ++----- .../tests/labelers/test_data_processing.py | 12 +++++----- ...st_integration_column_name_data_labeler.py | 6 ++--- .../test_integration_regex_data_labeler.py | 15 ++++++++---- .../tests/labelers/test_regex_model.py | 9 ++++---- 11 files changed, 57 insertions(+), 47 deletions(-) diff --git a/dataprofiler/labelers/base_data_labeler.py b/dataprofiler/labelers/base_data_labeler.py index c80754560..871c050ef 100644 --- a/dataprofiler/labelers/base_data_labeler.py +++ b/dataprofiler/labelers/base_data_labeler.py @@ -2,12 +2,10 @@ from __future__ import annotations -import importlib.resources import json import os import sys import warnings -from pathlib import Path from typing import cast import numpy as np @@ -16,11 +14,10 @@ from dataprofiler._typing import DataArray from .. import data_readers -from . import data_processing +from . import data_processing, utils from .base_model import BaseModel -with importlib.resources.as_file(importlib.resources.files("resources")) as base: - default_labeler_dir = Path(base) / "labelers" +default_labeler_dir = utils.find_resources_dir() / "labelers" class BaseDataLabeler: diff --git a/dataprofiler/labelers/data_labelers.py b/dataprofiler/labelers/data_labelers.py index 2cbe2dccc..5d69fb1a6 100644 --- a/dataprofiler/labelers/data_labelers.py +++ b/dataprofiler/labelers/data_labelers.py @@ -2,19 +2,17 @@ from __future__ import annotations -import importlib.resources import os -from pathlib import Path import pandas as pd from .. import data_readers +from . import utils from .base_data_labeler import BaseDataLabeler, TrainableDataLabeler from .base_model import BaseModel from .data_processing import BaseDataPostprocessor, BaseDataPreprocessor -with importlib.resources.as_file(importlib.resources.files("resources")) as base: - default_labeler_dir = Path(base) / "labelers" +default_labeler_dir = utils.find_resources_dir() / "labelers" def train_structured_labeler( diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py index 2d587f7b4..ad4e7fcc8 100644 --- a/dataprofiler/labelers/utils.py +++ b/dataprofiler/labelers/utils.py @@ -1,6 +1,9 @@ """Contains functions for checking for installations/dependencies.""" + import sys +import sysconfig import warnings +from pathlib import Path from typing import Any, Callable, List @@ -50,3 +53,23 @@ def new_f(*args: Any, **kwds: Any) -> Any: return new_f return check_module + + +def find_resources_dir() -> Path: + """Return the path to the package resources for the labeler.""" + # 1) Installed location from data_files: /resources + prefix = Path(sysconfig.get_path("data")) + installed = prefix / "resources" + if installed.exists(): + return installed + + # 2) Source tree fallback (works in editable installs / tests) + # Adjust the anchor file to something inside your package. + here = Path(__file__).resolve() + # Walk upwards to find repo root that contains "resources/labelers" + for parent in [here, *here.parents]: + candidate = parent / "resources" + if candidate.exists(): + return candidate + + raise FileNotFoundError("Could not locate resources (installed or source tree).") diff --git a/dataprofiler/tests/labelers/test_char_tf_load_model.py b/dataprofiler/tests/labelers/test_char_tf_load_model.py index 35ad1311f..61173e571 100644 --- a/dataprofiler/tests/labelers/test_char_tf_load_model.py +++ b/dataprofiler/tests/labelers/test_char_tf_load_model.py @@ -1,20 +1,18 @@ -import importlib.resources import json import os import unittest from io import StringIO -from pathlib import Path from unittest import mock import numpy as np import pandas as pd import tensorflow as tf +from dataprofiler.labelers import utils as labeler_utils from dataprofiler.labelers.char_load_tf_model import CharLoadTFModel _file_dir = os.path.dirname(os.path.abspath(__file__)) -with importlib.resources.as_file(importlib.resources.files("resources")) as base: - _resource_labeler_dir = Path(base) / "labelers" +default_labeler_dir = labeler_utils.find_resources_dir() / "labelers" mock_model_parameters = { "model_path": "project/example/path/fake_model.h5", diff --git a/dataprofiler/tests/labelers/test_character_level_cnn_model.py b/dataprofiler/tests/labelers/test_character_level_cnn_model.py index 7a0ce5465..530dda2ac 100644 --- a/dataprofiler/tests/labelers/test_character_level_cnn_model.py +++ b/dataprofiler/tests/labelers/test_character_level_cnn_model.py @@ -1,23 +1,21 @@ -import importlib import json import os import unittest from io import StringIO -from pathlib import Path from unittest import mock import numpy as np import pandas as pd import tensorflow as tf +from dataprofiler.labelers import utils as labeler_utils from dataprofiler.labelers.character_level_cnn_model import ( CharacterLevelCnnModel, EncodingLayer, ) _file_dir = os.path.dirname(os.path.abspath(__file__)) -with importlib.resources.as_file(importlib.resources.files("resources")) as base: - _resource_labeler_dir = Path(base) / "labelers" +_resource_labeler_dir = labeler_utils.find_resources_dir() / "labelers" mock_model_parameters = { diff --git a/dataprofiler/tests/labelers/test_column_name_model.py b/dataprofiler/tests/labelers/test_column_name_model.py index 7b3e81422..e3326a394 100644 --- a/dataprofiler/tests/labelers/test_column_name_model.py +++ b/dataprofiler/tests/labelers/test_column_name_model.py @@ -1,20 +1,18 @@ -import importlib import json import os import sys import unittest from io import StringIO -from pathlib import Path from unittest import mock import numpy as np import dataprofiler as dp +from dataprofiler.labelers import utils as labeler_utils from dataprofiler.labelers.column_name_model import ColumnNameModel _file_dir = os.path.dirname(os.path.abspath(__file__)) -with importlib.resources.as_file(importlib.resources.files("resources")) as base: - _resource_labeler_dir = Path(base) / "labelers" +_resource_labeler_dir = labeler_utils.find_resources_dir() / "labelers" mock_model_parameters = { diff --git a/dataprofiler/tests/labelers/test_data_labelers.py b/dataprofiler/tests/labelers/test_data_labelers.py index bcc833434..c52187bef 100644 --- a/dataprofiler/tests/labelers/test_data_labelers.py +++ b/dataprofiler/tests/labelers/test_data_labelers.py @@ -149,13 +149,9 @@ def test_load_from_library(self, *mocks): @mock.patch("tensorflow.keras.models.load_model") def test_load_from_disk(self, *mocks): - import importlib - from pathlib import Path + from dataprofiler.labelers import utils as labeler_utils - with importlib.resources.as_file( - importlib.resources.files("resources") - ) as base: - default_labeler_dir = Path(base) / "labelers/structured_model" + default_labeler_dir = labeler_utils.find_resources_dir() / "labelers" data_labeler = dp.DataLabeler.load_from_disk(default_labeler_dir) self.assertIsInstance(data_labeler, BaseDataLabeler) diff --git a/dataprofiler/tests/labelers/test_data_processing.py b/dataprofiler/tests/labelers/test_data_processing.py index 562c67dd9..9d386d8d9 100644 --- a/dataprofiler/tests/labelers/test_data_processing.py +++ b/dataprofiler/tests/labelers/test_data_processing.py @@ -1,15 +1,17 @@ -import importlib +pass import json import os import random import re import unittest from io import StringIO -from pathlib import Path + +pass from unittest import mock import numpy as np +from dataprofiler.labelers import utils as labeler_utils from dataprofiler.labelers.data_processing import ( BaseDataProcessor, CharEncodedPreprocessor, @@ -225,10 +227,8 @@ def test_load_from_library(self, mocked_load, *mocks): BaseDataProcessor.load_from_library("default") # assert called with proper load_processor dirpath - with importlib.resources.as_file( - importlib.resources.files("resources") - ) as base: - default_labeler_dir = Path(base) / "labelers" + + default_labeler_dir = labeler_utils.find_resources_dir() / "labelers" mocked_load.assert_called_with(os.path.join(default_labeler_dir, "default")) diff --git a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py index df774fc50..5d2307458 100644 --- a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py @@ -1,10 +1,9 @@ -import importlib import unittest -from pathlib import Path import numpy as np import dataprofiler as dp +from dataprofiler.labelers import utils as labeler_utils from dataprofiler.labelers.column_name_model import ColumnNameModel from dataprofiler.labelers.data_labelers import BaseDataLabeler from dataprofiler.labelers.data_processing import ( @@ -12,8 +11,7 @@ DirectPassPreprocessor, ) -with importlib.resources.as_file(importlib.resources.files("resources")) as base: - default_labeler_dir = Path(base) / "labelers" +default_labeler_dir = labeler_utils.find_resources_dir() / "labelers" class TestColumnNameDataLabeler(unittest.TestCase): diff --git a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py index c817a8a7f..7c729ccaf 100644 --- a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py @@ -1,21 +1,26 @@ -import importlib import os import unittest -from pathlib import Path import numpy as np +from dataprofiler.labelers import utils as labeler_utils from dataprofiler.labelers.data_labelers import BaseDataLabeler -with importlib.resources.as_file(importlib.resources.files("resources")) as base: - default_labeler_dir = Path(base) / "labelers" +default_labeler_dir = labeler_utils.find_resources_dir() / "labelers" class TestRegexDataLabeler(unittest.TestCase): @classmethod def setUpClass(cls) -> None: cls.data = np.array( - ["123 Fake St.", "1/2/2020", "nice.", "4/3/22", "abc", "333-44-2341"] + [ + "123 Fake St.", + "1/2/2020", + "nice.", + "4/3/22", + "abc", + "333-44-2341", + ] ).reshape((-1,)) cls.data_labeler = BaseDataLabeler.load_from_disk( os.path.join(default_labeler_dir, "regex_model") diff --git a/dataprofiler/tests/labelers/test_regex_model.py b/dataprofiler/tests/labelers/test_regex_model.py index 507483710..91a2dfff3 100644 --- a/dataprofiler/tests/labelers/test_regex_model.py +++ b/dataprofiler/tests/labelers/test_regex_model.py @@ -1,18 +1,16 @@ -import importlib import json import os import unittest from io import StringIO -from pathlib import Path from unittest import mock import numpy as np +from dataprofiler.labelers import utils as labeler_utils from dataprofiler.labelers.regex_model import RegexModel _file_dir = os.path.dirname(os.path.abspath(__file__)) -with importlib.resources.as_file(importlib.resources.files("resources")) as base: - _resource_labeler_dir = Path(base) / "labelers" +_resource_labeler_dir = labeler_utils.find_resources_dir() / "labelers" mock_model_parameters = { @@ -163,7 +161,8 @@ def test_param_validation(self): for invalid_param_set in invalid_parameters: with self.assertRaises(ValueError): RegexModel( - label_mapping=self.label_mapping, parameters=invalid_param_set + label_mapping=self.label_mapping, + parameters=invalid_param_set, ) @mock.patch("sys.stdout", new_callable=StringIO) From 8a0af0bda6d1bf486cf34c7de8079eb04642003b Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 10 Feb 2026 00:53:29 -0600 Subject: [PATCH 04/15] fix: add missing change --- dataprofiler/labelers/data_processing.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index 6fa439380..c4517a0e3 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -4,7 +4,6 @@ import abc import copy -import importlib import inspect import json import math @@ -13,14 +12,14 @@ import types import warnings from collections import Counter -from pathlib import Path from typing import Any, Generator, Iterable, TypeVar, cast import numpy as np import numpy.typing as npt -with importlib.resources.as_file(importlib.resources.files("resources")) as base: - default_labeler_dir = Path(base) / "labelers" +from . import utils + +default_labeler_dir = utils.find_resources_dir() / "labelers" Processor = TypeVar("Processor", bound="BaseDataProcessor") From ef42a14bb10638c165edf76abfce1ef4b62f26a3 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Thu, 12 Feb 2026 12:32:30 -0600 Subject: [PATCH 05/15] refactor: move from data_files to package_files --- MANIFEST.in | 1 + setup.py | 12 +++--------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 3f426b7bb..ce4044adb 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -16,6 +16,7 @@ recursive-include dataprofiler *.txt recursive-include resources *.json recursive-include resources *.pb recursive-include resources *.py +recursive-include resources *.keras recursive-include dataprofiler/labelers/embeddings *.txt include versioneer.py diff --git a/setup.py b/setup.py index f1f799446..277d6d521 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,5 @@ """A setuptools for the Data Profiler Application and Python Libraries.""" -import os - # To use a consistent encoding from codecs import open from os import path @@ -42,9 +40,6 @@ reports_packages = f.read().splitlines() resource_dir = "resources" -default_labeler_files = [ - (d, [os.path.join(d, f) for f in files]) for d, _, files in os.walk(resource_dir) -] DESCRIPTION = ( @@ -110,15 +105,14 @@ # # If there are data files included in your packages that need to be # # installed, specify them here. If using Python 2.6 or less, then these # # have to be included in MANIFEST.in as well. - # package_data={ - # 'data': [], - # }, + package_data={ + "data": [f"{resource_dir}/*"], + }, # # # Although 'package_data' is the preferred approach, in some case you may # # need to place data files outside of your packages. See: # # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa # # In this case, 'data_file' will be installed into '/my_data' - data_files=default_labeler_files, include_package_data=True, ) From c946fc6808db619fef1c58ddc7f597f08210a27b Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Thu, 12 Feb 2026 13:21:49 -0600 Subject: [PATCH 06/15] refactor: resources to be in package --- MANIFEST.in | 8 ++++---- dataprofiler/labelers/utils.py | 18 +++--------------- .../data_labeler_parameters.json | 0 .../column_name_labeler/label_mapping.json | 0 .../column_name_labeler/model_parameters.json | 0 .../postprocessor_parameters.json | 0 .../preprocessor_parameters.json | 0 .../regex_model/data_labeler_parameters.json | 0 .../labelers/regex_model/label_mapping.json | 0 .../regex_model/model_parameters.json | 0 .../regex_model/postprocessor_parameters.json | 0 .../regex_model/preprocessor_parameters.json | 0 .../data_labeler_parameters.json | 0 .../structured_model/label_mapping.json | 0 .../labelers/structured_model/model.keras | Bin .../structured_model/model_parameters.json | 0 .../postprocessor_parameters.json | 0 .../preprocessor_parameters.json | 0 .../data_labeler_parameters.json | 0 .../unstructured_model/label_mapping.json | 0 .../labelers/unstructured_model/model.keras | Bin .../unstructured_model/model_parameters.json | 0 .../postprocessor_parameters.json | 0 .../preprocessor_parameters.json | 0 resources/__init__.py | 2 -- setup.py | 2 +- 26 files changed, 8 insertions(+), 22 deletions(-) rename {resources => dataprofiler/resources}/labelers/column_name_labeler/data_labeler_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/column_name_labeler/label_mapping.json (100%) rename {resources => dataprofiler/resources}/labelers/column_name_labeler/model_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/column_name_labeler/postprocessor_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/column_name_labeler/preprocessor_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/regex_model/data_labeler_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/regex_model/label_mapping.json (100%) rename {resources => dataprofiler/resources}/labelers/regex_model/model_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/regex_model/postprocessor_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/regex_model/preprocessor_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/structured_model/data_labeler_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/structured_model/label_mapping.json (100%) rename {resources => dataprofiler/resources}/labelers/structured_model/model.keras (100%) rename {resources => dataprofiler/resources}/labelers/structured_model/model_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/structured_model/postprocessor_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/structured_model/preprocessor_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/unstructured_model/data_labeler_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/unstructured_model/label_mapping.json (100%) rename {resources => dataprofiler/resources}/labelers/unstructured_model/model.keras (100%) rename {resources => dataprofiler/resources}/labelers/unstructured_model/model_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/unstructured_model/postprocessor_parameters.json (100%) rename {resources => dataprofiler/resources}/labelers/unstructured_model/preprocessor_parameters.json (100%) delete mode 100644 resources/__init__.py diff --git a/MANIFEST.in b/MANIFEST.in index ce4044adb..bafea0779 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -13,10 +13,10 @@ recursive-include dataprofiler *.parquet recursive-include dataprofiler *.py recursive-include dataprofiler *.txt -recursive-include resources *.json -recursive-include resources *.pb -recursive-include resources *.py -recursive-include resources *.keras +recursive-include dataprofiler/resources *.json +recursive-include dataprofiler/resources *.pb +recursive-include dataprofiler/resources *.py +recursive-include dataprofiler/resources *.keras recursive-include dataprofiler/labelers/embeddings *.txt include versioneer.py diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py index ad4e7fcc8..a66a0b126 100644 --- a/dataprofiler/labelers/utils.py +++ b/dataprofiler/labelers/utils.py @@ -1,7 +1,7 @@ """Contains functions for checking for installations/dependencies.""" +import importlib.resources import sys -import sysconfig import warnings from pathlib import Path from typing import Any, Callable, List @@ -57,19 +57,7 @@ def new_f(*args: Any, **kwds: Any) -> Any: def find_resources_dir() -> Path: """Return the path to the package resources for the labeler.""" - # 1) Installed location from data_files: /resources - prefix = Path(sysconfig.get_path("data")) - installed = prefix / "resources" - if installed.exists(): - return installed - - # 2) Source tree fallback (works in editable installs / tests) - # Adjust the anchor file to something inside your package. - here = Path(__file__).resolve() - # Walk upwards to find repo root that contains "resources/labelers" - for parent in [here, *here.parents]: - candidate = parent / "resources" - if candidate.exists(): - return candidate + with importlib.resources.as_file(importlib.resources.files("resources")) as base: + return Path(base) raise FileNotFoundError("Could not locate resources (installed or source tree).") diff --git a/resources/labelers/column_name_labeler/data_labeler_parameters.json b/dataprofiler/resources/labelers/column_name_labeler/data_labeler_parameters.json similarity index 100% rename from resources/labelers/column_name_labeler/data_labeler_parameters.json rename to dataprofiler/resources/labelers/column_name_labeler/data_labeler_parameters.json diff --git a/resources/labelers/column_name_labeler/label_mapping.json b/dataprofiler/resources/labelers/column_name_labeler/label_mapping.json similarity index 100% rename from resources/labelers/column_name_labeler/label_mapping.json rename to dataprofiler/resources/labelers/column_name_labeler/label_mapping.json diff --git a/resources/labelers/column_name_labeler/model_parameters.json b/dataprofiler/resources/labelers/column_name_labeler/model_parameters.json similarity index 100% rename from resources/labelers/column_name_labeler/model_parameters.json rename to dataprofiler/resources/labelers/column_name_labeler/model_parameters.json diff --git a/resources/labelers/column_name_labeler/postprocessor_parameters.json b/dataprofiler/resources/labelers/column_name_labeler/postprocessor_parameters.json similarity index 100% rename from resources/labelers/column_name_labeler/postprocessor_parameters.json rename to dataprofiler/resources/labelers/column_name_labeler/postprocessor_parameters.json diff --git a/resources/labelers/column_name_labeler/preprocessor_parameters.json b/dataprofiler/resources/labelers/column_name_labeler/preprocessor_parameters.json similarity index 100% rename from resources/labelers/column_name_labeler/preprocessor_parameters.json rename to dataprofiler/resources/labelers/column_name_labeler/preprocessor_parameters.json diff --git a/resources/labelers/regex_model/data_labeler_parameters.json b/dataprofiler/resources/labelers/regex_model/data_labeler_parameters.json similarity index 100% rename from resources/labelers/regex_model/data_labeler_parameters.json rename to dataprofiler/resources/labelers/regex_model/data_labeler_parameters.json diff --git a/resources/labelers/regex_model/label_mapping.json b/dataprofiler/resources/labelers/regex_model/label_mapping.json similarity index 100% rename from resources/labelers/regex_model/label_mapping.json rename to dataprofiler/resources/labelers/regex_model/label_mapping.json diff --git a/resources/labelers/regex_model/model_parameters.json b/dataprofiler/resources/labelers/regex_model/model_parameters.json similarity index 100% rename from resources/labelers/regex_model/model_parameters.json rename to dataprofiler/resources/labelers/regex_model/model_parameters.json diff --git a/resources/labelers/regex_model/postprocessor_parameters.json b/dataprofiler/resources/labelers/regex_model/postprocessor_parameters.json similarity index 100% rename from resources/labelers/regex_model/postprocessor_parameters.json rename to dataprofiler/resources/labelers/regex_model/postprocessor_parameters.json diff --git a/resources/labelers/regex_model/preprocessor_parameters.json b/dataprofiler/resources/labelers/regex_model/preprocessor_parameters.json similarity index 100% rename from resources/labelers/regex_model/preprocessor_parameters.json rename to dataprofiler/resources/labelers/regex_model/preprocessor_parameters.json diff --git a/resources/labelers/structured_model/data_labeler_parameters.json b/dataprofiler/resources/labelers/structured_model/data_labeler_parameters.json similarity index 100% rename from resources/labelers/structured_model/data_labeler_parameters.json rename to dataprofiler/resources/labelers/structured_model/data_labeler_parameters.json diff --git a/resources/labelers/structured_model/label_mapping.json b/dataprofiler/resources/labelers/structured_model/label_mapping.json similarity index 100% rename from resources/labelers/structured_model/label_mapping.json rename to dataprofiler/resources/labelers/structured_model/label_mapping.json diff --git a/resources/labelers/structured_model/model.keras b/dataprofiler/resources/labelers/structured_model/model.keras similarity index 100% rename from resources/labelers/structured_model/model.keras rename to dataprofiler/resources/labelers/structured_model/model.keras diff --git a/resources/labelers/structured_model/model_parameters.json b/dataprofiler/resources/labelers/structured_model/model_parameters.json similarity index 100% rename from resources/labelers/structured_model/model_parameters.json rename to dataprofiler/resources/labelers/structured_model/model_parameters.json diff --git a/resources/labelers/structured_model/postprocessor_parameters.json b/dataprofiler/resources/labelers/structured_model/postprocessor_parameters.json similarity index 100% rename from resources/labelers/structured_model/postprocessor_parameters.json rename to dataprofiler/resources/labelers/structured_model/postprocessor_parameters.json diff --git a/resources/labelers/structured_model/preprocessor_parameters.json b/dataprofiler/resources/labelers/structured_model/preprocessor_parameters.json similarity index 100% rename from resources/labelers/structured_model/preprocessor_parameters.json rename to dataprofiler/resources/labelers/structured_model/preprocessor_parameters.json diff --git a/resources/labelers/unstructured_model/data_labeler_parameters.json b/dataprofiler/resources/labelers/unstructured_model/data_labeler_parameters.json similarity index 100% rename from resources/labelers/unstructured_model/data_labeler_parameters.json rename to dataprofiler/resources/labelers/unstructured_model/data_labeler_parameters.json diff --git a/resources/labelers/unstructured_model/label_mapping.json b/dataprofiler/resources/labelers/unstructured_model/label_mapping.json similarity index 100% rename from resources/labelers/unstructured_model/label_mapping.json rename to dataprofiler/resources/labelers/unstructured_model/label_mapping.json diff --git a/resources/labelers/unstructured_model/model.keras b/dataprofiler/resources/labelers/unstructured_model/model.keras similarity index 100% rename from resources/labelers/unstructured_model/model.keras rename to dataprofiler/resources/labelers/unstructured_model/model.keras diff --git a/resources/labelers/unstructured_model/model_parameters.json b/dataprofiler/resources/labelers/unstructured_model/model_parameters.json similarity index 100% rename from resources/labelers/unstructured_model/model_parameters.json rename to dataprofiler/resources/labelers/unstructured_model/model_parameters.json diff --git a/resources/labelers/unstructured_model/postprocessor_parameters.json b/dataprofiler/resources/labelers/unstructured_model/postprocessor_parameters.json similarity index 100% rename from resources/labelers/unstructured_model/postprocessor_parameters.json rename to dataprofiler/resources/labelers/unstructured_model/postprocessor_parameters.json diff --git a/resources/labelers/unstructured_model/preprocessor_parameters.json b/dataprofiler/resources/labelers/unstructured_model/preprocessor_parameters.json similarity index 100% rename from resources/labelers/unstructured_model/preprocessor_parameters.json rename to dataprofiler/resources/labelers/unstructured_model/preprocessor_parameters.json diff --git a/resources/__init__.py b/resources/__init__.py deleted file mode 100644 index dd86bffe7..000000000 --- a/resources/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""Contains resources for labelers.""" -# empty init diff --git a/setup.py b/setup.py index 277d6d521..19abdf844 100644 --- a/setup.py +++ b/setup.py @@ -106,7 +106,7 @@ # # installed, specify them here. If using Python 2.6 or less, then these # # have to be included in MANIFEST.in as well. package_data={ - "data": [f"{resource_dir}/*"], + "dataprofiler": [f"dataprofiler/{resource_dir}/*"], }, # # # Although 'package_data' is the preferred approach, in some case you may From e0f80b4f10525e9932bcf8d6b76aa7c2c75b6e60 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Thu, 12 Feb 2026 14:02:46 -0600 Subject: [PATCH 07/15] fix: utils --- dataprofiler/labelers/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py index a66a0b126..83831293a 100644 --- a/dataprofiler/labelers/utils.py +++ b/dataprofiler/labelers/utils.py @@ -57,7 +57,9 @@ def new_f(*args: Any, **kwds: Any) -> Any: def find_resources_dir() -> Path: """Return the path to the package resources for the labeler.""" - with importlib.resources.as_file(importlib.resources.files("resources")) as base: + with importlib.resources.as_file( + importlib.resources.files("dataprofiler").joinpath("resources") + ) as base: return Path(base) raise FileNotFoundError("Could not locate resources (installed or source tree).") From 11a394dc2f40b5c8df92d847938847c835916cd3 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Thu, 12 Feb 2026 14:16:32 -0600 Subject: [PATCH 08/15] fix: path error --- dataprofiler/tests/labelers/test_data_labelers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dataprofiler/tests/labelers/test_data_labelers.py b/dataprofiler/tests/labelers/test_data_labelers.py index c52187bef..73e4caf60 100644 --- a/dataprofiler/tests/labelers/test_data_labelers.py +++ b/dataprofiler/tests/labelers/test_data_labelers.py @@ -151,7 +151,9 @@ def test_load_from_library(self, *mocks): def test_load_from_disk(self, *mocks): from dataprofiler.labelers import utils as labeler_utils - default_labeler_dir = labeler_utils.find_resources_dir() / "labelers" + default_labeler_dir = ( + labeler_utils.find_resources_dir() / "labelers/structured_model" + ) data_labeler = dp.DataLabeler.load_from_disk(default_labeler_dir) self.assertIsInstance(data_labeler, BaseDataLabeler) From 60e58551a5d392ab5437393477167e3ae63a1ff3 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Thu, 12 Feb 2026 14:22:33 -0600 Subject: [PATCH 09/15] fix: limit pandas --- .pre-commit-config.yaml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 666cde4b4..3f857386d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,7 +51,7 @@ repos: h5py>=2.10.0, wheel>=0.33.1, numpy<2.0.0, - pandas>=1.1.2, + 'pandas>=1.1.2,<3.0.0', python-dateutil>=2.7.5, pytz>=2020.1, pyarrow>=1.0.1, diff --git a/requirements.txt b/requirements.txt index e32f32851..ec313e34f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ h5py>=2.10.0 wheel>=0.33.1 numpy<2.0.0 -pandas>=1.1.2 +pandas>=1.1.2,<3.0.0 python-dateutil>=2.7.5 pytz>=2020.1 pyarrow>=1.0.1 From 7977fa4c7dfe6fdbcab4ee97fc0a2e25f396db47 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Thu, 12 Feb 2026 14:36:37 -0600 Subject: [PATCH 10/15] fix: str required --- dataprofiler/tests/labelers/test_data_labelers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/tests/labelers/test_data_labelers.py b/dataprofiler/tests/labelers/test_data_labelers.py index 73e4caf60..2862c2f03 100644 --- a/dataprofiler/tests/labelers/test_data_labelers.py +++ b/dataprofiler/tests/labelers/test_data_labelers.py @@ -151,7 +151,7 @@ def test_load_from_library(self, *mocks): def test_load_from_disk(self, *mocks): from dataprofiler.labelers import utils as labeler_utils - default_labeler_dir = ( + default_labeler_dir = str( labeler_utils.find_resources_dir() / "labelers/structured_model" ) From 8e026f9227b0106c7df08526af07774572fc60a3 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Thu, 12 Feb 2026 15:27:52 -0600 Subject: [PATCH 11/15] fix: tests bc of almost --- .../test_categorical_column_profile.py | 47 ++++++++-- .../test_column_profile_compilers.py | 87 ++++++++++++++++--- .../profilers/test_float_column_profile.py | 72 ++++++++++++--- .../profilers/test_int_column_profile.py | 5 ++ .../tests/profilers/test_profile_builder.py | 9 +- .../profilers/test_text_column_profile.py | 78 +++++++++++++++-- 6 files changed, 257 insertions(+), 41 deletions(-) diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 55d2ea68e..0786d0b0f 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -324,7 +324,15 @@ def test_categorical_mapping(self): ) self.assertEqual(num_null_types, len(column_profile.null_types)) self.assertEqual(num_nan_count, len(column_profile.null_types_index["nan"])) - expected = {"abcd": 2, "aa": 2, "b": 1, "4": 1, "3": 1, "2": 2, "dfd": 1} + expected = { + "abcd": 2, + "aa": 2, + "b": 1, + "4": 1, + "3": 1, + "2": 2, + "dfd": 1, + } self.assertDictEqual(expected, cat_profiler._categories) num_null_types = 4 num_nan_count = 2 @@ -448,7 +456,19 @@ def test_categorical_merge(self): ["abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2", np.nan] ) df2 = pd.Series( - ["1", "null", "ee", "NaN", "ff", "nan", "gg", "None", "aa", "b", "ee"] + [ + "1", + "null", + "ee", + "NaN", + "ff", + "nan", + "gg", + "None", + "aa", + "b", + "ee", + ] ) # Expected is based off insertion order @@ -673,10 +693,12 @@ def test_categorical_merge(self): self.assertIsNone(merge_stop_conditions_not_met._stopped_at_unique_count) self.assertIsNone(merge_stop_conditions_not_met._stopped_at_unique_ratio) self.assertEqual( - 0.99, merge_stop_conditions_not_met.stop_condition_unique_value_ratio + 0.99, + merge_stop_conditions_not_met.stop_condition_unique_value_ratio, ) self.assertEqual( - 12, merge_stop_conditions_not_met.max_sample_size_to_check_stop_condition + 12, + merge_stop_conditions_not_met.max_sample_size_to_check_stop_condition, ) def test_gini_impurity(self): @@ -731,6 +753,11 @@ def test_categorical_diff(self): }, } actual_diff = profile.diff(profile2) + self.assertAlmostEqual( + expected_diff.get("statistics").get("chi2-test").pop("p-value"), + actual_diff.get("statistics").get("chi2-test").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, actual_diff) # Test with one categorical column matching @@ -754,7 +781,10 @@ def test_categorical_diff(self): profile2.update(df_not_categorical) expected_diff = { "categorical": [True, False], - "statistics": {"unique_count": -10, "unique_ratio": -0.7142857142857143}, + "statistics": { + "unique_count": -10, + "unique_ratio": -0.7142857142857143, + }, } self.assertDictEqual(expected_diff, profile.diff(profile2)) @@ -984,7 +1014,12 @@ def test_json_decode_after_update(self): deserialized.update(df_categorical) assert deserialized.sample_size == 14 - assert deserialized.categorical_counts == {"c": 5, "b": 4, "a": 4, "d": 1} + assert deserialized.categorical_counts == { + "c": 5, + "b": 4, + "a": 4, + "d": 1, + } def test_cms_max_num_heavy_hitters(self): df_categorical = pd.Series(["a"] * 5 + ["b"] * 5 + ["c"] * 10) diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py index 1e0afc124..e6a5d3f8d 100644 --- a/dataprofiler/tests/profilers/test_column_profile_compilers.py +++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py @@ -64,7 +64,8 @@ def test_add_profilers(self): compiler1.name = "compiler1" compiler2.name = "compiler2" with self.assertRaisesRegex( - ValueError, "Column profile names are unmatched: " "compiler1 != compiler2" + ValueError, + "Column profile names are unmatched: " "compiler1 != compiler2", ): compiler1 + compiler2 @@ -95,7 +96,9 @@ def test_no_profilers_error(self): self.assertEqual("Must add profilers.", str(e.exception)) @mock.patch.multiple( - col_pro_compilers.BaseCompiler, __abstractmethods__=set(), _profilers="mock" + col_pro_compilers.BaseCompiler, + __abstractmethods__=set(), + _profilers="mock", ) def test_no_options_error(self): with self.assertRaisesRegex( @@ -230,7 +233,10 @@ def test_diff_primitive_compilers(self): "stddev": 3.285085839971525, "t-test": { "t-statistic": 0.4155260166386663, - "conservative": {"deg_of_free": 1.0, "p-value": 0.749287157907667}, + "conservative": { + "deg_of_free": 1.0, + "p-value": 0.749287157907667, + }, "welch": { "deg_of_free": 3.6288111187629117, "p-value": 0.7011367179395704, @@ -256,6 +262,19 @@ def test_diff_primitive_compilers(self): profile_diff["statistics"].pop("median_absolute_deviation"), places=2, ) + self.assertAlmostEqual( + expected_diff["statistics"].get("t-test").get("welch").pop("p-value"), + profile_diff["statistics"].get("t-test").get("welch").pop("p-value"), + places=10, + ) + self.assertAlmostEqual( + expected_diff["statistics"] + .get("t-test") + .get("conservative") + .pop("p-value"), + profile_diff["statistics"].get("t-test").get("conservative").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile_diff) # Test different compilers @@ -354,6 +373,11 @@ def test_disabling_columns_during_primitive_diff(self): profile_diff["statistics"].pop("median_absolute_deviation"), places=2, ) + self.assertAlmostEqual( + expected_diff.get("statistics").get("t-test").get("welch").pop("p-value"), + profile_diff.get("statistics").get("t-test").get("welch").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile_diff) # Test disabling all columns in one compiler @@ -576,7 +600,10 @@ def test_json_encode_after_update(self): "data": { "name": "test", "_profiles": { - "order": {"class": "OrderColumn", "data": {"an": "order"}}, + "order": { + "class": "OrderColumn", + "data": {"an": "order"}, + }, "category": { "class": "CategoricalColumn", "data": {"this": "category"}, @@ -713,7 +740,11 @@ def test_compiler_data_labeler_diff(self, *mocks): expected_diff = { "statistics": { "avg_predictions": {"a": "unchanged", "b": -0.7, "c": 0.7}, - "label_representation": {"a": -0.84, "b": "unchanged", "c": 0.84}, + "label_representation": { + "a": -0.84, + "b": "unchanged", + "c": 0.84, + }, }, "data_label": [["a"], [], ["b"]], } @@ -762,7 +793,9 @@ def test_json_encode_after_update(self, mock_instance, *mocks): compiler = col_pro_compilers.ColumnDataLabelerCompiler(data) with mock.patch.object( - compiler._profiles["data_labeler"], "__dict__", {"data_label": "INTEGER"} + compiler._profiles["data_labeler"], + "__dict__", + {"data_label": "INTEGER"}, ): serialized = json.dumps(compiler, cls=ProfileEncoder) @@ -1016,7 +1049,20 @@ def test_compiler_stats_diff(self, *mocks): "statistics": { "vocab": [ ["H", "l"], - ["e", "o", " ", "T", "h", "i", "s", "a", "t", "g", "r", "n"], + [ + "e", + "o", + " ", + "T", + "h", + "i", + "s", + "a", + "t", + "g", + "r", + "n", + ], ["u", "k", "w", "m", "y", "9"], ], "vocab_count": [ @@ -1037,7 +1083,11 @@ def test_compiler_stats_diff(self, *mocks): }, {"m": 2, "9": 2, "u": 1, "k": 1, "w": 1, "y": 1}, ], - "words": [["Hello", "test"], ["grant"], ["unknown", "name", "9"]], + "words": [ + ["Hello", "test"], + ["grant"], + ["unknown", "name", "9"], + ], "word_count": [ {"Hello": 2, "test": 1}, {"grant": "unchanged"}, @@ -1097,7 +1147,20 @@ def test_compiler_stats_diff(self, *mocks): "statistics": { "vocab": [ ["H", "l"], - ["e", "o", " ", "T", "h", "i", "s", "a", "t", "g", "r", "n"], + [ + "e", + "o", + " ", + "T", + "h", + "i", + "s", + "a", + "t", + "g", + "r", + "n", + ], ["u", "k", "w", "m", "y", "9"], ], "vocab_count": [ @@ -1118,7 +1181,11 @@ def test_compiler_stats_diff(self, *mocks): }, {"m": 2, "9": 2, "u": 1, "k": 1, "w": 1, "y": 1}, ], - "words": [["Hello", "test"], ["grant"], ["unknown", "name", "9"]], + "words": [ + ["Hello", "test"], + ["grant"], + ["unknown", "name", "9"], + ], "word_count": [ {"Hello": 2, "test": 1}, {"grant": "unchanged"}, diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index d79fdd641..b11ff7f32 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -626,7 +626,8 @@ def test_null_values_for_histogram(self): } self.assertEqual( - expected_histogram["bin_counts"].tolist(), histogram["bin_counts"].tolist() + expected_histogram["bin_counts"].tolist(), + histogram["bin_counts"].tolist(), ) self.assertCountEqual(expected_histogram["bin_edges"], histogram["bin_edges"]) @@ -759,7 +760,8 @@ def test_profile_histogram_w_updates(self): self.assertIsNotNone(merged_profiler.histogram_selection) histogram = profile["histogram"] self.assertEqual( - expected_histogram["bin_counts"].tolist(), histogram["bin_counts"].tolist() + expected_histogram["bin_counts"].tolist(), + histogram["bin_counts"].tolist(), ) self.assertCountEqual( np.round(expected_histogram["bin_edges"], 12), @@ -833,7 +835,7 @@ def test_total_histogram_bin_variance(self): def test_histogram_loss(self): # run time is small - diff_var, avg_diffvar, total_var, avg_totalvar, run_time, avg_runtime = ( + (diff_var, avg_diffvar, total_var, avg_totalvar, run_time, avg_runtime,) = ( 0.3, 0.2, 0.1, @@ -843,12 +845,17 @@ def test_histogram_loss(self): ) expected_loss = 0.1 / 0.2 + 0.05 / 0.05 est_loss = FloatColumn._histogram_loss( - diff_var, avg_diffvar, total_var, avg_totalvar, run_time, avg_runtime + diff_var, + avg_diffvar, + total_var, + avg_totalvar, + run_time, + avg_runtime, ) self.assertEqual(expected_loss, est_loss) # run time is big - diff_var, avg_diffvar, total_var, avg_totalvar, run_time, avg_runtime = ( + (diff_var, avg_diffvar, total_var, avg_totalvar, run_time, avg_runtime,) = ( 0.3, 0.2, 0.1, @@ -858,7 +865,12 @@ def test_histogram_loss(self): ) expected_loss = 0.1 / 0.2 + 0.05 / 0.05 + 8 / 14 est_loss = FloatColumn._histogram_loss( - diff_var, avg_diffvar, total_var, avg_totalvar, run_time, avg_runtime + diff_var, + avg_diffvar, + total_var, + avg_totalvar, + run_time, + avg_runtime, ) self.assertEqual(expected_loss, est_loss) @@ -866,7 +878,15 @@ def test_select_method_for_histogram(self): data = pd.Series([], dtype=object) profiler = FloatColumn(data.name) profiler.update(data) - list_method = ["auto", "fd", "doane", "scott", "rice", "sturges", "sqrt"] + list_method = [ + "auto", + "fd", + "doane", + "scott", + "rice", + "sturges", + "sqrt", + ] current_exact_var = 0 # sqrt has the least current loss current_est_var = np.array([0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.005]) @@ -877,7 +897,10 @@ def test_select_method_for_histogram(self): for i, method in enumerate(list_method): profiler.histogram_methods[method]["total_loss"] = list_total_loss[i] selected_method = profiler._select_method_for_histogram( - current_exact_var, current_est_var, current_total_var, current_run_time + current_exact_var, + current_est_var, + current_total_var, + current_run_time, ) self.assertEqual(selected_method, "sqrt") @@ -894,7 +917,10 @@ def test_select_method_for_histogram(self): for i, method in enumerate(list_method): profiler.histogram_methods[method]["total_loss"] = list_total_loss[i] selected_method = profiler._select_method_for_histogram( - current_exact_var, current_est_var, current_total_var, current_run_time + current_exact_var, + current_est_var, + current_total_var, + current_run_time, ) self.assertEqual(selected_method, "sturges") @@ -921,7 +947,12 @@ def test_merge_histogram(self): profiler._merge_histogram(input_array) merged_hist = profiler._histogram_for_profile("sqrt")[0] - expected_bin_counts, expected_bin_edges = [5, 2, 2], [0.5, 2.0, 3.5, 5.0] + expected_bin_counts, expected_bin_edges = [5, 2, 2], [ + 0.5, + 2.0, + 3.5, + 5.0, + ] self.assertEqual(expected_bin_counts, merged_hist["bin_counts"].tolist()) self.assertCountEqual(expected_bin_edges, merged_hist["bin_edges"]) @@ -1326,7 +1357,8 @@ def test_profile_merge(self): self.assertEqual(profiler3.min, expected_profile.pop("min")) self.assertEqual(profiler3.max, expected_profile.pop("max")) self.assertEqual( - histogram["bin_counts"].tolist(), expected_histogram["bin_counts"].tolist() + histogram["bin_counts"].tolist(), + expected_histogram["bin_counts"].tolist(), ) self.assertCountEqual(histogram["bin_edges"], expected_histogram["bin_edges"]) @@ -1539,7 +1571,10 @@ def test_histogram_option_integration(self): self.assertIsNone(num_profiler.histogram_selection) self.assertEqual(["sturges"], num_profiler.histogram_bin_method_names) - options.histogram_and_quantiles.bin_count_or_method = ["sturges", "doane"] + options.histogram_and_quantiles.bin_count_or_method = [ + "sturges", + "doane", + ] num_profiler = FloatColumn(name="test2", options=options) self.assertIsNone(num_profiler.histogram_selection) self.assertEqual(["sturges", "doane"], num_profiler.histogram_bin_method_names) @@ -1553,7 +1588,8 @@ def test_histogram_option_integration(self): # case when just 1 unique value, should just set bin size to be 1 num_profiler.update(pd.Series(["1", "1"])) self.assertEqual( - 1, len(num_profiler.histogram_methods["custom"]["histogram"]["bin_counts"]) + 1, + len(num_profiler.histogram_methods["custom"]["histogram"]["bin_counts"]), ) # case when more than 1 unique value, by virtue of a streaming update @@ -1703,7 +1739,10 @@ def test_diff(self): }, "t-test": { "t-statistic": 0.5393164101529813, - "conservative": {"deg_of_free": 2.0, "p-value": 0.643676756587475}, + "conservative": { + "deg_of_free": 2.0, + "p-value": 0.643676756587475, + }, "welch": { "deg_of_free": 4.999127432888682, "p-value": 0.6128117908944144, @@ -1733,6 +1772,11 @@ def test_diff(self): profile_diff.pop("median_absolute_deviation"), places=2, ) + self.assertAlmostEqual( + expected_diff.get("t-test").get("welch").pop("p-value"), + profile_diff.get("t-test").get("welch").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile_diff) # Assert type error is properly called diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index 961b33c8c..b53c38398 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -1092,6 +1092,11 @@ def test_diff(self): profile_diff.pop("median_absolute_deviation"), places=2, ) + self.assertAlmostEqual( + expected_diff.get("t-test").get("welch").pop("p-value"), + profile_diff.get("t-test").get("welch").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile_diff) # Assert type error is properly called diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index c4e604737..e4372f8f6 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -2156,9 +2156,14 @@ def test_diff_categorical_chi2_test(self, *mocks): "deg_of_free": 2, "p-value": 0.3099238764710244, } - self.assertDictEqual( - expected_chi2_test_dict, diff["data_stats"][0]["statistics"]["chi2-test"] + + chi2_diff = diff["data_stats"][0]["statistics"]["chi2-test"] + self.assertAlmostEqual( + expected_chi2_test_dict.pop("p-value"), + chi2_diff.pop("p-value"), + places=10, ) + self.assertDictEqual(expected_chi2_test_dict, chi2_diff) @mock.patch( "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update" diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py index 12fb1d27b..ea7694829 100644 --- a/dataprofiler/tests/profilers/test_text_column_profile.py +++ b/dataprofiler/tests/profilers/test_text_column_profile.py @@ -41,7 +41,20 @@ def test_profiled_vocab(self): ] ).apply(str) df2 = pd.Series( - ["1", "1", "ee", "ff", "ff", "gg", "gg", "abcd", "aa", "b", "ee", "b"] + [ + "1", + "1", + "ee", + "ff", + "ff", + "gg", + "gg", + "abcd", + "aa", + "b", + "ee", + "b", + ] ).apply(str) df3 = pd.Series( [ @@ -112,7 +125,20 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b): ] ).apply(str) df2 = pd.Series( - ["1", "1", "ee", "ff", "ff", "gg", "gg", "abcd", "aa", "b", "ee", "b"] + [ + "1", + "1", + "ee", + "ff", + "ff", + "gg", + "gg", + "abcd", + "aa", + "b", + "ee", + "b", + ] ).apply(str) df3 = pd.Series( [ @@ -451,7 +477,18 @@ def test_profile_merge_with_different_options(self): options.histogram_and_quantiles.bin_count_or_method = None df = pd.Series( - ["pancake", "banana", "lighthouse", "aa", "b", "4", "3", "2", "dfd", "2"] + [ + "pancake", + "banana", + "lighthouse", + "aa", + "b", + "4", + "3", + "2", + "dfd", + "2", + ] ) profiler1 = TextColumn("Text", options=options) @@ -489,7 +526,8 @@ def test_profile_merge_with_different_options(self): def test_text_column_with_wrong_options(self): with self.assertRaisesRegex( - ValueError, "TextColumn parameter 'options' must be of" " type TextOptions." + ValueError, + "TextColumn parameter 'options' must be of" " type TextOptions.", ): profiler = TextColumn("Text", options="wrong_data_type") @@ -531,7 +569,10 @@ def test_histogram_option_integration(self): self.assertIsNone(num_profiler.histogram_selection) self.assertEqual(["sturges"], num_profiler.histogram_bin_method_names) - options.histogram_and_quantiles.bin_count_or_method = ["sturges", "doane"] + options.histogram_and_quantiles.bin_count_or_method = [ + "sturges", + "doane", + ] num_profiler = TextColumn(name="test2", options=options) self.assertIsNone(num_profiler.histogram_selection) self.assertEqual(["sturges", "doane"], num_profiler.histogram_bin_method_names) @@ -545,7 +586,8 @@ def test_histogram_option_integration(self): # case when just 1 unique value, should just set bin size to be 1 num_profiler.update(pd.Series(["1", "1"])) self.assertEqual( - 1, len(num_profiler.histogram_methods["custom"]["histogram"]["bin_counts"]) + 1, + len(num_profiler.histogram_methods["custom"]["histogram"]["bin_counts"]), ) # case when more than 1 unique value, by virtue of a streaming update @@ -589,7 +631,10 @@ def test_diff(self): ), "t-test": { "t-statistic": -1.9339958714826413, - "conservative": {"deg_of_free": 8.0, "p-value": 0.08916903961929257}, + "conservative": { + "deg_of_free": 8.0, + "p-value": 0.08916903961929257, + }, "welch": { "deg_of_free": 15.761400272034564, "p-value": 0.07127621949432528, @@ -612,6 +657,11 @@ def test_diff(self): profile_diff.pop("median_absolute_deviation"), places=2, ) + self.assertAlmostEqual( + expected_diff.get("t-test").get("welch").pop("p-value"), + profile_diff.get("t-test").get("welch").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile_diff) @mock.patch("time.time", return_value=0.0) @@ -676,7 +726,10 @@ def test_json_encode_after_update(self, time): "total_loss": 0.0, "current_loss": 0.0, "suggested_bin_count": 5, - "histogram": {"bin_counts": None, "bin_edges": None}, + "histogram": { + "bin_counts": None, + "bin_edges": None, + }, } }, "_stored_histogram": { @@ -685,7 +738,14 @@ def test_json_encode_after_update(self, time): "suggested_bin_count": 1000, "histogram": { "bin_counts": [6, 4, 0, 0, 1], - "bin_edges": [1.0, 3.2, 5.4, 7.6000000000000005, 9.8, 12.0], + "bin_edges": [ + 1.0, + 3.2, + 5.4, + 7.6000000000000005, + 9.8, + 12.0, + ], }, }, "_batch_history": [ From 1236c7b4d6e3f43f61383a1f05b6ec6fe4c5a9c8 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Thu, 12 Feb 2026 15:43:36 -0600 Subject: [PATCH 12/15] fix: almost equal --- .../tests/profilers/test_column_profile_compilers.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py index e6a5d3f8d..7cb6b57fe 100644 --- a/dataprofiler/tests/profilers/test_column_profile_compilers.py +++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py @@ -378,6 +378,17 @@ def test_disabling_columns_during_primitive_diff(self): profile_diff.get("statistics").get("t-test").get("welch").pop("p-value"), places=10, ) + self.assertAlmostEqual( + expected_diff.get("statistics") + .get("t-test") + .get("conservative") + .pop("p-value"), + profile_diff.get("statistics") + .get("t-test") + .get("conservative") + .pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile_diff) # Test disabling all columns in one compiler From f387e1fe71cfa6da0a1ad1453b70f1de88496b5e Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 24 Feb 2026 22:40:58 -0600 Subject: [PATCH 13/15] feat: refactor to pass in a path or string or None --- dataprofiler/labelers/base_data_labeler.py | 2 +- dataprofiler/labelers/data_labelers.py | 2 +- dataprofiler/labelers/data_processing.py | 2 +- dataprofiler/labelers/utils.py | 17 ++++++++++------- .../tests/labelers/test_char_tf_load_model.py | 2 +- .../labelers/test_character_level_cnn_model.py | 2 +- .../tests/labelers/test_column_name_model.py | 3 +-- .../tests/labelers/test_data_labelers.py | 2 +- .../tests/labelers/test_data_processing.py | 2 +- ...test_integration_column_name_data_labeler.py | 2 +- .../test_integration_regex_data_labeler.py | 2 +- dataprofiler/tests/labelers/test_regex_model.py | 2 +- 12 files changed, 21 insertions(+), 19 deletions(-) diff --git a/dataprofiler/labelers/base_data_labeler.py b/dataprofiler/labelers/base_data_labeler.py index 871c050ef..f9a4a0ab1 100644 --- a/dataprofiler/labelers/base_data_labeler.py +++ b/dataprofiler/labelers/base_data_labeler.py @@ -17,7 +17,7 @@ from . import data_processing, utils from .base_model import BaseModel -default_labeler_dir = utils.find_resources_dir() / "labelers" +default_labeler_dir = utils.find_resources_dir("labelers") class BaseDataLabeler: diff --git a/dataprofiler/labelers/data_labelers.py b/dataprofiler/labelers/data_labelers.py index 5d69fb1a6..961b45e61 100644 --- a/dataprofiler/labelers/data_labelers.py +++ b/dataprofiler/labelers/data_labelers.py @@ -12,7 +12,7 @@ from .base_model import BaseModel from .data_processing import BaseDataPostprocessor, BaseDataPreprocessor -default_labeler_dir = utils.find_resources_dir() / "labelers" +default_labeler_dir = utils.find_resources_dir("labelers") def train_structured_labeler( diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index c4517a0e3..ba17a3bd3 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -19,7 +19,7 @@ from . import utils -default_labeler_dir = utils.find_resources_dir() / "labelers" +default_labeler_dir = utils.find_resources_dir("labelers") Processor = TypeVar("Processor", bound="BaseDataProcessor") diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py index 83831293a..ddc252872 100644 --- a/dataprofiler/labelers/utils.py +++ b/dataprofiler/labelers/utils.py @@ -3,6 +3,7 @@ import importlib.resources import sys import warnings +from importlib.resources.abc import Traversable from pathlib import Path from typing import Any, Callable, List @@ -55,11 +56,13 @@ def new_f(*args: Any, **kwds: Any) -> Any: return check_module -def find_resources_dir() -> Path: - """Return the path to the package resources for the labeler.""" - with importlib.resources.as_file( - importlib.resources.files("dataprofiler").joinpath("resources") - ) as base: - return Path(base) +def find_resources_dir(resource_path: str | Path | None = None) -> Traversable: + """Return the path to the package resources.""" + resource = importlib.resources.files("dataprofiler") / "resources" + if resource_path: + resource /= resource_path - raise FileNotFoundError("Could not locate resources (installed or source tree).") + if not (resource.is_file() or resource.is_dir()): + raise FileNotFoundError(f"Resource not found: {resource_path}") + + return resource diff --git a/dataprofiler/tests/labelers/test_char_tf_load_model.py b/dataprofiler/tests/labelers/test_char_tf_load_model.py index 61173e571..40879e579 100644 --- a/dataprofiler/tests/labelers/test_char_tf_load_model.py +++ b/dataprofiler/tests/labelers/test_char_tf_load_model.py @@ -12,7 +12,7 @@ from dataprofiler.labelers.char_load_tf_model import CharLoadTFModel _file_dir = os.path.dirname(os.path.abspath(__file__)) -default_labeler_dir = labeler_utils.find_resources_dir() / "labelers" +default_labeler_dir = labeler_utils.find_resources_dir("labelers") mock_model_parameters = { "model_path": "project/example/path/fake_model.h5", diff --git a/dataprofiler/tests/labelers/test_character_level_cnn_model.py b/dataprofiler/tests/labelers/test_character_level_cnn_model.py index 530dda2ac..cbc35b131 100644 --- a/dataprofiler/tests/labelers/test_character_level_cnn_model.py +++ b/dataprofiler/tests/labelers/test_character_level_cnn_model.py @@ -15,7 +15,7 @@ ) _file_dir = os.path.dirname(os.path.abspath(__file__)) -_resource_labeler_dir = labeler_utils.find_resources_dir() / "labelers" +_resource_labeler_dir = labeler_utils.find_resources_dir("labelers") mock_model_parameters = { diff --git a/dataprofiler/tests/labelers/test_column_name_model.py b/dataprofiler/tests/labelers/test_column_name_model.py index e3326a394..dfd4274ee 100644 --- a/dataprofiler/tests/labelers/test_column_name_model.py +++ b/dataprofiler/tests/labelers/test_column_name_model.py @@ -11,8 +11,7 @@ from dataprofiler.labelers import utils as labeler_utils from dataprofiler.labelers.column_name_model import ColumnNameModel -_file_dir = os.path.dirname(os.path.abspath(__file__)) -_resource_labeler_dir = labeler_utils.find_resources_dir() / "labelers" +_resource_labeler_dir = labeler_utils.find_resources_dir("labelers") mock_model_parameters = { diff --git a/dataprofiler/tests/labelers/test_data_labelers.py b/dataprofiler/tests/labelers/test_data_labelers.py index 2862c2f03..b0cd4c7ee 100644 --- a/dataprofiler/tests/labelers/test_data_labelers.py +++ b/dataprofiler/tests/labelers/test_data_labelers.py @@ -152,7 +152,7 @@ def test_load_from_disk(self, *mocks): from dataprofiler.labelers import utils as labeler_utils default_labeler_dir = str( - labeler_utils.find_resources_dir() / "labelers/structured_model" + labeler_utils.find_resources_dir("labelers/structured_model") ) data_labeler = dp.DataLabeler.load_from_disk(default_labeler_dir) diff --git a/dataprofiler/tests/labelers/test_data_processing.py b/dataprofiler/tests/labelers/test_data_processing.py index 9d386d8d9..5ec15594f 100644 --- a/dataprofiler/tests/labelers/test_data_processing.py +++ b/dataprofiler/tests/labelers/test_data_processing.py @@ -228,7 +228,7 @@ def test_load_from_library(self, mocked_load, *mocks): # assert called with proper load_processor dirpath - default_labeler_dir = labeler_utils.find_resources_dir() / "labelers" + default_labeler_dir = labeler_utils.find_resources_dir("labelers") mocked_load.assert_called_with(os.path.join(default_labeler_dir, "default")) diff --git a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py index 5d2307458..8b19731f1 100644 --- a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py @@ -11,7 +11,7 @@ DirectPassPreprocessor, ) -default_labeler_dir = labeler_utils.find_resources_dir() / "labelers" +default_labeler_dir = labeler_utils.find_resources_dir("labelers") class TestColumnNameDataLabeler(unittest.TestCase): diff --git a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py index 7c729ccaf..df72b99e9 100644 --- a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py @@ -6,7 +6,7 @@ from dataprofiler.labelers import utils as labeler_utils from dataprofiler.labelers.data_labelers import BaseDataLabeler -default_labeler_dir = labeler_utils.find_resources_dir() / "labelers" +default_labeler_dir = labeler_utils.find_resources_dir("labelers") class TestRegexDataLabeler(unittest.TestCase): diff --git a/dataprofiler/tests/labelers/test_regex_model.py b/dataprofiler/tests/labelers/test_regex_model.py index 91a2dfff3..66ac64482 100644 --- a/dataprofiler/tests/labelers/test_regex_model.py +++ b/dataprofiler/tests/labelers/test_regex_model.py @@ -10,7 +10,7 @@ from dataprofiler.labelers.regex_model import RegexModel _file_dir = os.path.dirname(os.path.abspath(__file__)) -_resource_labeler_dir = labeler_utils.find_resources_dir() / "labelers" +_resource_labeler_dir = labeler_utils.find_resources_dir("labelers") mock_model_parameters = { From 03855a7f620c2b01b6b289117a0f81511889a181 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 24 Feb 2026 22:49:41 -0600 Subject: [PATCH 14/15] fix: import for older versions --- dataprofiler/labelers/utils.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py index ddc252872..60f7eb32d 100644 --- a/dataprofiler/labelers/utils.py +++ b/dataprofiler/labelers/utils.py @@ -3,9 +3,16 @@ import importlib.resources import sys import warnings -from importlib.resources.abc import Traversable from pathlib import Path -from typing import Any, Callable, List +from typing import TYPE_CHECKING, Any, Callable, List + +if TYPE_CHECKING: + try: + # Newer Pythons / newer typeshed + from importlib.resources.abc import Traversable + except ModuleNotFoundError: + # Older Pythons + from importlib.abc import Traversable def warn_missing_module(labeler_function: str, module_name: str) -> None: From af1303510692d54eefc815a1f787cc01b8d8276a Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 24 Feb 2026 22:51:05 -0600 Subject: [PATCH 15/15] fix: Tranversable must be done at runtime --- dataprofiler/labelers/utils.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py index 60f7eb32d..995538696 100644 --- a/dataprofiler/labelers/utils.py +++ b/dataprofiler/labelers/utils.py @@ -4,15 +4,14 @@ import sys import warnings from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, List - -if TYPE_CHECKING: - try: - # Newer Pythons / newer typeshed - from importlib.resources.abc import Traversable - except ModuleNotFoundError: - # Older Pythons - from importlib.abc import Traversable +from typing import Any, Callable, List + +try: + # Newer Pythons / newer typeshed + from importlib.resources.abc import Traversable +except ModuleNotFoundError: + # Older Pythons + from importlib.abc import Traversable def warn_missing_module(labeler_function: str, module_name: str) -> None: