Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .github/workflows/UTESTS.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11"]
python-version: ["3.10", "3.11", "3.12"]
test-group: ["unit", "integration", "layers"]
include:
- python-version: "3.11"
Expand All @@ -87,7 +87,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: Cache Poetry dependencies
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: |
~/.cache/pypoetry
Expand Down Expand Up @@ -124,7 +124,7 @@ jobs:
timeout-minutes: 8

- name: Upload test results
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
if: always()
with:
name: test-results-${{ matrix.python-version }}-${{ matrix.test-group }}
Expand Down Expand Up @@ -157,15 +157,15 @@ jobs:
timeout-minutes: 10

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4
with:
file: ./coverage.xml
flags: unittests
name: codecov-umbrella
fail_ci_if_error: false

- name: Upload coverage report
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: coverage-report
path: htmlcov/
Expand Down Expand Up @@ -194,7 +194,7 @@ jobs:
timeout-minutes: 10

- name: Upload benchmark results
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
if: always()
with:
name: benchmark-results
Expand Down
57 changes: 57 additions & 0 deletions kdp/layers/preserve_dtype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import tensorflow as tf
from tensorflow import keras


@tf.keras.utils.register_keras_serializable(package="kdp.layers")
class PreserveDtypeLayer(keras.layers.Layer):
    """Keras layer that returns its input unchanged or casts it on request.

    Intended for passthrough features: when ``target_dtype`` is ``None`` the
    input tensor is returned as-is, so its original dtype is kept; otherwise
    the input is cast to ``target_dtype`` with ``tf.cast``.
    """

    def __init__(self, target_dtype=None, **kwargs):
        """Initialize the layer.

        Args:
            target_dtype: Optional dtype to cast inputs to. If None, inputs
                are returned without any casting.
            **kwargs: Additional keyword arguments forwarded to the base
                ``Layer`` constructor.
        """
        super().__init__(**kwargs)
        self.target_dtype = target_dtype

    def call(self, inputs, **kwargs):
        """Return the input, cast to ``target_dtype`` when one was configured.

        Args:
            inputs: Input tensor of any dtype.
            **kwargs: Additional keyword arguments (unused).

        Returns:
            ``inputs`` itself when ``target_dtype`` is None, otherwise the
            result of ``tf.cast(inputs, target_dtype)``.
        """
        if self.target_dtype is None:
            return inputs
        return tf.cast(inputs, self.target_dtype)

    def get_config(self):
        """Return the config dictionary for serialization.

        ``target_dtype`` is stored by its canonical name (e.g. ``"float32"``)
        rather than as the object passed to ``__init__``, so the config holds
        only plain Python values even when a ``tf.DType`` or NumPy dtype was
        supplied. The name round-trips through ``from_config`` because
        ``tf.cast`` accepts dtype names.

        Returns:
            A dictionary with the layer configuration.
        """
        config = super().get_config()
        if self.target_dtype is None:
            dtype_name = None
        else:
            dtype_name = tf.as_dtype(self.target_dtype).name
        config.update({"target_dtype": dtype_name})
        return config

    @classmethod
    def from_config(cls, config):
        """Create a new instance from a serialized configuration.

        Args:
            config: Layer configuration dictionary, as produced by
                ``get_config``.

        Returns:
            A new instance of the layer.
        """
        return cls(**config)
22 changes: 22 additions & 0 deletions kdp/layers_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from kdp.layers.text_preprocessing_layer import TextPreprocessingLayer
from kdp.layers.cast_to_float import CastToFloat32Layer
from kdp.layers.preserve_dtype import PreserveDtypeLayer
from kdp.layers.date_parsing_layer import DateParsingLayer
from kdp.layers.date_encoding_layer import DateEncodingLayer
from kdp.layers.season_layer import SeasonLayer
Expand Down Expand Up @@ -183,6 +184,27 @@ def cast_to_float32_layer(
**kwargs,
)

@staticmethod
def preserve_dtype_layer(
    name: str = "preserve_dtype", target_dtype=None, **kwargs: dict
) -> tf.keras.layers.Layer:
    """Build a PreserveDtypeLayer through the factory's ``create_layer``.

    Args:
        name: Name assigned to the layer.
        target_dtype: Optional dtype the layer should cast to; None leaves
            the input dtype untouched.
        **kwargs: Extra keyword arguments handed on to the layer constructor.

    Returns:
        The PreserveDtypeLayer instance produced by ``create_layer``.
    """
    # Gather every constructor option in one mapping before delegating.
    layer_options = {"name": name, "target_dtype": target_dtype, **kwargs}
    return PreprocessorLayerFactory.create_layer(
        layer_class=PreserveDtypeLayer,
        **layer_options,
    )

@staticmethod
def date_parsing_layer(
name: str = "date_parsing_layer", **kwargs: dict
Expand Down
17 changes: 14 additions & 3 deletions kdp/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1126,6 +1126,15 @@ def _add_categorical_lookup(
if feature.category_encoding == CategoryEncodingOptions.HASHING:
return

# Handle empty vocabulary by providing a fallback
if not vocab:
logger.warning(
f"Empty vocabulary for categorical feature '{feature_name}'. "
"Using fallback vocabulary with placeholder values."
)
# Provide a minimal vocabulary with unknown/placeholder values
vocab = ["<UNK>"]

# Default behavior if no specific preprocessing is defined
if feature.feature_type == FeatureType.STRING_CATEGORICAL:
preprocessor.add_processing_step(
Expand Down Expand Up @@ -1414,10 +1423,12 @@ def _add_pipeline_passthrough(self, feature_name: str, input_layer) -> None:
feature_name=feature_name,
)
else:
# For passthrough features, we only ensure type consistency by casting to float32
# For passthrough features, preserve the original dtype or cast to specified dtype
target_dtype = getattr(_feature, "dtype", None)
preprocessor.add_processing_step(
layer_creator=PreprocessorLayerFactory.cast_to_float32_layer,
name=f"cast_to_float_{feature_name}",
layer_creator=PreprocessorLayerFactory.preserve_dtype_layer,
name=f"preserve_dtype_{feature_name}",
target_dtype=target_dtype,
)

# Optionally reshape if needed
Expand Down
Loading
Loading