From 7e1c46a7f5e7c0d1aba14a9a9653f93831088733 Mon Sep 17 00:00:00 2001 From: Yongda Fan Date: Sat, 7 Feb 2026 01:13:41 -0500 Subject: [PATCH 1/6] add some API for the processors --- pyhealth/processors/base_processor.py | 47 +++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/pyhealth/processors/base_processor.py b/pyhealth/processors/base_processor.py index fb489fb85..16768f079 100644 --- a/pyhealth/processors/base_processor.py +++ b/pyhealth/processors/base_processor.py @@ -52,6 +52,53 @@ def process(self, value: Any) -> Any: Processed value. """ pass + + def is_continuous(self) -> bool: + """Returns whether the output (in particular, the value tensor) of the processor is + continuous (float) or discrete (int). + + Returns: + True if the output is continuous, False if it is discrete. + """ + raise NotImplementedError("is_continuous method is not implemented for this processor.") + + def schema(self) -> tuple[str, ...]: + """Returns the schema of the processed feature. For a processor that emits a single tensor, + this should just return `["value"]`. For a processor that emits a tuple of tensors, + this should return a tuple of the same length as the tuple, with the semantic name of each tensor, + such as `["time", "value"]`, `["value", "mask"]`, etc. + + Returns: + Tuple of semantic names corresponding to the output of the processor. + """ + raise NotImplementedError("Schema method is not implemented for this processor.") + + def dim(self) -> tuple[int, ...]: + """Number of dimensions (`Tensor.dim()`) for each output + tensor, in the same order as the output tuple. + + Returns: + Tuple of integers corresponding to the number of dimensions of each output tensor. + """ + raise NotImplementedError("dim method is not implemented for this processor.") + + def spatial(self, i: int) -> tuple[bool, ...]: + """Whether each dimension (axis) of the i-th output tensor is spatial (i.e. corresponds to a spatial + axis like time, height, width, etc.) 
or not. This is used to determine how to apply + augmentations and other transformations that should only be applied to spatial dimensions. + + E.g. for CNN or RNN features, this would help determine which dimensions to apply spatial augmentations to, + and which dimensions to treat as channels or features. + + Args: + i: Index of the output tensor to check. + + Returns: + Tuple of booleans corresponding to whether each axis of the i-th output tensor is spatial or not. + """ + raise NotImplementedError("spatial method is not implemented for this processor.") + + class SampleProcessor(Processor): From aa292d26263d3585395fdd2d6f7d8364a3c76dff Mon Sep 17 00:00:00 2001 From: Yongda Fan Date: Sat, 7 Feb 2026 01:28:27 -0500 Subject: [PATCH 2/6] Implement for some of the processors --- pyhealth/processors/audio_processor.py | 55 ++++++++++++++++ pyhealth/processors/image_processor.py | 41 ++++++++++++ pyhealth/processors/label_processor.py | 72 +++++++++++++++++++++ pyhealth/processors/timeseries_processor.py | 19 ++++++ 4 files changed, 187 insertions(+) diff --git a/pyhealth/processors/audio_processor.py b/pyhealth/processors/audio_processor.py index 17b4bf236..5e4ab433a 100644 --- a/pyhealth/processors/audio_processor.py +++ b/pyhealth/processors/audio_processor.py @@ -134,6 +134,61 @@ def process(self, value: Union[str, Path]) -> Any: return waveform + def is_continuous(self) -> bool: + """Audio data is continuous (float-valued). + + Returns: + True, since audio waveforms and spectrograms are continuous signals. + """ + return True + + def schema(self) -> tuple[str, ...]: + """Returns the schema of the processed audio feature. + + The audio processor emits a single tensor (waveform or mel spectrogram). + + Returns: + ("value",) + """ + return ("value",) + + def dim(self) -> tuple[int, ...]: + """Number of dimensions for the output tensor. + + Returns: + (2,) for waveform output (channels, samples), or + (3,) for mel spectrogram output (channels, n_mels, time). 
+ """ + if self.n_mels is not None: + return (3,) + return (2,) + + def spatial(self, i: int) -> tuple[bool, ...]: + """Whether each dimension of the output tensor is spatial. + + For waveform (channels, samples): channels is not spatial, samples is. + For mel spectrogram (channels, n_mels, time): channels is not spatial, + n_mels and time are. + + Args: + i: Index of the output tensor (must be 0). + + Returns: + Tuple of booleans for each axis. + + Raises: + IndexError: If i != 0 (only one output tensor). + """ + if i != 0: + raise IndexError( + f"AudioProcessor has 1 output tensor, but index {i} was requested." + ) + if self.n_mels is not None: + # (channels, n_mels, time) + return (False, True, True) + # (channels, samples) + return (False, True) + def __repr__(self) -> str: return ( f"AudioProcessor(sample_rate={self.sample_rate}, " diff --git a/pyhealth/processors/image_processor.py b/pyhealth/processors/image_processor.py index d174529a7..04ac53c96 100644 --- a/pyhealth/processors/image_processor.py +++ b/pyhealth/processors/image_processor.py @@ -95,6 +95,47 @@ def process(self, value: Union[str, Path]) -> Any: img.load() # Avoid "too many open files" errors return self.transform(img) + def is_continuous(self) -> bool: + """Image data is continuous (float-valued pixel intensities). + + Returns: + True. + """ + return True + + def schema(self) -> tuple[str, ...]: + """Single tensor output. + + Returns: + ("value",) + """ + return ("value",) + + def dim(self) -> tuple[int, ...]: + """Output tensor has 3 dimensions: (C, H, W). + + Returns: + (3,) + """ + return (3,) + + def spatial(self, i: int) -> tuple[bool, ...]: + """Spatial axes for the output tensor (C, H, W). + + Channels are not spatial; height and width are. + + Args: + i: Index of the output tensor (must be 0). + + Returns: + (False, True, True) + """ + if i != 0: + raise IndexError( + f"ImageProcessor has 1 output tensor, but index {i} was requested." 
+ ) + return (False, True, True) + def __repr__(self) -> str: return ( f"ImageLoadingProcessor(image_size={self.image_size}, " diff --git a/pyhealth/processors/label_processor.py b/pyhealth/processors/label_processor.py index ae8d1f8aa..a394027ba 100644 --- a/pyhealth/processors/label_processor.py +++ b/pyhealth/processors/label_processor.py @@ -40,6 +40,24 @@ def process(self, value: Any) -> torch.Tensor: def size(self): return 1 + def is_continuous(self) -> bool: + """Binary labels are discrete (0 or 1).""" + return False + + def schema(self) -> tuple[str, ...]: + return ("value",) + + def dim(self) -> tuple[int, ...]: + """Output shape is (1,), so 1 dimension.""" + return (1,) + + def spatial(self, i: int) -> tuple[bool, ...]: + if i != 0: + raise IndexError( + f"BinaryLabelProcessor has 1 output tensor, but index {i} was requested." + ) + return (False,) + def __repr__(self): return f"BinaryLabelProcessor(label_vocab_size={len(self.label_vocab)})" @@ -72,6 +90,24 @@ def process(self, value: Any) -> torch.Tensor: def size(self): return len(self.label_vocab) + def is_continuous(self) -> bool: + """Multi-class labels are discrete.""" + return False + + def schema(self) -> tuple[str, ...]: + return ("value",) + + def dim(self) -> tuple[int, ...]: + """Output is a scalar tensor (dim 0).""" + return (0,) + + def spatial(self, i: int) -> tuple[bool, ...]: + if i != 0: + raise IndexError( + f"MultiClassLabelProcessor has 1 output tensor, but index {i} was requested." 
+ ) + return () + def __repr__(self): return f"MultiClassLabelProcessor(label_vocab_size={len(self.label_vocab)})" @@ -115,6 +151,24 @@ def process(self, value: Any) -> torch.Tensor: def size(self): return len(self.label_vocab) + def is_continuous(self) -> bool: + """Multi-label indicators are discrete (binary 0/1).""" + return False + + def schema(self) -> tuple[str, ...]: + return ("value",) + + def dim(self) -> tuple[int, ...]: + """Output shape is (num_classes,), so 1 dimension.""" + return (1,) + + def spatial(self, i: int) -> tuple[bool, ...]: + if i != 0: + raise IndexError( + f"MultiLabelProcessor has 1 output tensor, but index {i} was requested." + ) + return (False,) + def __repr__(self): return f"MultiLabelProcessor(label_vocab_size={len(self.label_vocab)})" @@ -131,5 +185,23 @@ def process(self, value: Any) -> torch.Tensor: def size(self): return 1 + def is_continuous(self) -> bool: + """Regression labels are continuous.""" + return True + + def schema(self) -> tuple[str, ...]: + return ("value",) + + def dim(self) -> tuple[int, ...]: + """Output shape is (1,), so 1 dimension.""" + return (1,) + + def spatial(self, i: int) -> tuple[bool, ...]: + if i != 0: + raise IndexError( + f"RegressionLabelProcessor has 1 output tensor, but index {i} was requested." 
+ ) + return (False,) + def __repr__(self): return "RegressionLabelProcessor()" diff --git a/pyhealth/processors/timeseries_processor.py b/pyhealth/processors/timeseries_processor.py index f5abcfae7..7bc51835c 100644 --- a/pyhealth/processors/timeseries_processor.py +++ b/pyhealth/processors/timeseries_processor.py @@ -102,6 +102,25 @@ def size(self): # Size equals number of features, unknown until first process return self.n_features + def is_continuous(self) -> bool: + """Time series values are continuous.""" + return True + + def schema(self) -> tuple[str, ...]: + return ("value",) + + def dim(self) -> tuple[int, ...]: + """Output is a 2D tensor (time_steps, features).""" + return (2,) + + def spatial(self, i: int) -> tuple[bool, ...]: + if i != 0: + raise IndexError( + f"TimeseriesProcessor has 1 output tensor, but index {i} was requested." + ) + # Time dimension is spatial; feature dimension is not + return (True, False) + def __repr__(self): return ( f"TimeSeriesProcessor(sampling_rate={self.sampling_rate}, " From 4861a6ba082ed28f8e57de57ca97e47923f4c7ad Mon Sep 17 00:00:00 2001 From: Yongda Fan Date: Sat, 7 Feb 2026 01:45:06 -0500 Subject: [PATCH 3/6] More processors --- .../deep_nested_sequence_processor.py | 40 ++++++- .../processors/nested_sequence_processor.py | 38 +++++++ pyhealth/processors/sequence_processor.py | 18 ++++ pyhealth/processors/stagenet_processor.py | 101 ++++++++++++++++++ pyhealth/processors/tensor_processor.py | 79 +++++++++++++- 5 files changed, 273 insertions(+), 3 deletions(-) diff --git a/pyhealth/processors/deep_nested_sequence_processor.py b/pyhealth/processors/deep_nested_sequence_processor.py index 6cedd10f0..aa640253c 100644 --- a/pyhealth/processors/deep_nested_sequence_processor.py +++ b/pyhealth/processors/deep_nested_sequence_processor.py @@ -185,6 +185,25 @@ def __repr__(self): f"max_inner_len={self._max_inner_len})" ) + def is_continuous(self) -> bool: + """Deep nested sequence codes are discrete indices.""" + return 
False + + def schema(self) -> tuple[str, ...]: + return ("value",) + + def dim(self) -> tuple[int, ...]: + """Output is a 3D tensor (groups, visits, codes).""" + return (3,) + + def spatial(self, i: int) -> tuple[bool, ...]: + if i != 0: + raise IndexError( + f"DeepNestedSequenceProcessor has 1 output tensor, but index {i} was requested." + ) + # Groups are not sequential; visits are temporal/spatial; codes-per-visit is an unordered set + return (False, True, False) + @register_processor("deep_nested_sequence_floats") class DeepNestedFloatsProcessor(FeatureProcessor): @@ -379,4 +398,23 @@ def __repr__(self): f"max_middle_len={self._max_middle_len}, " f"max_inner_len={self._max_inner_len}, " f"forward_fill={self.forward_fill})" - ) \ No newline at end of file + ) + + def is_continuous(self) -> bool: + """Deep nested float values are continuous.""" + return True + + def schema(self) -> tuple[str, ...]: + return ("value",) + + def dim(self) -> tuple[int, ...]: + """Output is a 3D tensor (groups, visits, features).""" + return (3,) + + def spatial(self, i: int) -> tuple[bool, ...]: + if i != 0: + raise IndexError( + f"DeepNestedFloatsProcessor has 1 output tensor, but index {i} was requested." 
+ ) + # Groups are not sequential; visits are temporal/spatial; features dimension is not + return (False, True, False) \ No newline at end of file diff --git a/pyhealth/processors/nested_sequence_processor.py b/pyhealth/processors/nested_sequence_processor.py index 89da66e37..2dd67800d 100644 --- a/pyhealth/processors/nested_sequence_processor.py +++ b/pyhealth/processors/nested_sequence_processor.py @@ -162,6 +162,25 @@ def __repr__(self): f"padding={self._padding})" ) + def is_continuous(self) -> bool: + """Nested sequence codes are discrete indices.""" + return False + + def schema(self) -> tuple[str, ...]: + return ("value",) + + def dim(self) -> tuple[int, ...]: + """Output is a 2D tensor (visits, codes_per_visit).""" + return (2,) + + def spatial(self, i: int) -> tuple[bool, ...]: + if i != 0: + raise IndexError( + f"NestedSequenceProcessor has 1 output tensor, but index {i} was requested." + ) + # Visits (time) is spatial; codes-per-visit is an unordered set, not spatial + return (True, False) + @register_processor("nested_sequence_floats") class NestedFloatsProcessor(FeatureProcessor): @@ -341,3 +360,22 @@ def __repr__(self): f"forward_fill={self.forward_fill}, " f"padding={self._padding})" ) + + def is_continuous(self) -> bool: + """Nested float values are continuous.""" + return True + + def schema(self) -> tuple[str, ...]: + return ("value",) + + def dim(self) -> tuple[int, ...]: + """Output is a 2D tensor (visits, features).""" + return (2,) + + def spatial(self, i: int) -> tuple[bool, ...]: + if i != 0: + raise IndexError( + f"NestedFloatsProcessor has 1 output tensor, but index {i} was requested." 
+ ) + # Visits (time) is spatial; features dimension is not + return (True, False) diff --git a/pyhealth/processors/sequence_processor.py b/pyhealth/processors/sequence_processor.py index 01162548e..389b62ccf 100644 --- a/pyhealth/processors/sequence_processor.py +++ b/pyhealth/processors/sequence_processor.py @@ -71,5 +71,23 @@ def add(self, tokens: set[str]): def size(self): return len(self.code_vocab) + def is_continuous(self) -> bool: + """Sequence codes are discrete indices.""" + return False + + def schema(self) -> tuple[str, ...]: + return ("value",) + + def dim(self) -> tuple[int, ...]: + """Output is a 1D tensor of code indices.""" + return (1,) + + def spatial(self, i: int) -> tuple[bool, ...]: + if i != 0: + raise IndexError( + f"SequenceProcessor has 1 output tensor, but index {i} was requested." + ) + return (True,) + def __repr__(self): return f"SequenceProcessor(code_vocab_size={len(self.code_vocab)})" diff --git a/pyhealth/processors/stagenet_processor.py b/pyhealth/processors/stagenet_processor.py index 0441ba869..863c7bebd 100644 --- a/pyhealth/processors/stagenet_processor.py +++ b/pyhealth/processors/stagenet_processor.py @@ -191,6 +191,8 @@ def _encode_nested_codes(self, nested_codes: List[List[str]]) -> torch.Tensor: Pads all inner sequences to self._max_nested_len (global max). """ + assert self._max_nested_len is not None, "Max nested length must be set during fit()" + # Handle empty nested codes (no visits/events) # Return single padding token with shape (1, max_len) if len(nested_codes) == 0: @@ -219,6 +221,56 @@ def size(self) -> int: """Return vocabulary size.""" return len(self.code_vocab) + def is_continuous(self) -> bool: + """Code indices are discrete.""" + return False + + def schema(self) -> tuple[str, ...]: + """Output is a tuple of (time_tensor, value_tensor).""" + return ("time", "value") + + def dim(self) -> tuple[int, ...]: + """Number of dimensions for each output tensor. + + Time tensor is 1D. 
Value tensor is 1D (flat) or 2D (nested). + Must be called after fit(). + + Returns: + (1, 1) for flat codes or (1, 2) for nested codes. + """ + if self._is_nested is None: + raise NotImplementedError( + "StageNetProcessor.dim() requires fit() to be called first " + "to determine whether codes are flat or nested." + ) + if self._is_nested: + return (1, 2) + return (1, 1) + + def spatial(self, i: int) -> tuple[bool, ...]: + """Whether each dimension of the i-th output tensor is spatial. + + Args: + i: 0 for time tensor, 1 for value tensor. + """ + if i == 0: + # Time tensor: 1D, the time dimension is spatial + return (True,) + elif i == 1: + if self._is_nested is None: + raise NotImplementedError( + "StageNetProcessor.spatial() requires fit() to be called first." + ) + if self._is_nested: + # (visits, codes_per_visit) - visits are sequential/spatial, + # codes_per_visit is an unordered set and not spatial + return (True, False) + # Flat codes: single sequence dimension is spatial + return (True,) + raise IndexError( + f"StageNetProcessor has 2 output tensors, but index {i} was requested." + ) + def __repr__(self): if self._is_nested: return ( @@ -369,6 +421,55 @@ def size(self): """Return feature dimension.""" return self._size + def is_continuous(self) -> bool: + """Numeric values are continuous.""" + return True + + def schema(self) -> tuple[str, ...]: + """Output is a tuple of (time_tensor, value_tensor).""" + return ("time", "value") + + def dim(self) -> tuple[int, ...]: + """Number of dimensions for each output tensor. + + Time tensor is 1D. Value tensor is 1D (flat) or 2D (nested). + Must be called after fit(). + + Returns: + (1, 1) for flat values or (1, 2) for nested values. + """ + if self._is_nested is None: + raise NotImplementedError( + "StageNetTensorProcessor.dim() requires fit() to be called first " + "to determine whether values are flat or nested." 
+ ) + if self._is_nested: + return (1, 2) + return (1, 1) + + def spatial(self, i: int) -> tuple[bool, ...]: + """Whether each dimension of the i-th output tensor is spatial. + + Args: + i: 0 for time tensor, 1 for value tensor. + """ + if i == 0: + # Time tensor: 1D, the time dimension is spatial + return (True,) + elif i == 1: + if self._is_nested is None: + raise NotImplementedError( + "StageNetTensorProcessor.spatial() requires fit() to be called first." + ) + if self._is_nested: + # (time_steps, features) - time is spatial, features are not + return (True, False) + # Flat: single sequence dimension is spatial + return (True,) + raise IndexError( + f"StageNetTensorProcessor has 2 output tensors, but index {i} was requested." + ) + def __repr__(self): return ( f"StageNetTensorProcessor(is_nested={self._is_nested}, " diff --git a/pyhealth/processors/tensor_processor.py b/pyhealth/processors/tensor_processor.py index b74b98ac5..e6d7d3730 100644 --- a/pyhealth/processors/tensor_processor.py +++ b/pyhealth/processors/tensor_processor.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Dict, Iterable, Optional import torch @@ -21,15 +21,41 @@ class TensorProcessor(FeatureProcessor): - torch.Tensor with appropriate shape and dtype """ - def __init__(self, dtype: torch.dtype = torch.float32): + def __init__( + self, + dtype: torch.dtype = torch.float32, + spatial_dims: Optional[tuple[bool, ...]] = None, + ): """ Initialize the TensorProcessor. Args: dtype: The desired torch data type for the output tensor. Default is torch.float32. + spatial_dims: Tuple of booleans indicating which dimensions are spatial. + If None, defaults to all False. Default is None. """ self.dtype = dtype + self._n_dim = None + self._spatial_dims = spatial_dims + + def fit(self, samples: Iterable[Dict[str, Any]], field: str) -> None: + """Infer n_dim from the first valid sample. + + Args: + samples: Iterable of sample dictionaries. + field: The field name to extract from samples. 
+ """ + for sample in samples: + if field in sample and sample[field] is not None: + value = sample[field] + tensor = ( + value.detach().clone() + if isinstance(value, torch.Tensor) + else torch.tensor(value, dtype=self.dtype) + ) + self._n_dim = tensor.dim() + break def process(self, value: Any) -> torch.Tensor: """ @@ -57,6 +83,55 @@ def size(self) -> None: """ return None + def is_continuous(self) -> bool: + """Whether the output tensor is continuous, inferred from dtype. + + Returns: + True if dtype is floating point, False otherwise. + """ + return self.dtype.is_floating_point + + def schema(self) -> tuple[str, ...]: + return ("value",) + + def dim(self) -> tuple[int, ...]: + """Number of dimensions for the output tensor. + + Returns: + (n_dim,) + + Raises: + NotImplementedError: If n_dim was not provided and fit() was not called. + """ + if self._n_dim is None: + raise NotImplementedError( + "TensorProcessor cannot determine n_dim automatically. " + "Call fit() first." + ) + return (self._n_dim,) + + def spatial(self, i: int) -> tuple[bool, ...]: + """Whether each dimension of the output tensor is spatial. + + If spatial_dims was provided at init, returns that. Otherwise defaults + to all False based on n_dim. + + Args: + i: Index of the output tensor (must be 0). + """ + if i != 0: + raise IndexError( + f"TensorProcessor has 1 output tensor, but index {i} was requested." + ) + if self._spatial_dims is not None: + return self._spatial_dims + if self._n_dim is None: + raise NotImplementedError( + "TensorProcessor cannot determine spatial dims. " + "Call fit() first." + ) + return tuple(False for _ in range(self._n_dim)) + def __repr__(self) -> str: """ String representation of the processor. 
From 8c11754412411b14af868726999595e702e0b88c Mon Sep 17 00:00:00 2001 From: Yongda Fan Date: Sat, 7 Feb 2026 05:03:37 -0500 Subject: [PATCH 4/6] Add comment --- pyhealth/processors/base_processor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyhealth/processors/base_processor.py b/pyhealth/processors/base_processor.py index 16768f079..da5371800 100644 --- a/pyhealth/processors/base_processor.py +++ b/pyhealth/processors/base_processor.py @@ -68,6 +68,11 @@ def schema(self) -> tuple[str, ...]: this should return a tuple of the same length as the tuple, with the semantic name of each tensor, such as `["time", "value"]`, `["value", "mask"]`, etc. + Typical semantic names include: + - "value": the main processed tensor output of the processor + - "time": the time tensor output of the processor (mostly for StageNet) + - "mask": the mask tensor output of the processor (if applicable) + Returns: Tuple of semantic names corresponding to the output of the processor. """ From b87f795e3c7bd4d90c1b0fe3500ac5b5fd45f6b2 Mon Sep 17 00:00:00 2001 From: Yongda Fan Date: Sat, 7 Feb 2026 06:19:07 -0500 Subject: [PATCH 5/6] rename method to is_token to better reflect its usage --- pyhealth/processors/audio_processor.py | 8 ++++---- pyhealth/processors/base_processor.py | 12 ++++++----- .../deep_nested_sequence_processor.py | 12 +++++------ pyhealth/processors/image_processor.py | 8 ++++---- pyhealth/processors/label_processor.py | 20 +++++++++---------- .../processors/nested_sequence_processor.py | 12 +++++------ pyhealth/processors/sequence_processor.py | 6 +++--- pyhealth/processors/stagenet_processor.py | 12 +++++------ pyhealth/processors/tensor_processor.py | 8 ++++---- pyhealth/processors/timeseries_processor.py | 6 +++--- 10 files changed, 53 insertions(+), 51 deletions(-) diff --git a/pyhealth/processors/audio_processor.py b/pyhealth/processors/audio_processor.py index 5e4ab433a..78a5973e4 100644 --- a/pyhealth/processors/audio_processor.py +++ 
b/pyhealth/processors/audio_processor.py @@ -134,13 +134,13 @@ def process(self, value: Union[str, Path]) -> Any: return waveform - def is_continuous(self) -> bool: - """Audio data is continuous (float-valued). + def is_token(self) -> bool: + """Audio data is continuous (float-valued), not discrete tokens. Returns: - True, since audio waveforms and spectrograms are continuous signals. + False, since audio waveforms and spectrograms are continuous signals. """ - return True + return False def schema(self) -> tuple[str, ...]: """Returns the schema of the processed audio feature. diff --git a/pyhealth/processors/base_processor.py b/pyhealth/processors/base_processor.py index da5371800..d64c5364a 100644 --- a/pyhealth/processors/base_processor.py +++ b/pyhealth/processors/base_processor.py @@ -53,14 +53,16 @@ def process(self, value: Any) -> Any: """ pass - def is_continuous(self) -> bool: - """Returns whether the output (in particular, the value tensor) of the processor is - continuous (float) or discrete (int). + def is_token(self) -> bool: + """Returns whether the output (in particular, the value tensor) of the processor + represents discrete token indices (True) or continuous values (False). This is used to + determine whether to apply token-based transformations (e.g. `nn.Embedding`) or + value-based augmentations (e.g. `nn.Linear`). Returns: - True if the output is continuous, False if it is discrete. + True if the output of the processor represents discrete token indices, False otherwise. """ - raise NotImplementedError("is_continuous method is not implemented for this processor.") + raise NotImplementedError("is_token method is not implemented for this processor.") def schema(self) -> tuple[str, ...]: """Returns the schema of the processed feature. 
For a processor that emits a single tensor, diff --git a/pyhealth/processors/deep_nested_sequence_processor.py b/pyhealth/processors/deep_nested_sequence_processor.py index aa640253c..bd24e3bff 100644 --- a/pyhealth/processors/deep_nested_sequence_processor.py +++ b/pyhealth/processors/deep_nested_sequence_processor.py @@ -185,9 +185,9 @@ def __repr__(self): f"max_inner_len={self._max_inner_len})" ) - def is_continuous(self) -> bool: - """Deep nested sequence codes are discrete indices.""" - return False + def is_token(self) -> bool: + """Deep nested sequence codes are discrete token indices.""" + return True def schema(self) -> tuple[str, ...]: return ("value",) @@ -400,9 +400,9 @@ def __repr__(self): f"forward_fill={self.forward_fill})" ) - def is_continuous(self) -> bool: - """Deep nested float values are continuous.""" - return True + def is_token(self) -> bool: + """Deep nested float values are continuous, not discrete tokens.""" + return False def schema(self) -> tuple[str, ...]: return ("value",) diff --git a/pyhealth/processors/image_processor.py b/pyhealth/processors/image_processor.py index 04ac53c96..2e78a91be 100644 --- a/pyhealth/processors/image_processor.py +++ b/pyhealth/processors/image_processor.py @@ -95,13 +95,13 @@ def process(self, value: Union[str, Path]) -> Any: img.load() # Avoid "too many open files" errors return self.transform(img) - def is_continuous(self) -> bool: - """Image data is continuous (float-valued pixel intensities). + def is_token(self) -> bool: + """Image data is continuous (float-valued pixel intensities), not discrete tokens. Returns: - True. + False. """ - return True + return False def schema(self) -> tuple[str, ...]: """Single tensor output. 
diff --git a/pyhealth/processors/label_processor.py b/pyhealth/processors/label_processor.py index a394027ba..81976fa2f 100644 --- a/pyhealth/processors/label_processor.py +++ b/pyhealth/processors/label_processor.py @@ -40,8 +40,8 @@ def process(self, value: Any) -> torch.Tensor: def size(self): return 1 - def is_continuous(self) -> bool: - """Binary labels are discrete (0 or 1).""" + def is_token(self) -> bool: + """Binary labels are continuous float targets for BCE loss.""" return False def schema(self) -> tuple[str, ...]: @@ -90,9 +90,9 @@ def process(self, value: Any) -> torch.Tensor: def size(self): return len(self.label_vocab) - def is_continuous(self) -> bool: - """Multi-class labels are discrete.""" - return False + def is_token(self) -> bool: + """Multi-class labels are discrete token indices.""" + return True def schema(self) -> tuple[str, ...]: return ("value",) @@ -151,8 +151,8 @@ def process(self, value: Any) -> torch.Tensor: def size(self): return len(self.label_vocab) - def is_continuous(self) -> bool: - """Multi-label indicators are discrete (binary 0/1).""" + def is_token(self) -> bool: + """Multi-label indicators are continuous float targets for BCE loss.""" return False def schema(self) -> tuple[str, ...]: @@ -185,9 +185,9 @@ def process(self, value: Any) -> torch.Tensor: def size(self): return 1 - def is_continuous(self) -> bool: - """Regression labels are continuous.""" - return True + def is_token(self) -> bool: + """Regression labels are continuous, not discrete tokens.""" + return False def schema(self) -> tuple[str, ...]: return ("value",) diff --git a/pyhealth/processors/nested_sequence_processor.py b/pyhealth/processors/nested_sequence_processor.py index 2dd67800d..009c6fe27 100644 --- a/pyhealth/processors/nested_sequence_processor.py +++ b/pyhealth/processors/nested_sequence_processor.py @@ -162,9 +162,9 @@ def __repr__(self): f"padding={self._padding})" ) - def is_continuous(self) -> bool: - """Nested sequence codes are discrete 
indices.""" - return False + def is_token(self) -> bool: + """Nested sequence codes are discrete token indices.""" + return True def schema(self) -> tuple[str, ...]: return ("value",) @@ -361,9 +361,9 @@ def __repr__(self): f"padding={self._padding})" ) - def is_continuous(self) -> bool: - """Nested float values are continuous.""" - return True + def is_token(self) -> bool: + """Nested float values are continuous, not discrete tokens.""" + return False def schema(self) -> tuple[str, ...]: return ("value",) diff --git a/pyhealth/processors/sequence_processor.py b/pyhealth/processors/sequence_processor.py index 389b62ccf..8b831e8e2 100644 --- a/pyhealth/processors/sequence_processor.py +++ b/pyhealth/processors/sequence_processor.py @@ -71,9 +71,9 @@ def add(self, tokens: set[str]): def size(self): return len(self.code_vocab) - def is_continuous(self) -> bool: - """Sequence codes are discrete indices.""" - return False + def is_token(self) -> bool: + """Sequence codes are discrete token indices.""" + return True def schema(self) -> tuple[str, ...]: return ("value",) diff --git a/pyhealth/processors/stagenet_processor.py b/pyhealth/processors/stagenet_processor.py index 863c7bebd..71165c859 100644 --- a/pyhealth/processors/stagenet_processor.py +++ b/pyhealth/processors/stagenet_processor.py @@ -221,9 +221,9 @@ def size(self) -> int: """Return vocabulary size.""" return len(self.code_vocab) - def is_continuous(self) -> bool: - """Code indices are discrete.""" - return False + def is_token(self) -> bool: + """Code indices are discrete token indices.""" + return True def schema(self) -> tuple[str, ...]: """Output is a tuple of (time_tensor, value_tensor).""" @@ -421,9 +421,9 @@ def size(self): """Return feature dimension.""" return self._size - def is_continuous(self) -> bool: - """Numeric values are continuous.""" - return True + def is_token(self) -> bool: + """Numeric values are continuous, not discrete tokens.""" + return False def schema(self) -> tuple[str, ...]: 
"""Output is a tuple of (time_tensor, value_tensor).""" diff --git a/pyhealth/processors/tensor_processor.py b/pyhealth/processors/tensor_processor.py index e6d7d3730..2ef759460 100644 --- a/pyhealth/processors/tensor_processor.py +++ b/pyhealth/processors/tensor_processor.py @@ -83,13 +83,13 @@ def size(self) -> None: """ return None - def is_continuous(self) -> bool: - """Whether the output tensor is continuous, inferred from dtype. + def is_token(self) -> bool: + """Whether the output tensor represents discrete token indices, inferred from dtype. Returns: - True if dtype is floating point, False otherwise. + True if dtype is integer (discrete tokens), False if floating point (continuous). """ - return self.dtype.is_floating_point + return not self.dtype.is_floating_point def schema(self) -> tuple[str, ...]: return ("value",) diff --git a/pyhealth/processors/timeseries_processor.py b/pyhealth/processors/timeseries_processor.py index 7bc51835c..65dd2da0e 100644 --- a/pyhealth/processors/timeseries_processor.py +++ b/pyhealth/processors/timeseries_processor.py @@ -102,9 +102,9 @@ def size(self): # Size equals number of features, unknown until first process return self.n_features - def is_continuous(self) -> bool: - """Time series values are continuous.""" - return True + def is_token(self) -> bool: + """Time series values are continuous, not discrete tokens.""" + return False def schema(self) -> tuple[str, ...]: return ("value",) From 094e226555ff66faae1dd972efd1c0874f64aca5 Mon Sep 17 00:00:00 2001 From: Yongda Fan Date: Sat, 7 Feb 2026 21:05:40 -0500 Subject: [PATCH 6/6] Update API --- pyhealth/processors/audio_processor.py | 12 +--- pyhealth/processors/base_processor.py | 9 +-- .../deep_nested_sequence_processor.py | 12 +--- pyhealth/processors/image_processor.py | 9 +-- pyhealth/processors/label_processor.py | 24 ++----- .../processors/nested_sequence_processor.py | 12 +--- pyhealth/processors/sequence_processor.py | 6 +- 
pyhealth/processors/stagenet_processor.py | 68 +++++++------------ pyhealth/processors/tensor_processor.py | 9 +-- pyhealth/processors/timeseries_processor.py | 6 +- 10 files changed, 39 insertions(+), 128 deletions(-) diff --git a/pyhealth/processors/audio_processor.py b/pyhealth/processors/audio_processor.py index 78a5973e4..b938b0b44 100644 --- a/pyhealth/processors/audio_processor.py +++ b/pyhealth/processors/audio_processor.py @@ -163,26 +163,16 @@ def dim(self) -> tuple[int, ...]: return (3,) return (2,) - def spatial(self, i: int) -> tuple[bool, ...]: + def spatial(self) -> tuple[bool, ...]: """Whether each dimension of the output tensor is spatial. For waveform (channels, samples): channels is not spatial, samples is. For mel spectrogram (channels, n_mels, time): channels is not spatial, n_mels and time are. - Args: - i: Index of the output tensor (must be 0). - Returns: Tuple of booleans for each axis. - - Raises: - IndexError: If i != 0 (only one output tensor). """ - if i != 0: - raise IndexError( - f"AudioProcessor has 1 output tensor, but index {i} was requested." - ) if self.n_mels is not None: # (channels, n_mels, time) return (False, True, True) diff --git a/pyhealth/processors/base_processor.py b/pyhealth/processors/base_processor.py index d64c5364a..48cbe26ae 100644 --- a/pyhealth/processors/base_processor.py +++ b/pyhealth/processors/base_processor.py @@ -89,19 +89,16 @@ def dim(self) -> tuple[int, ...]: """ raise NotImplementedError("dim method is not implemented for this processor.") - def spatial(self, i: int) -> tuple[bool, ...]: - """Whether each dimension (axis) of the i-th output tensor is spatial (i.e. corresponds to a spatial + def spatial(self) -> tuple[bool, ...]: + """Whether each dimension (axis) of the value tensor is spatial (i.e. corresponds to a spatial axis like time, height, width, etc.) or not. This is used to determine how to apply augmentations and other transformations that should only be applied to spatial dimensions. E.g. 
for CNN or RNN features, this would help determine which dimensions to apply spatial augmentations to, and which dimensions to treat as channels or features. - Args: - i: Index of the output tensor to check. - Returns: - Tuple of booleans corresponding to whether each axis of the i-th output tensor is spatial or not. + Tuple of booleans corresponding to whether each axis of the value tensor is spatial or not. """ raise NotImplementedError("spatial method is not implemented for this processor.") diff --git a/pyhealth/processors/deep_nested_sequence_processor.py b/pyhealth/processors/deep_nested_sequence_processor.py index bd24e3bff..2d67633eb 100644 --- a/pyhealth/processors/deep_nested_sequence_processor.py +++ b/pyhealth/processors/deep_nested_sequence_processor.py @@ -196,11 +196,7 @@ def dim(self) -> tuple[int, ...]: """Output is a 3D tensor (groups, visits, codes).""" return (3,) - def spatial(self, i: int) -> tuple[bool, ...]: - if i != 0: - raise IndexError( - f"DeepNestedSequenceProcessor has 1 output tensor, but index {i} was requested." - ) + def spatial(self) -> tuple[bool, ...]: # Groups are not sequential; visits are temporal/spatial; codes-per-visit is an unordered set return (False, True, False) @@ -411,10 +407,6 @@ def dim(self) -> tuple[int, ...]: """Output is a 3D tensor (groups, visits, features).""" return (3,) - def spatial(self, i: int) -> tuple[bool, ...]: - if i != 0: - raise IndexError( - f"DeepNestedFloatsProcessor has 1 output tensor, but index {i} was requested." 
- ) + def spatial(self) -> tuple[bool, ...]: # Groups are not sequential; visits are temporal/spatial; features dimension is not return (False, True, False) \ No newline at end of file diff --git a/pyhealth/processors/image_processor.py b/pyhealth/processors/image_processor.py index 2e78a91be..b0185c1ec 100644 --- a/pyhealth/processors/image_processor.py +++ b/pyhealth/processors/image_processor.py @@ -119,21 +119,14 @@ def dim(self) -> tuple[int, ...]: """ return (3,) - def spatial(self, i: int) -> tuple[bool, ...]: + def spatial(self) -> tuple[bool, ...]: """Spatial axes for the output tensor (C, H, W). Channels are not spatial; height and width are. - Args: - i: Index of the output tensor (must be 0). - Returns: (False, True, True) """ - if i != 0: - raise IndexError( - f"ImageProcessor has 1 output tensor, but index {i} was requested." - ) return (False, True, True) def __repr__(self) -> str: diff --git a/pyhealth/processors/label_processor.py b/pyhealth/processors/label_processor.py index 81976fa2f..969721995 100644 --- a/pyhealth/processors/label_processor.py +++ b/pyhealth/processors/label_processor.py @@ -51,11 +51,7 @@ def dim(self) -> tuple[int, ...]: """Output shape is (1,), so 1 dimension.""" return (1,) - def spatial(self, i: int) -> tuple[bool, ...]: - if i != 0: - raise IndexError( - f"BinaryLabelProcessor has 1 output tensor, but index {i} was requested." - ) + def spatial(self) -> tuple[bool, ...]: return (False,) def __repr__(self): @@ -101,11 +97,7 @@ def dim(self) -> tuple[int, ...]: """Output is a scalar tensor (dim 0).""" return (0,) - def spatial(self, i: int) -> tuple[bool, ...]: - if i != 0: - raise IndexError( - f"MultiClassLabelProcessor has 1 output tensor, but index {i} was requested." 
- ) + def spatial(self) -> tuple[bool, ...]: return () def __repr__(self): @@ -162,11 +154,7 @@ def dim(self) -> tuple[int, ...]: """Output shape is (num_classes,), so 1 dimension.""" return (1,) - def spatial(self, i: int) -> tuple[bool, ...]: - if i != 0: - raise IndexError( - f"MultiLabelProcessor has 1 output tensor, but index {i} was requested." - ) + def spatial(self) -> tuple[bool, ...]: return (False,) def __repr__(self): @@ -196,11 +184,7 @@ def dim(self) -> tuple[int, ...]: """Output shape is (1,), so 1 dimension.""" return (1,) - def spatial(self, i: int) -> tuple[bool, ...]: - if i != 0: - raise IndexError( - f"RegressionLabelProcessor has 1 output tensor, but index {i} was requested." - ) + def spatial(self) -> tuple[bool, ...]: return (False,) def __repr__(self): diff --git a/pyhealth/processors/nested_sequence_processor.py b/pyhealth/processors/nested_sequence_processor.py index 009c6fe27..461575621 100644 --- a/pyhealth/processors/nested_sequence_processor.py +++ b/pyhealth/processors/nested_sequence_processor.py @@ -173,11 +173,7 @@ def dim(self) -> tuple[int, ...]: """Output is a 2D tensor (visits, codes_per_visit).""" return (2,) - def spatial(self, i: int) -> tuple[bool, ...]: - if i != 0: - raise IndexError( - f"NestedSequenceProcessor has 1 output tensor, but index {i} was requested." - ) + def spatial(self) -> tuple[bool, ...]: # Visits (time) is spatial; codes-per-visit is an unordered set, not spatial return (True, False) @@ -372,10 +368,6 @@ def dim(self) -> tuple[int, ...]: """Output is a 2D tensor (visits, features).""" return (2,) - def spatial(self, i: int) -> tuple[bool, ...]: - if i != 0: - raise IndexError( - f"NestedFloatsProcessor has 1 output tensor, but index {i} was requested." 
- ) + def spatial(self) -> tuple[bool, ...]: # Visits (time) is spatial; features dimension is not return (True, False) diff --git a/pyhealth/processors/sequence_processor.py b/pyhealth/processors/sequence_processor.py index 8b831e8e2..d7a7b1ddf 100644 --- a/pyhealth/processors/sequence_processor.py +++ b/pyhealth/processors/sequence_processor.py @@ -82,11 +82,7 @@ def dim(self) -> tuple[int, ...]: """Output is a 1D tensor of code indices.""" return (1,) - def spatial(self, i: int) -> tuple[bool, ...]: - if i != 0: - raise IndexError( - f"SequenceProcessor has 1 output tensor, but index {i} was requested." - ) + def spatial(self) -> tuple[bool, ...]: return (True,) def __repr__(self): diff --git a/pyhealth/processors/stagenet_processor.py b/pyhealth/processors/stagenet_processor.py index 71165c859..cce8819c5 100644 --- a/pyhealth/processors/stagenet_processor.py +++ b/pyhealth/processors/stagenet_processor.py @@ -247,29 +247,18 @@ def dim(self) -> tuple[int, ...]: return (1, 2) return (1, 1) - def spatial(self, i: int) -> tuple[bool, ...]: - """Whether each dimension of the i-th output tensor is spatial. - - Args: - i: 0 for time tensor, 1 for value tensor. - """ - if i == 0: - # Time tensor: 1D, the time dimension is spatial - return (True,) - elif i == 1: - if self._is_nested is None: - raise NotImplementedError( - "StageNetProcessor.spatial() requires fit() to be called first." - ) - if self._is_nested: - # (visits, codes_per_visit) - visits are sequential/spatial, - # codes_per_visit is an unordered set and not spatial - return (True, False) - # Flat codes: single sequence dimension is spatial - return (True,) - raise IndexError( - f"StageNetProcessor has 2 output tensors, but index {i} was requested." - ) + def spatial(self) -> tuple[bool, ...]: + """Whether each dimension of the value tensor is spatial.""" + if self._is_nested is None: + raise NotImplementedError( + "StageNetProcessor.spatial() requires fit() to be called first." 
+ ) + if self._is_nested: + # (visits, codes_per_visit) - visits are sequential/spatial, + # codes_per_visit is an unordered set and not spatial + return (True, False) + # Flat codes: single sequence dimension is spatial + return (True,) def __repr__(self): if self._is_nested: @@ -447,28 +436,17 @@ def dim(self) -> tuple[int, ...]: return (1, 2) return (1, 1) - def spatial(self, i: int) -> tuple[bool, ...]: - """Whether each dimension of the i-th output tensor is spatial. - - Args: - i: 0 for time tensor, 1 for value tensor. - """ - if i == 0: - # Time tensor: 1D, the time dimension is spatial - return (True,) - elif i == 1: - if self._is_nested is None: - raise NotImplementedError( - "StageNetTensorProcessor.spatial() requires fit() to be called first." - ) - if self._is_nested: - # (time_steps, features) - time is spatial, features are not - return (True, False) - # Flat: single sequence dimension is spatial - return (True,) - raise IndexError( - f"StageNetTensorProcessor has 2 output tensors, but index {i} was requested." - ) + def spatial(self) -> tuple[bool, ...]: + """Whether each dimension of the value tensor is spatial.""" + if self._is_nested is None: + raise NotImplementedError( + "StageNetTensorProcessor.spatial() requires fit() to be called first." + ) + if self._is_nested: + # (time_steps, features) - time is spatial, features are not + return (True, False) + # Flat: single sequence dimension is spatial + return (True,) def __repr__(self): return ( diff --git a/pyhealth/processors/tensor_processor.py b/pyhealth/processors/tensor_processor.py index 2ef759460..b1270051d 100644 --- a/pyhealth/processors/tensor_processor.py +++ b/pyhealth/processors/tensor_processor.py @@ -110,19 +110,12 @@ def dim(self) -> tuple[int, ...]: ) return (self._n_dim,) - def spatial(self, i: int) -> tuple[bool, ...]: + def spatial(self) -> tuple[bool, ...]: """Whether each dimension of the output tensor is spatial. If spatial_dims was provided at init, returns that. 
Otherwise defaults to all False based on n_dim. - - Args: - i: Index of the output tensor (must be 0). """ - if i != 0: - raise IndexError( - f"TensorProcessor has 1 output tensor, but index {i} was requested." - ) if self._spatial_dims is not None: return self._spatial_dims if self._n_dim is None: diff --git a/pyhealth/processors/timeseries_processor.py b/pyhealth/processors/timeseries_processor.py index 65dd2da0e..e761e8035 100644 --- a/pyhealth/processors/timeseries_processor.py +++ b/pyhealth/processors/timeseries_processor.py @@ -113,11 +113,7 @@ def dim(self) -> tuple[int, ...]: """Output is a 2D tensor (time_steps, features).""" return (2,) - def spatial(self, i: int) -> tuple[bool, ...]: - if i != 0: - raise IndexError( - f"TimeseriesProcessor has 1 output tensor, but index {i} was requested." - ) + def spatial(self) -> tuple[bool, ...]: # Time dimension is spatial; feature dimension is not return (True, False)