11 changes: 7 additions & 4 deletions pyhealth/processors/base_processor.py
@@ -93,24 +93,27 @@ def process(self, samples: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
pass

class VocabMixin(ABC):
class TokenProcessorInterface(ABC):
"""
Base class for feature processors that build a token vocabulary.

Provides a common interface for accessing and modifying vocabulary-related information.
"""

PAD = 0
UNK = 1

@abstractmethod
def remove(self, vocabularies: set[str]):
def remove(self, tokens: set[str]):
"""Remove specified vocabularies from the processor."""
pass

@abstractmethod
def retain(self, vocabularies: set[str]):
def retain(self, tokens: set[str]):
"""Retain only the specified vocabularies in the processor."""
pass

@abstractmethod
def add(self, vocabularies: set[str]):
def add(self, tokens: set[str]):
"""Add specified vocabularies to the processor."""
pass
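The interface above pins the special-token indices as class constants (PAD = 0, UNK = 1) and leaves remove/retain/add abstract. A minimal sketch of a conforming implementation (hypothetical, not part of this PR; ToyTokenProcessor and its layout are illustrative only):

```python
from typing import Any, Dict


class ToyTokenProcessor:  # a real subclass would also extend FeatureProcessor
    PAD = 0
    UNK = 1

    def __init__(self):
        # Special tokens are seeded up front at their fixed indices.
        self.code_vocab: Dict[Any, int] = {"<pad>": self.PAD, "<unk>": self.UNK}

    def add(self, tokens: set[str]):
        # Append new tokens after the current highest index.
        i = len(self.code_vocab)
        for token in tokens:
            if token not in self.code_vocab:
                self.code_vocab[token] = i
                i += 1

    def remove(self, tokens: set[str]):
        # Drop the given tokens, but never the specials, then re-index densely.
        keep = (set(self.code_vocab) - tokens) | {"<pad>", "<unk>"}
        order = [k for k, _ in sorted(self.code_vocab.items(), key=lambda x: x[1]) if k in keep]
        self.code_vocab = {k: i for i, k in enumerate(order)}

    def retain(self, tokens: set[str]):
        # Keep only the given tokens plus the specials, preserving relative order.
        keep = (set(self.code_vocab) & tokens) | {"<pad>", "<unk>"}
        order = [k for k, _ in sorted(self.code_vocab.items(), key=lambda x: x[1]) if k in keep]
        self.code_vocab = {k: i for i, k in enumerate(order)}
```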
42 changes: 21 additions & 21 deletions pyhealth/processors/deep_nested_sequence_processor.py
Expand Up @@ -3,11 +3,11 @@
import torch

from . import register_processor
from .base_processor import FeatureProcessor, VocabMixin
from .base_processor import FeatureProcessor, TokenProcessorInterface


@register_processor("deep_nested_sequence")
class DeepNestedSequenceProcessor(FeatureProcessor, VocabMixin):
class DeepNestedSequenceProcessor(FeatureProcessor, TokenProcessorInterface):
"""
Feature processor for deeply nested categorical sequences with vocabulary.

@@ -30,8 +30,8 @@ class DeepNestedSequenceProcessor(FeatureProcessor, VocabMixin):
(num_groups, max_visits_per_group, max_codes_per_visit)

Special tokens:
- <unk>: -1 for unknown codes
- <pad>: 0 for padding
- <unk>: 1 for unknown codes

Examples:
>>> processor = DeepNestedSequenceProcessor()
@@ -45,9 +45,8 @@ class DeepNestedSequenceProcessor(FeatureProcessor, VocabMixin):
"""

def __init__(self):
# -1 for <unk> for ease of boolean arithmetic > 0, > -1, etc.
self.code_vocab: Dict[Any, int] = {"<unk>": -1, "<pad>": 0}
self._next_index = 1
self.code_vocab: Dict[Any, int] = {"<pad>": self.PAD, "<unk>": self.UNK}
self._next_index = 2
self._max_middle_len = 1 # Maximum length of middle sequences (e.g. visits)
self._max_inner_len = 1 # Maximum length of inner sequences (e.g. codes per visit)

@@ -86,26 +85,27 @@ def fit(self, samples: Iterable[Dict[str, Any]], field: str) -> None:
self._max_middle_len = max(1, max_middle_len)
self._max_inner_len = max(1, max_inner_len)

def remove(self, vocabularies: set[str]):
def remove(self, tokens: set[str]):
"""Remove specified vocabularies from the processor."""
vocab = list(set(self.code_vocab.keys()) - vocabularies - {"<pad>", "<unk>"})
self.code_vocab = {"<pad>": 0, "<unk>": -1}
for i, v in enumerate(vocab):
self.code_vocab[v] = i + 1
keep = (set(self.code_vocab.keys()) - tokens) | {"<pad>", "<unk>"}
order = [k for k, _ in sorted(self.code_vocab.items(), key=lambda x: x[1]) if k in keep]
self.code_vocab = {k: i for i, k in enumerate(order)}

def retain(self, vocabularies: set[str]):
def retain(self, tokens: set[str]):
"""Retain only the specified vocabularies in the processor."""
vocab = list(set(self.code_vocab.keys()) & vocabularies)
self.code_vocab = {"<pad>": 0, "<unk>": -1}
for i, v in enumerate(vocab):
self.code_vocab[v] = i + 1
keep = (set(self.code_vocab.keys()) & tokens) | {"<pad>", "<unk>"}
order = [k for k, _ in sorted(self.code_vocab.items(), key=lambda x: x[1]) if k in keep]
self.code_vocab = {k: i for i, k in enumerate(order)}

def add(self, vocabularies: set[str]):
def add(self, tokens: set[str]):
"""Add specified vocabularies to the processor."""
vocab = list(set(self.code_vocab.keys()) | vocabularies - {"<pad>", "<unk>"})
self.code_vocab = {"<pad>": 0, "<unk>": -1}
for i, v in enumerate(vocab):
self.code_vocab[v] = i + 1
i = len(self.code_vocab)
for token in tokens:
    if token not in self.code_vocab:
        self.code_vocab[token] = i
        i += 1

def process(self, value: List[List[List[Any]]]) -> torch.Tensor:
"""Process deep nested sequence into padded 3D tensor.
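The key property of the rewritten remove/retain is that surviving tokens keep their relative order while indices are compacted, and <pad>/<unk> never move. A usage sketch under that assumption (tokens are added one at a time here because set iteration order is not deterministic):

```python
proc = DeepNestedSequenceProcessor()
proc.add({"A"})  # {"<pad>": 0, "<unk>": 1, "A": 2}
proc.add({"B"})  # {"<pad>": 0, "<unk>": 1, "A": 2, "B": 3}
proc.add({"C"})  # {"<pad>": 0, "<unk>": 1, "A": 2, "B": 3, "C": 4}

proc.remove({"B"})
# "A" and "C" keep their relative order; indices are re-packed densely:
# {"<pad>": 0, "<unk>": 1, "A": 2, "C": 3}

proc.retain({"C"})
# The special tokens always survive retain:
# {"<pad>": 0, "<unk>": 1, "C": 2}
```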
43 changes: 21 additions & 22 deletions pyhealth/processors/nested_sequence_processor.py
Expand Up @@ -3,11 +3,11 @@
import torch

from . import register_processor
from .base_processor import FeatureProcessor, VocabMixin
from .base_processor import FeatureProcessor, TokenProcessorInterface


@register_processor("nested_sequence")
class NestedSequenceProcessor(FeatureProcessor, VocabMixin):
class NestedSequenceProcessor(FeatureProcessor, TokenProcessorInterface):
"""
Feature processor for nested categorical sequences with vocabulary.

@@ -22,8 +22,8 @@ class NestedSequenceProcessor(FeatureProcessor, VocabMixin):
4. Returns a 2D tensor of shape (num_visits, max_codes_per_visit)

Special tokens:
- <unk>: -1 for unknown codes
- <pad>: 0 for padding
- <unk>: 1 for unknown codes

Args:
padding: Additional padding to add on top of the observed maximum inner
@@ -45,9 +45,8 @@ class NestedSequenceProcessor(FeatureProcessor, VocabMixin):
"""

def __init__(self, padding: int = 0):
# <unk> will be set to len(vocab) after fit
self.code_vocab: Dict[Any, int] = {"<unk>": None, "<pad>": 0}
self._next_index = 1
self.code_vocab: Dict[Any, int] = {"<pad>": self.PAD, "<unk>": self.UNK}
self._next_index = 2
self._max_inner_len = 1 # Maximum length of inner sequences
self._padding = padding # Additional padding beyond observed max

@@ -82,27 +81,27 @@ def fit(self, samples: Iterable[Dict[str, Any]], field: str) -> None:
observed_max = max(1, max_inner_len)
self._max_inner_len = observed_max + self._padding

# Set <unk> token to len(vocab) - 1 after building vocabulary
# (-1 because <unk> is already in vocab)
self.code_vocab["<unk>"] = len(self.code_vocab) - 1

def remove(self, vocabularies: set[str]):
def remove(self, tokens: set[str]):
"""Remove specified vocabularies from the processor."""
vocab = list(set(self.code_vocab.keys()) - vocabularies - {"<pad>", "<unk>"})
vocab = ["<pad>"] + vocab + ["<unk>"]
self.code_vocab = {v: i for i, v in enumerate(vocab)}
keep = (set(self.code_vocab.keys()) - tokens) | {"<pad>", "<unk>"}
order = [k for k, _ in sorted(self.code_vocab.items(), key=lambda x: x[1]) if k in keep]
self.code_vocab = {k: i for i, k in enumerate(order)}

def retain(self, vocabularies: set[str]):
def retain(self, tokens: set[str]):
"""Retain only the specified vocabularies in the processor."""
vocab = list(set(self.code_vocab.keys()) & vocabularies)
vocab = ["<pad>"] + vocab + ["<unk>"]
self.code_vocab = {v: i for i, v in enumerate(vocab)}
keep = (set(self.code_vocab.keys()) & tokens) | {"<pad>", "<unk>"}
order = [k for k, _ in sorted(self.code_vocab.items(), key=lambda x: x[1]) if k in keep]
self.code_vocab = {k: i for i, k in enumerate(order)}

def add(self, vocabularies: set[str]):
def add(self, tokens: set[str]):
"""Add specified vocabularies to the processor."""
vocab = list(set(self.code_vocab.keys()) | vocabularies - {"<pad>", "<unk>"})
vocab = ["<pad>"] + vocab + ["<unk>"]
self.code_vocab = {v: i for i, v in enumerate(vocab)}
i = len(self.code_vocab)
for token in tokens:
    if token not in self.code_vocab:
        self.code_vocab[token] = i
        i += 1

def process(self, value: List[List[Any]]) -> torch.Tensor:
"""Process nested sequence into padded 2D tensor.
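Pinning <unk> at index 1, instead of recomputing it as len(vocab) - 1 after fit, means the index survives later vocabulary edits. A quick sketch of the invariant (the field name "codes" is hypothetical):

```python
proc = NestedSequenceProcessor(padding=1)
proc.fit([{"codes": [["A", "B"], ["C"]]}], "codes")

assert proc.code_vocab["<pad>"] == 0
assert proc.code_vocab["<unk>"] == 1

proc.remove({"A"})
# Editing the vocabulary does not shift the special-token indices.
assert proc.code_vocab["<pad>"] == 0
assert proc.code_vocab["<unk>"] == 1
```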
39 changes: 20 additions & 19 deletions pyhealth/processors/sequence_processor.py
Expand Up @@ -3,11 +3,11 @@
import torch

from . import register_processor
from .base_processor import FeatureProcessor, VocabMixin
from .base_processor import FeatureProcessor, TokenProcessorInterface


@register_processor("sequence")
class SequenceProcessor(FeatureProcessor, VocabMixin):
class SequenceProcessor(FeatureProcessor, TokenProcessorInterface):
"""
Feature processor for encoding categorical sequences (e.g., medical codes) into numerical indices.

@@ -16,9 +16,8 @@ class SequenceProcessor(FeatureProcessor, VocabMixin):
"""

def __init__(self):
# <unk> will be set to len(vocab) after fit
self.code_vocab: Dict[Any, int] = {"<pad>": 0}
self._next_index = 1
self.code_vocab: Dict[Any, int] = {"<pad>": self.PAD, "<unk>": self.UNK}
self._next_index = 2

def fit(self, samples: Iterable[Dict[str, Any]], field: str) -> None:
for sample in samples:
@@ -29,8 +28,6 @@ def fit(self, samples: Iterable[Dict[str, Any]], field: str) -> None:
self.code_vocab[token] = self._next_index
self._next_index += 1

self.code_vocab["<unk>"] = len(self.code_vocab)

def process(self, value: Any) -> torch.Tensor:
"""Process token value(s) into tensor of indices.

@@ -49,23 +46,27 @@

return torch.tensor(indices, dtype=torch.long)

def remove(self, vocabularies: set[str]):
def remove(self, tokens: set[str]):
"""Remove specified vocabularies from the processor."""
vocab = list(set(self.code_vocab.keys()) - vocabularies - {"<pad>", "<unk>"})
vocab = ["<pad>"] + vocab + ["<unk>"]
self.code_vocab = {v: i for i, v in enumerate(vocab)}
keep = (set(self.code_vocab.keys()) - tokens) | {"<pad>", "<unk>"}
order = [k for k, _ in sorted(self.code_vocab.items(), key=lambda x: x[1]) if k in keep]
self.code_vocab = {k: i for i, k in enumerate(order)}

def retain(self, vocabularies: set[str]):
def retain(self, tokens: set[str]):
"""Retain only the specified vocabularies in the processor."""
vocab = list(set(self.code_vocab.keys()) & vocabularies)
vocab = ["<pad>"] + vocab + ["<unk>"]
self.code_vocab = {v: i for i, v in enumerate(vocab)}
keep = (set(self.code_vocab.keys()) & tokens) | {"<pad>", "<unk>"}
order = [k for k, _ in sorted(self.code_vocab.items(), key=lambda x: x[1]) if k in keep]
self.code_vocab = {k: i for i, k in enumerate(order)}

def add(self, vocabularies: set[str]):
def add(self, tokens: set[str]):
"""Add specified vocabularies to the processor."""
vocab = list(set(self.code_vocab.keys()) | vocabularies - {"<pad>", "<unk>"})
vocab = ["<pad>"] + vocab + ["<unk>"]
self.code_vocab = {v: i for i, v in enumerate(vocab)}
i = len(self.code_vocab)
for token in tokens:
    if token not in self.code_vocab:
        self.code_vocab[token] = i
        i += 1

def size(self):
return len(self.code_vocab)
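With both special tokens seeded in __init__, fit now assigns real codes starting at index 2 and no longer patches <unk> afterwards, so size() is correct immediately. A sketch, assuming process() falls back to the <unk> index for unseen tokens (the field name "dx" is illustrative):

```python
proc = SequenceProcessor()
proc.fit([{"dx": ["I10", "E11"]}], "dx")

# Vocab is now {"<pad>": 0, "<unk>": 1, "I10": 2, "E11": 3}.
assert proc.size() == 4

encoded = proc.process(["I10", "Z99"])
# "I10" -> 2; "Z99" was never seen during fit, so it should map to UNK = 1
# (assuming the unknown-token fallback in process()).
```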
41 changes: 20 additions & 21 deletions pyhealth/processors/stagenet_processor.py
Expand Up @@ -3,11 +3,11 @@
import torch

from . import register_processor
from .base_processor import FeatureProcessor, VocabMixin
from .base_processor import FeatureProcessor, TokenProcessorInterface


@register_processor("stagenet")
class StageNetProcessor(FeatureProcessor, VocabMixin):
class StageNetProcessor(FeatureProcessor, TokenProcessorInterface):
"""
Feature processor for StageNet CODE inputs with coupled value/time data.

@@ -55,9 +55,8 @@ class StageNetProcessor(FeatureProcessor, VocabMixin):
"""

def __init__(self, padding: int = 0):
# <unk> will be set to len(vocab) after fit
self.code_vocab: Dict[Any, int] = {"<unk>": None, "<pad>": 0}
self._next_index = 1
self.code_vocab: Dict[Any, int] = {"<pad>": self.PAD, "<unk>": self.UNK}
self._next_index = 2
self._is_nested = None # Will be determined during fit
# Max inner sequence length for nested codes
self._max_nested_len = None
@@ -118,27 +117,27 @@ def fit(self, samples: Iterable[Dict[str, Any]], field: str) -> None:
observed_max = max(1, max_inner_len)
self._max_nested_len = observed_max + self._padding

# Set <unk> token to the next available index
# Since <unk> is already in the vocab dict, we use _next_index
self.code_vocab["<unk>"] = self._next_index

def remove(self, vocabularies: set[str]):
def remove(self, tokens: set[str]):
"""Remove specified vocabularies from the processor."""
vocab = list(set(self.code_vocab.keys()) - vocabularies - {"<pad>", "<unk>"})
vocab = ["<pad>"] + vocab + ["<unk>"]
self.code_vocab = {v: i for i, v in enumerate(vocab)}
keep = (set(self.code_vocab.keys()) - tokens) | {"<pad>", "<unk>"}
order = [k for k, _ in sorted(self.code_vocab.items(), key=lambda x: x[1]) if k in keep]
self.code_vocab = {k: i for i, k in enumerate(order)}

def retain(self, vocabularies: set[str]):
def retain(self, tokens: set[str]):
"""Retain only the specified vocabularies in the processor."""
vocab = list(set(self.code_vocab.keys()) & vocabularies)
vocab = ["<pad>"] + vocab + ["<unk>"]
self.code_vocab = {v: i for i, v in enumerate(vocab)}
keep = (set(self.code_vocab.keys()) & tokens) | {"<pad>", "<unk>"}
order = [k for k, _ in sorted(self.code_vocab.items(), key=lambda x: x[1]) if k in keep]
self.code_vocab = {k: i for i, k in enumerate(order)}

def add(self, vocabularies: set[str]):
def add(self, tokens: set[str]):
"""Add specified vocabularies to the processor."""
vocab = list(set(self.code_vocab.keys()) | vocabularies - {"<pad>", "<unk>"})
vocab = ["<pad>"] + vocab + ["<unk>"]
self.code_vocab = {v: i for i, v in enumerate(vocab)}
i = len(self.code_vocab)
for token in tokens:
    if token not in self.code_vocab:
        self.code_vocab[token] = i
        i += 1

def process(
self, value: Tuple[Optional[List], List]
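StageNetProcessor consumes coupled (times, codes) tuples, which the test file below exercises directly. A fit sketch mirroring those samples:

```python
proc = StageNetProcessor()
samples = [
    {"data": ([0.0, 1.0], [["A", "B"], ["C", "D", "E"]])},  # (times, nested codes)
    {"data": ([0.0], [["F"]])},
]
proc.fit(samples, "data")

# 2 special tokens + 6 observed codes = 8 entries, with the fixed special indices:
assert len(proc.code_vocab) == 8
assert proc.code_vocab["<pad>"] == 0
assert proc.code_vocab["<unk>"] == 1
```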
10 changes: 4 additions & 6 deletions tests/core/test_stagenet_processor.py
@@ -21,17 +21,16 @@ class TestStageNetProcessor(unittest.TestCase):
"""Tests for StageNetProcessor (categorical codes)."""

def test_unknown_token_index(self):
"""Test that <unk> token is len(vocab) - 1, not -1."""
"""Test that <unk> token is at index 1."""
processor = StageNetProcessor()
samples = [
{"data": ([0.0, 1.0], [["A", "B"], ["C", "D", "E"]])},
{"data": ([0.0], [["F"]])},
]
processor.fit(samples, "data")

# <unk> should be len(vocab) - 1 (last index)
expected_unk_idx = len(processor.code_vocab) - 1
self.assertEqual(processor.code_vocab["<unk>"], expected_unk_idx)
# <unk> should be at index 1 (following <pad> at 0)
self.assertEqual(processor.code_vocab["<unk>"], 1)

# <unk> must be >= 0 for nn.Embedding compatibility
self.assertGreaterEqual(processor.code_vocab["<unk>"], 0)
@@ -40,9 +39,8 @@ def test_unknown_token_index(self):
self.assertEqual(processor.code_vocab["<pad>"], 0)

# Verify vocab size includes both special tokens
# Vocab: <unk>, <pad>, A, B, C, D, E, F = 8 tokens
# Vocab: <pad>, <unk>, A, B, C, D, E, F = 8 tokens
self.assertEqual(len(processor.code_vocab), 8)
self.assertEqual(processor.code_vocab["<unk>"], 7)

def test_unknown_token_embedding_compatibility(self):
"""Test that <unk> index works with nn.Embedding."""
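The embedding-compatibility test exists because the old <unk> = -1 sentinel is rejected by torch.nn.Embedding, which requires indices in [0, num_embeddings). A minimal reproduction of both behaviors:

```python
import torch
import torch.nn as nn

# Indices must lie in [0, num_embeddings); padding_idx=0 matches <pad> = 0.
emb = nn.Embedding(num_embeddings=8, embedding_dim=4, padding_idx=0)

print(emb(torch.tensor([0, 1, 7])).shape)  # <pad>, <unk>, a real code: works

try:
    emb(torch.tensor([-1]))  # the old <unk> sentinel
except IndexError as err:
    print("old -1 <unk> fails:", err)
```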