diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 112e256ee07..2995de297d7 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -173,7 +173,8 @@ jobs:
             opt-einsum \
             nltk \
             fvcore \
-            scikit-optimize
+            scikit-optimize \
+            flair
           kill $KA
           cd src/main/python
           python -m unittest discover -s tests/scuro -p 'test_*.py' -v
diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py
index 8b5a8621d1d..7849c038165 100644
--- a/src/main/python/systemds/scuro/__init__.py
+++ b/src/main/python/systemds/scuro/__init__.py
@@ -30,7 +30,13 @@
     AggregatedRepresentation,
 )
 from systemds.scuro.representations.average import Average
-from systemds.scuro.representations.bert import Bert
+from systemds.scuro.representations.bert import (
+    Bert,
+    RoBERTa,
+    DistillBERT,
+    ALBERT,
+    ELECTRA,
+)
 from systemds.scuro.representations.bow import BoW
 from systemds.scuro.representations.concatenation import Concatenation
 from systemds.scuro.representations.context import Context
@@ -101,6 +107,16 @@
 from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
 from systemds.scuro.representations.vgg import VGG19
 from systemds.scuro.representations.clip import CLIPText, CLIPVisual
+from systemds.scuro.representations.text_context import (
+    SentenceBoundarySplit,
+    OverlappingSplit,
+)
+from systemds.scuro.representations.text_context_with_indices import (
+    SentenceBoundarySplitIndices,
+    OverlappingSplitIndices,
+)
+from systemds.scuro.representations.elmo import ELMoRepresentation
+

 __all__ = [
     "BaseLoader",
@@ -113,6 +129,10 @@
     "AggregatedRepresentation",
     "Average",
     "Bert",
+    "RoBERTa",
+    "DistillBERT",
+    "ALBERT",
+    "ELECTRA",
     "BoW",
     "Concatenation",
     "Context",
@@ -177,4 +197,9 @@
     "VGG19",
     "CLIPVisual",
     "CLIPText",
+    "SentenceBoundarySplit",
+    "OverlappingSplit",
+    "ELMoRepresentation",
+    "SentenceBoundarySplitIndices",
+    "OverlappingSplitIndices",
 ]
diff --git a/src/main/python/systemds/scuro/drsearch/operator_registry.py b/src/main/python/systemds/scuro/drsearch/operator_registry.py
index 3b20245956b..dc62e9b65b6 100644
--- a/src/main/python/systemds/scuro/drsearch/operator_registry.py
+++ b/src/main/python/systemds/scuro/drsearch/operator_registry.py
@@ -33,8 +33,10 @@ class Registry:
     _instance = None
     _representations = {}
-    _context_operators = []
+    _context_operators = {}
     _fusion_operators = []
+    _text_context_operators = []
+    _video_context_operators = []

     def __new__(cls):
         if not cls._instance:
@@ -60,8 +62,13 @@ def add_representation(
     ):
         self._representations[modality].append(representation)

-    def add_context_operator(self, context_operator):
-        self._context_operators.append(context_operator)
+    def add_context_operator(self, context_operator, modality_type):
+        if not isinstance(modality_type, list):
+            modality_type = [modality_type]
+        for m_type in modality_type:
+            if m_type not in self._context_operators:
+                self._context_operators[m_type] = []
+            self._context_operators[m_type].append(context_operator)

     def add_fusion_operator(self, fusion_operator):
         self._fusion_operators.append(fusion_operator)
@@ -76,9 +83,8 @@ def get_not_self_contained_representations(self, modality: ModalityType):
                 reps.append(rep)
         return reps

-    def get_context_operators(self):
-        # TODO: return modality specific context operations
-        return self._context_operators
+    def get_context_operators(self, modality_type):
+        return self._context_operators[modality_type]

     def get_fusion_operators(self):
         return self._fusion_operators
@@ -121,13 +127,15 @@ def decorator(cls):
         return decorator


-def register_context_operator():
+def register_context_operator(modality_type):
     """
     Decorator to register a context operator.
+
+    @param modality_type: The modality type for which the context operator is to be registered
     """

     def decorator(cls):
-        Registry().add_context_operator(cls)
+        Registry().add_context_operator(cls, modality_type)
         return cls

     return decorator
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 1a348a91df2..4cde294b17c 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -87,8 +87,8 @@ def _get_not_self_contained_reps(self, modality_type):
         )

     @lru_cache(maxsize=32)
-    def _get_context_operators(self):
-        return self.operator_registry.get_context_operators()
+    def _get_context_operators(self, modality_type):
+        return self.operator_registry.get_context_operators(modality_type)

     def store_results(self, file_name=None):
         if file_name is None:
@@ -302,6 +302,39 @@ def _build_modality_dag(
                     current_node_id = rep_node_id

             dags.append(builder.build(current_node_id))
+            if operator.needs_context:
+                context_operators = self._get_context_operators(modality.modality_type)
+                for context_op in context_operators:
+                    if operator.initial_context_length is not None:
+                        context_length = operator.initial_context_length
+
+                        context_node_id = builder.create_operation_node(
+                            context_op,
+                            [leaf_id],
+                            context_op(context_length).get_current_parameters(),
+                        )
+                    else:
+                        context_node_id = builder.create_operation_node(
+                            context_op,
+                            [leaf_id],
+                            context_op().get_current_parameters(),
+                        )
+
+                    context_rep_node_id = builder.create_operation_node(
+                        operator.__class__,
+                        [context_node_id],
+                        operator.get_current_parameters(),
+                    )
+
+                    agg_operator = AggregatedRepresentation()
+                    context_agg_node_id = builder.create_operation_node(
+                        agg_operator.__class__,
+                        [context_rep_node_id],
+                        agg_operator.get_current_parameters(),
+                    )
+
+                    dags.append(builder.build(context_agg_node_id))
+
             if not operator.self_contained:
                 not_self_contained_reps = self._get_not_self_contained_reps(
                     modality.modality_type
@@ -344,7 +377,7 @@
     def default_context_operators(self, modality, builder, leaf_id, current_node_id):
         dags = []
-        context_operators = self._get_context_operators()
+        context_operators = self._get_context_operators(modality.modality_type)
         for context_op in context_operators:
             if (
                 modality.modality_type != ModalityType.TEXT
@@ -368,7 +401,7 @@
     def temporal_context_operators(self, modality, builder, leaf_id, current_node_id):
         aggregators = self.operator_registry.get_representations(modality.modality_type)
-        context_operators = self._get_context_operators()
+        context_operators = self._get_context_operators(modality.modality_type)
         dags = []

         for agg in aggregators:
diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py
index f7739f03df9..3b014653028 100644
--- a/src/main/python/systemds/scuro/modality/transformed.py
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -18,8 +18,6 @@
 # under the License.
 #
 # -------------------------------------------------------------
-from functools import reduce
-from operator import or_
 from typing import Union, List

 from systemds.scuro.modality.type import ModalityType
diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py
index c2fe38176f7..23d97e869b0 100644
--- a/src/main/python/systemds/scuro/modality/type.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -108,8 +108,12 @@ def update_base_metadata(cls, md, data, data_is_single_instance=True):
             shape = data.shape
         elif data_layout is DataLayout.NESTED_LEVEL:
             if data_is_single_instance:
-                dtype = data.dtype
-                shape = data.shape
+                if isinstance(data, list):
+                    dtype = type(data[0])
+                    shape = (len(data), len(data[0]))
+                else:
+                    dtype = data.dtype
+                    shape = data.shape
             else:
                 shape = data[0].shape
                 dtype = data[0].dtype
@@ -306,13 +310,15 @@ def get_data_layout(cls, data, data_is_single_instance):
             return None

         if data_is_single_instance:
-            if (
+            if (isinstance(data, list) and not isinstance(data[0], str)) or (
+                isinstance(data, np.ndarray) and data.ndim == 1
+            ):
+                return DataLayout.SINGLE_LEVEL
+            elif (
                 isinstance(data, list)
                 or isinstance(data, np.ndarray)
-                and data.ndim == 1
+                or isinstance(data, torch.Tensor)
             ):
-                return DataLayout.SINGLE_LEVEL
-            elif isinstance(data, np.ndarray) or isinstance(data, torch.Tensor):
                 return DataLayout.NESTED_LEVEL

         if isinstance(data[0], list):
diff --git a/src/main/python/systemds/scuro/representations/aggregate.py b/src/main/python/systemds/scuro/representations/aggregate.py
index 0a8438e684f..9503a48587b 100644
--- a/src/main/python/systemds/scuro/representations/aggregate.py
+++ b/src/main/python/systemds/scuro/representations/aggregate.py
@@ -71,7 +71,7 @@ def execute(self, modality):
         max_len = 0
         for i, instance in enumerate(modality.data):
             data.append([])
-            if isinstance(instance, np.ndarray):
+            if isinstance(instance, np.ndarray) or isinstance(instance, list):
                 if (
                     modality.modality_type == ModalityType.IMAGE
                     or modality.modality_type == ModalityType.VIDEO
diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py
index 4d486bff59d..be579c0dd6c 100644
--- a/src/main/python/systemds/scuro/representations/bert.py
+++ b/src/main/python/systemds/scuro/representations/bert.py
@@ -22,7 +22,7 @@
 from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 import torch
-from transformers import BertTokenizerFast, BertModel
+from transformers import AutoTokenizer, AutoModel
 from systemds.scuro.representations.utils import save_embeddings
 from systemds.scuro.modality.type import ModalityType
 from systemds.scuro.drsearch.operator_registry import register_representation
@@ -37,15 +37,18 @@ class TextDataset(Dataset):
     def __init__(self, texts):
         self.texts = []
-        for text in texts:
-            if text is None:
-                self.texts.append("")
-            elif isinstance(text, np.ndarray):
-                self.texts.append(str(text.item()) if text.size == 1 else str(text))
-            elif not isinstance(text, str):
-                self.texts.append(str(text))
-            else:
-                self.texts.append(text)
+        if isinstance(texts, list):
+            self.texts = texts
+        else:
+            for text in texts:
+                if text is None:
+                    self.texts.append("")
+                elif isinstance(text, np.ndarray):
+                    self.texts.append(str(text.item()) if text.size == 1 else str(text))
+                elif not isinstance(text, str):
+                    self.texts.append(str(text))
+                else:
+                    self.texts.append(text)

     def __len__(self):
         return len(self.texts)
@@ -54,36 +57,61 @@ def __getitem__(self, idx):
         return self.texts[idx]


-@register_representation(ModalityType.TEXT)
-class Bert(UnimodalRepresentation):
-    def __init__(self, model_name="bert", output_file=None, max_seq_length=512):
-        parameters = {"model_name": "bert"}
+class BertFamily(UnimodalRepresentation):
+    def __init__(
+        self,
+        representation_name,
+        model_name,
+        layer,
+        parameters={},
+        output_file=None,
+        max_seq_length=512,
+    ):
         self.model_name = model_name
-        super().__init__("Bert", ModalityType.EMBEDDING, parameters)
+        super().__init__(representation_name, ModalityType.EMBEDDING, parameters)
+        self.layer_name = layer
         self.output_file = output_file
         self.max_seq_length = max_seq_length
+        self.needs_context = True
+        self.initial_context_length = 350

     def transform(self, modality):
         transformed_modality = TransformedModality(modality, self)
-        model_name = "bert-base-uncased"
-        tokenizer = BertTokenizerFast.from_pretrained(
-            model_name, clean_up_tokenization_spaces=True
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name, clean_up_tokenization_spaces=True
         )
+        self.model = AutoModel.from_pretrained(self.model_name).to(get_device())
+        self.bert_output = None
+
+        def get_activation(name):
+            def hook(model, input, output):
+                self.bert_output = output.detach().cpu().numpy()

-        model = BertModel.from_pretrained(model_name).to(get_device())
+            return hook

-        embeddings = self.create_embeddings(modality, model, tokenizer)
+        if self.layer_name != "cls":
+            for name, layer in self.model.named_modules():
+                if name == self.layer_name:
+                    layer.register_forward_hook(get_activation(name))
+                    break
+
+        if isinstance(modality.data[0], list):
+            embeddings = []
+            for d in modality.data:
+                embeddings.append(self.create_embeddings(d, self.model, tokenizer))
+        else:
+            embeddings = self.create_embeddings(modality.data, self.model, tokenizer)

         if self.output_file is not None:
             save_embeddings(embeddings, self.output_file)

         transformed_modality.data_type = np.float32
-        transformed_modality.data = np.array(embeddings)
+        transformed_modality.data = embeddings
         return transformed_modality

-    def create_embeddings(self, modality, model, tokenizer):
-        dataset = TextDataset(modality.data)
+    def create_embeddings(self, data, model, tokenizer):
+        dataset = TextDataset(data)
         dataloader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=None)
         cls_embeddings = []
         for batch in dataloader:
@@ -94,27 +122,146 @@ def create_embeddings(self, modality, model, tokenizer):
                 padding="max_length",
                 return_attention_mask=True,
                 truncation=True,
-                max_length=512,  # TODO: make this dynamic
+                max_length=512,  # TODO: make this dynamic with parameter to tune
             )
             inputs.to(get_device())

-            ModalityType.TEXT.add_field_for_instances(
-                modality.metadata,
-                "token_to_character_mapping",
-                inputs.data["offset_mapping"].tolist(),
-            )
-
-            ModalityType.TEXT.add_field_for_instances(
-                modality.metadata,
-                "attention_masks",
-                inputs.data["attention_mask"].tolist(),
-            )
+            # ModalityType.TEXT.add_field_for_instances(
+            #     modality.metadata,
+            #     "token_to_character_mapping",
+            #     inputs.data["offset_mapping"].tolist(),
+            # )
+            #
+            # ModalityType.TEXT.add_field_for_instances(
+            #     modality.metadata,
+            #     "attention_masks",
+            #     inputs.data["attention_mask"].tolist(),
+            # )

             del inputs.data["offset_mapping"]
             with torch.no_grad():
                 outputs = model(**inputs)
-
-            cls_embedding = outputs.last_hidden_state.detach().cpu().numpy()
+            if self.layer_name == "cls":
+                cls_embedding = outputs.last_hidden_state.detach().cpu().numpy()
+            else:
+                cls_embedding = self.bert_output
             cls_embeddings.extend(cls_embedding)

         return np.array(cls_embeddings)
+
+
+@register_representation(ModalityType.TEXT)
+class Bert(BertFamily):
+    def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+        parameters = {
+            "layer_name": [
+                "cls",
+                "encoder.layer.0",
+                "encoder.layer.1",
+                "encoder.layer.2",
+                "encoder.layer.3",
+                "encoder.layer.4",
+                "encoder.layer.5",
+                "encoder.layer.6",
+                "encoder.layer.7",
+                "encoder.layer.8",
+                "encoder.layer.9",
+                "encoder.layer.10",
+                "encoder.layer.11",
+                "pooler",
+                "pooler.activation",
+            ]
+        }
+        super().__init__(
+            "Bert", "bert-base-uncased", layer, parameters, output_file, max_seq_length
+        )
+
+
+@register_representation(ModalityType.TEXT)
+class RoBERTa(BertFamily):
+    def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+        parameters = {
+            "layer_name": [
+                "cls",
+                "encoder.layer.0",
+                "encoder.layer.1",
+                "encoder.layer.2",
+                "encoder.layer.3",
+                "encoder.layer.4",
+                "encoder.layer.5",
+                "encoder.layer.6",
+                "encoder.layer.7",
+                "encoder.layer.8",
+                "encoder.layer.9",
+                "encoder.layer.10",
+                "encoder.layer.11",
+                "pooler",
+                "pooler.activation",
+            ]
+        }
+        super().__init__(
+            "RoBERTa", "roberta-base", layer, parameters, output_file, max_seq_length
+        )
+
+
+@register_representation(ModalityType.TEXT)
+class DistillBERT(BertFamily):
+    def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+        parameters = {
+            "layer_name": [
+                "cls",
+                "transformer.layer.0",
+                "transformer.layer.1",
+                "transformer.layer.2",
+                "transformer.layer.3",
+                "transformer.layer.4",
+                "transformer.layer.5",
+            ]
+        }
+        super().__init__(
+            "DistillBERT",
+            "distilbert-base-uncased",
+            layer,
+            parameters,
+            output_file,
+            max_seq_length,
+        )
+
+
+@register_representation(ModalityType.TEXT)
+class ALBERT(BertFamily):
+    def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+        parameters = {"layer_name": ["cls", "encoder.albert_layer_groups.0", "pooler"]}
+        super().__init__(
+            "ALBERT", "albert-base-v2", layer, parameters, output_file, max_seq_length
+        )
+
+
+@register_representation(ModalityType.TEXT)
+class ELECTRA(BertFamily):
+    def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+        parameters = {
+            "layer_name": [
+                "cls",
+                "encoder.layer.0",
+                "encoder.layer.1",
+                "encoder.layer.2",
+                "encoder.layer.3",
+                "encoder.layer.4",
+                "encoder.layer.5",
+                "encoder.layer.6",
+                "encoder.layer.7",
+                "encoder.layer.8",
+                "encoder.layer.9",
+                "encoder.layer.10",
+                "encoder.layer.11",
+            ]
+        }
+        super().__init__(
+            "ELECTRA",
+            "google/electra-base-discriminator",
+            layer,
+            parameters,
+            output_file,
+            max_seq_length,
+        )
diff --git a/src/main/python/systemds/scuro/representations/clip.py b/src/main/python/systemds/scuro/representations/clip.py
index 504681f2537..a431e52761c 100644
--- a/src/main/python/systemds/scuro/representations/clip.py
+++ b/src/main/python/systemds/scuro/representations/clip.py
@@ -119,13 +119,20 @@ def __init__(self, output_file=None):
         )
         self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
         self.output_file = output_file
+        self.needs_context = True
+        self.initial_context_length = 55

     def transform(self, modality):
         transformed_modality = TransformedModality(
             modality, self, self.output_modality_type
         )
-        embeddings = self.create_text_embeddings(modality.data, self.model)
+        if isinstance(modality.data[0], list):
+            embeddings = []
+            for d in modality.data:
+                embeddings.append(self.create_text_embeddings(d, self.model))
+        else:
+            embeddings = self.create_text_embeddings(modality.data, self.model)

         if self.output_file is not None:
             save_embeddings(embeddings, self.output_file)
diff --git a/src/main/python/systemds/scuro/representations/elmo.py b/src/main/python/systemds/scuro/representations/elmo.py
new file mode 100644
index 00000000000..ba2a99f8e1d
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/elmo.py
@@ -0,0 +1,154 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from systemds.scuro.utils.torch_dataset import CustomDataset
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.drsearch.operator_registry import register_representation
+import torch.utils.data
+import torch
+import numpy as np
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.utils.static_variables import get_device
+from flair.embeddings import ELMoEmbeddings
+from flair.data import Sentence
+from torch.utils.data import Dataset
+from torch.utils.data import DataLoader
+
+
+class TextDataset(Dataset):
+    def __init__(self, texts):
+
+        self.texts = []
+        if isinstance(texts, list):
+            self.texts = texts
+        else:
+            for text in texts:
+                if text is None:
+                    self.texts.append("")
+                elif isinstance(text, np.ndarray):
+                    self.texts.append(str(text.item()) if text.size == 1 else str(text))
+                elif not isinstance(text, str):
+                    self.texts.append(str(text))
+                else:
+                    self.texts.append(text)
+
+    def __len__(self):
+        return len(self.texts)
+
+    def __getitem__(self, idx):
+        return self.texts[idx]
+
+
+# @register_representation([ModalityType.TEXT])
+class ELMoRepresentation(UnimodalRepresentation):
+    def __init__(
+        self, model_name="elmo-original", layer="mix", pooling="mean", output_file=None
+    ):
+        self.data_type = torch.float32
+        self.model_name = model_name
+        self.layer_name = layer
+        self.pooling = pooling  # "mean", "max", "first", "last", or "all" (no pooling)
+        parameters = self._get_parameters()
+        super().__init__("ELMo", ModalityType.EMBEDDING, parameters)
+
+        self.output_file = output_file
+
+    @property
+    def model_name(self):
+        return self._model_name
+
+    @model_name.setter
+    def model_name(self, model_name):
+        self._model_name = model_name
+
+        if model_name == "elmo-original":
+            self.model = ELMoEmbeddings("original")
+            self.embedding_dim = 1024
+        elif model_name == "elmo-small":
+            self.model = ELMoEmbeddings("small")
+            self.embedding_dim = 256
+        elif model_name == "elmo-medium":
+            self.model = ELMoEmbeddings("medium")
+            self.embedding_dim = 512
+        else:
+            raise NotImplementedError(f"Model {model_name} not supported")
+
+        self.model = self.model.to(get_device())
+
+    def _get_parameters(self):
+        parameters = {
+            "model_name": ["elmo-original", "elmo-small", "elmo-medium"],
+            "layer_name": [
+                "mix",
+                "layer_0",
+                "layer_1",
+                "layer_2",
+            ],
+            "pooling": ["mean", "max", "first", "last", "all"],
+        }
+        return parameters
+
+    def transform(self, modality):
+        transformed_modality = TransformedModality(
+            modality, self, ModalityType.EMBEDDING
+        )
+        dataset = TextDataset(modality.data)
+        dataloader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=None)
+        embeddings = []
+        for batch in dataloader:
+            texts = batch
+            for text in texts:
+                sentence = Sentence(text)
+                self.model.embed(sentence)
+                token_embeddings = []
+                for token in sentence:
+                    if self.layer_name == "mix":
+                        embedding = token.embedding
+                    elif self.layer_name == "layer_0":
+                        embedding = token.get_embedding(self.model.name + "-0")
+                    elif self.layer_name == "layer_1":
+                        embedding = token.get_embedding(self.model.name + "-1")
+                    elif self.layer_name == "layer_2":
+                        embedding = token.get_embedding(self.model.name + "-2")
+                    else:
+                        embedding = token.embedding
+
+                    token_embeddings.append(embedding.cpu().numpy())
+
+                token_embeddings = np.array(token_embeddings)
+
+                if self.pooling == "mean":
+                    sentence_embedding = np.mean(token_embeddings, axis=0)
+                elif self.pooling == "max":
+                    sentence_embedding = np.max(token_embeddings, axis=0)
+                elif self.pooling == "first":
+                    sentence_embedding = token_embeddings[0]
+                elif self.pooling == "last":
+                    sentence_embedding = token_embeddings[-1]
+                elif self.pooling == "all":
+                    sentence_embedding = token_embeddings.flatten()
+                else:
+                    sentence_embedding = np.mean(token_embeddings, axis=0)
+
+                embeddings.append(sentence_embedding.astype(np.float32))
+
+        transformed_modality.data = np.array(embeddings)
+        return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/glove.py b/src/main/python/systemds/scuro/representations/glove.py
index 9076efecfc9..74f487bd79d 100644
--- a/src/main/python/systemds/scuro/representations/glove.py
+++ b/src/main/python/systemds/scuro/representations/glove.py
@@ -18,8 +18,6 @@
 # under the License.
 #
 # -------------------------------------------------------------
+import zipfile
 import numpy as np
 from gensim.utils import tokenize
+from huggingface_hub import hf_hub_download

 from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
@@ -39,11 +41,17 @@ def load_glove_embeddings(file_path):
     return embeddings


-# @register_representation(ModalityType.TEXT)
+@register_representation(ModalityType.TEXT)
 class GloVe(UnimodalRepresentation):
-    def __init__(self, glove_path, output_file=None):
+    def __init__(self, output_file=None):
         super().__init__("GloVe", ModalityType.TEXT)
-        self.glove_path = glove_path
+        file_path = hf_hub_download(
+            repo_id="stanfordnlp/glove", filename="glove.6B.zip"
+        )
+        with zipfile.ZipFile(file_path, "r") as zip_ref:
+            zip_ref.extractall("./glove_extracted")
+
+        self.glove_path = "./glove_extracted/glove.6B.100d.txt"
         self.output_file = output_file

     def transform(self, modality):
@@ -67,6 +75,5 @@ def transform(self, modality):
         if self.output_file is not None:
             save_embeddings(np.array(embeddings), self.output_file)

-        transformed_modality.data_type = np.float32
         transformed_modality.data = np.array(embeddings)
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/text_context.py b/src/main/python/systemds/scuro/representations/text_context.py
new file mode 100644
index 00000000000..b98b90e187f
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/text_context.py
@@ -0,0 +1,221 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import re
+from typing import List, Any
+
+from systemds.scuro.drsearch.operator_registry import register_context_operator
+from systemds.scuro.representations.context import Context
+from systemds.scuro.modality.type import ModalityType
+
+
+def _split_into_words(text: str) -> List[str]:
+    """Split text into words, preserving whitespace structure."""
+    if not text or not isinstance(text, str):
+        return []
+    return text.split()
+
+
+def _split_into_sentences(text: str) -> List[str]:
+    """
+    Split text into sentences using regex.
+    Handles common sentence endings: . ! ?
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])(?=\s*$)"
+    sentences = re.split(sentence_pattern, text.strip())
+
+    sentences = [s.strip() for s in sentences if s.strip()]
+
+    if not sentences:
+        return [text]
+
+    return sentences
+
+
+def _count_words(text: str) -> int:
+    """
+    Count the number of words in a text string.
+    """
+    if not text or not isinstance(text, str):
+        return 0
+    return len(text.split())
+
+
+def _extract_text(instance: Any) -> str:
+    if isinstance(instance, str):
+        text = instance
+    else:
+        text = str(instance)
+
+    if not text or not text.strip():
+        return ""
+    return text
+
+
+@register_context_operator(ModalityType.TEXT)
+class SentenceBoundarySplit(Context):
+    """
+    Splits text at sentence boundaries while respecting maximum word count.
+
+    Parameters:
+        max_words (int): Maximum number of words per chunk (default: 55)
+        min_words (int): Minimum number of words per chunk before splitting (default: 10)
+    """
+
+    def __init__(self, max_words=55, min_words=10):
+        parameters = {
+            "max_words": [40, 50, 55, 60, 70, 250, 300, 350, 400, 450],
+            "min_words": [10, 20, 30],
+        }
+        super().__init__("SentenceBoundarySplit", parameters)
+        self.max_words = int(max_words)
+        self.min_words = max(1, int(min_words))
+
+    def execute(self, modality):
+        """
+        Split each text instance at sentence boundaries, respecting max_words.
+
+        Returns:
+            List of lists, where each inner list contains text chunks (strings)
+        """
+        chunked_data = []
+
+        for instance in modality.data:
+            text = _extract_text(instance)
+            if not text:
+                chunked_data.append([""])
+                continue
+
+            sentences = _split_into_sentences(text)
+
+            if not sentences:
+                chunked_data.append([text])
+                continue
+
+            chunks = []
+            current_chunk = []
+            current_word_count = 0
+
+            for sentence in sentences:
+                sentence_word_count = _count_words(sentence)
+
+                if sentence_word_count > self.max_words:
+                    if current_chunk and current_word_count >= self.min_words:
+                        chunks.append(" ".join(current_chunk))
+                        current_chunk = []
+                        current_word_count = 0
+
+                    words = _split_into_words(sentence)
+                    for i in range(0, len(words), self.max_words):
+                        chunk_words = words[i : i + self.max_words]
+                        chunks.append(" ".join(chunk_words))
+
+                elif current_word_count + sentence_word_count > self.max_words:
+                    if current_chunk and current_word_count >= self.min_words:
+                        chunks.append(" ".join(current_chunk))
+                    current_chunk = [sentence]
+                    current_word_count = sentence_word_count
+                else:
+                    current_chunk.append(sentence)
+                    current_word_count += sentence_word_count
+
+            # Add remaining chunk
+            if current_chunk:
+                chunks.append(" ".join(current_chunk))
+
+            if not chunks:
+                chunks = [text]
+
+            chunked_data.append(chunks)
+
+        return chunked_data
+
+
+@register_context_operator(ModalityType.TEXT)
+class OverlappingSplit(Context):
+    """
+    Splits text with overlapping chunks using a sliding window approach.
+
+    Parameters:
+        max_words (int): Maximum number of words per chunk (default: 55)
+        overlap (float): Percentage of overlapping words between chunks (default: 0.5)
+        stride (int, optional): Step size in words. If None, stride = max_words - overlap_words
+    """
+
+    def __init__(self, max_words=55, overlap=0.5, stride=None):
+        overlap_words = int(max_words * overlap)
+        if stride is None:
+            stride = max_words - overlap_words
+
+        parameters = {
+            "max_words": [40, 55, 70, 250, 300, 350, 400, 450],
+            "overlap": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+            "stride": [10, 15, 20, 30],
+        }
+        super().__init__("OverlappingSplit", parameters)
+        self.max_words = max_words
+        self.overlap = overlap
+        self.stride = stride
+
+    def execute(self, modality):
+        """
+        Split each text instance with overlapping chunks.
+
+        Returns:
+            List of lists, where each inner list contains text chunks (strings)
+        """
+        chunked_data = []
+
+        for instance in modality.data:
+            text = _extract_text(instance)
+            if not text:
+                chunked_data.append([""])
+                continue
+
+            words = _split_into_words(text)
+
+            if len(words) <= self.max_words:
+                chunked_data.append([text])
+                continue
+
+            chunks = []
+
+            # Create overlapping chunks with specified stride
+            for i in range(0, len(words), self.stride):
+                chunk_words = words[i : i + self.max_words]
+                if chunk_words:
+                    chunk_text = " ".join(chunk_words)
+                    chunks.append(chunk_text)
+
+                if i + self.max_words >= len(words):
+                    break
+
+            if not chunks:
+                chunks = [text]
+
+            chunked_data.append(chunks)
+
+        return chunked_data
diff --git a/src/main/python/systemds/scuro/representations/text_context_with_indices.py b/src/main/python/systemds/scuro/representations/text_context_with_indices.py
new file mode 100644
index 00000000000..cc7070306ba
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/text_context_with_indices.py
@@ -0,0 +1,300 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import re
+from typing import List, Any
+
+from systemds.scuro.drsearch.operator_registry import register_context_operator
+from systemds.scuro.representations.context import Context
+from systemds.scuro.modality.type import ModalityType
+
+# TODO: Use this to get indices for text chunks based on different splitting strategies.
+# To use this approach, a different extraction of text chunks is needed in either the TextModality or the Representations.
+
+
+def _split_into_words(text: str) -> List[str]:
+    """Split text into words, preserving whitespace structure."""
+    if not text or not isinstance(text, str):
+        return []
+    return text.split()
+
+
+def _split_into_sentences(text: str) -> List[str]:
+    """
+    Split text into sentences using regex.
+    Handles common sentence endings: . ! ?
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])(?=\s*$)"
+    sentences = re.split(sentence_pattern, text.strip())
+
+    sentences = [s.strip() for s in sentences if s.strip()]
+
+    if not sentences:
+        return [text]
+
+    return sentences
+
+
+def _count_words(text: str) -> int:
+    """
+    Count the number of words in a text string.
+    """
+    if not text or not isinstance(text, str):
+        return 0
+    return len(text.split())
+
+
+def _extract_text(instance: Any) -> str:
+    if isinstance(instance, str):
+        text = instance
+    else:
+        text = str(instance)
+
+    if not text or not text.strip():
+        return ""
+    return text
+
+
+# @register_context_operator(ModalityType.TEXT)
+class WordCountSplitIndices(Context):
+    """
+    Splits text after a fixed number of words.
+
+    Parameters:
+        max_words (int): Maximum number of words per chunk (default: 55)
+        overlap (int): Number of overlapping words between chunks (default: 0)
+    """
+
+    def __init__(self, max_words=55, overlap=0):
+        parameters = {
+            "max_words": [40, 50, 55, 60, 70, 250, 300, 350, 400, 450],
+            "overlap": [0, 10, 20, 30],
+        }
+        super().__init__("WordCountSplit", parameters)
+        self.max_words = int(max_words)
+        self.overlap = max(0, int(overlap))
+
+    def execute(self, modality):
+        """
+        Split each text instance into chunks of max_words words.
+
+        Returns:
+            List of lists of tuples, where each tuple contains the start and end index of a text chunk
+        """
+        chunked_data = []
+
+        for instance in modality.data:
+            text = _extract_text(instance)
+
+            if not text:
+                chunked_data.append([(0, 0)])
+                continue
+
+            words = _split_into_words(text)
+
+            if len(words) <= self.max_words:
+                chunked_data.append([(0, len(text))])
+                continue
+
+            chunks = []
+            stride = self.max_words - self.overlap
+
+            start = 0
+            for i in range(0, len(words), stride):
+                chunk_words = words[i : i + self.max_words]
+                chunk_text = " ".join(chunk_words)
+                chunks.append((start, start + len(chunk_text)))
+                start += len(chunk_text) + 1
+
+                if i + self.max_words >= len(words):
+                    break
+
+            chunked_data.append(chunks)
+
+        return chunked_data
+
+
+@register_context_operator(ModalityType.TEXT)
+class SentenceBoundarySplitIndices(Context):
+    """
+    Splits text at sentence boundaries while respecting maximum word count.
+
+    Parameters:
+        max_words (int): Maximum number of words per chunk (default: 55)
+        min_words (int): Minimum number of words per chunk before splitting (default: 10)
+    """
+
+    def __init__(self, max_words=55, min_words=10, overlap=0.1):
+        parameters = {
+            "max_words": [40, 50, 55, 60, 70, 250, 300, 350, 400, 450],
+            "min_words": [10, 20, 30],
+        }
+        super().__init__("SentenceBoundarySplit", parameters)
+        self.max_words = int(max_words)
+        self.min_words = max(1, int(min_words))
+        self.overlap = overlap
+        self.stride = max(1, int(max_words * (1 - overlap)))
+
+    def execute(self, modality):
+        """
+        Split each text instance at sentence boundaries, respecting max_words.
+
+        Returns:
+            List of lists of tuples, where each tuple contains the start and end index of a text chunk
+        """
+        chunked_data = []
+
+        for instance in modality.data:
+            text = _extract_text(instance)
+            if not text:
+                chunked_data.append([(0, 0)])
+                continue
+
+            sentences = _split_into_sentences(text)
+
+            if not sentences:
+                chunked_data.append([(0, len(text))])
+                continue
+
+            chunks = []
+            current_chunk = None
+            current_word_count = 0
+            start = 0
+            for sentence in sentences:
+                sentence_word_count = _count_words(sentence)
+
+                if sentence_word_count > self.max_words:
+                    if current_chunk and current_word_count >= self.min_words:
+                        chunks.append(current_chunk)
+                        current_chunk = []
+                        current_word_count = 0
+
+                    words = _split_into_words(sentence)
+                    for i in range(0, len(words), self.max_words):
+                        chunk_words = words[i : i + self.max_words]
+                        current_chunk = (
+                            (start, start + len(" ".join(chunk_words)))
+                            if not current_chunk
+                            else (current_chunk[0], start + len(" ".join(chunk_words)))
+                        )
+                        start += len(" ".join(chunk_words)) + 1
+
+                elif current_word_count + sentence_word_count > self.max_words:
+                    if current_chunk and current_word_count >= self.min_words:
+                        chunks.append(current_chunk)
+                    current_chunk = (start, start + len(sentence))
+                    start += len(sentence) + 1
+                    current_word_count = sentence_word_count
+                else:
+                    current_chunk = (
+                        (start, start + len(sentence))
+                        if not current_chunk
+                        else (current_chunk[0], start + len(sentence))
+                    )
+                    start += len(sentence) + 1
+                    current_word_count += sentence_word_count
+
+            # Add remaining chunk
+            if current_chunk:
+                chunks.append(current_chunk)
+
+            if not chunks:
+                chunks = [(0, len(text))]
+
+            chunked_data.append(chunks)
+
+        return chunked_data
+
+
+@register_context_operator(ModalityType.TEXT)
+class OverlappingSplitIndices(Context):
+    """
+    Splits text with overlapping chunks using a sliding window approach.
+
+    Parameters:
+        max_words (int): Maximum number of words per chunk (default: 55)
+        overlap (float): Percentage of overlapping words between chunks (default: 0.5)
+        stride (int, optional): Step size in words. If None, stride = max_words - overlap_words
+    """
+
+    def __init__(self, max_words=55, overlap=0.5, stride=None):
+        overlap_words = int(max_words * overlap)
+        if stride is None:
+            stride = max_words - overlap_words
+
+        parameters = {
+            "max_words": [40, 55, 70, 250, 300, 350, 400, 450],
+            "overlap": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+            "stride": [10, 15, 20, 30],
+        }
+        super().__init__("OverlappingSplit", parameters)
+        self.max_words = max_words
+        self.overlap = overlap
+        self.stride = stride
+
+    def execute(self, modality):
+        """
+        Split each text instance with overlapping chunks.
+
+        Returns:
+            List of lists of tuples, where each tuple contains the start and end index of a text chunk
+        """
+        chunked_data = []
+
+        for instance in modality.data:
+            text = _extract_text(instance)
+            if not text:
+                chunked_data.append([(0, 0)])
+                continue
+
+            words = _split_into_words(text)
+
+            if len(words) <= self.max_words:
+                chunked_data.append([(0, len(text))])
+                continue
+
+            chunks = []
+
+            # Create overlapping chunks with specified stride
+            start = 0
+            for i in range(0, len(words), self.stride):
+                chunk_words = words[i : i + self.max_words]
+                if chunk_words:
+                    chunk_text = " ".join(chunk_words)
+                    chunks.append((start, start + len(chunk_text)))
+                    start += len(chunk_text) - len(
+                        " ".join(chunk_words[self.stride - len(chunk_words) :])
+                    )
+                if i + self.max_words >= len(words):
+                    break
+
+            if not chunks:
+                chunks = [(0, len(text))]
+
+            chunked_data.append(chunks)
+
+        return chunked_data
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py b/src/main/python/systemds/scuro/representations/unimodal.py
index 362888aa278..2bb34733e26 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/representations/unimodal.py
@@ -38,6 +38,8 @@ def __init__(
         if parameters is None:
             parameters = {}
         self.self_contained = self_contained
+        self.needs_context = False
+        self.initial_context_length = None

     @abc.abstractmethod
     def transform(self, data):
diff --git a/src/main/python/systemds/scuro/representations/window_aggregation.py b/src/main/python/systemds/scuro/representations/window_aggregation.py
index f40b28ea871..4d4ec19c5b9 100644
--- a/src/main/python/systemds/scuro/representations/window_aggregation.py
+++ b/src/main/python/systemds/scuro/representations/window_aggregation.py
@@ -59,11 +59,11 @@ def aggregation_function(self, value):
         self._aggregation_function = Aggregation(value)


-@register_context_operator()
+@register_context_operator([ModalityType.TIMESERIES, ModalityType.AUDIO])
 class WindowAggregation(Window):
     def __init__(self, aggregation_function="mean", window_size=10, pad=False):
         super().__init__("WindowAggregation", aggregation_function)
-        self.parameters["window_size"] = [window_size]
+        self.parameters["window_size"] = [5, 10, 15, 25, 50, 100]
         self.window_size = int(window_size)
         self.pad = pad
@@ -167,7 +167,7 @@ def window_aggregate_nested_level(self, instance, new_length):
         return np.array(result)


-@register_context_operator()
+@register_context_operator([ModalityType.TIMESERIES, ModalityType.AUDIO])
 class StaticWindow(Window):
     def __init__(self, aggregation_function="mean", num_windows=100):
         super().__init__("StaticWindow", aggregation_function)
@@ -198,7 +198,7 @@ def execute(self, modality):
         return np.array(windowed_data)


-@register_context_operator()
+@register_context_operator([ModalityType.TIMESERIES, ModalityType.AUDIO])
 class DynamicWindow(Window):
     def __init__(self, aggregation_function="mean", num_windows=100):
         super().__init__("DynamicWindow", aggregation_function)
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index 5bec163fe76..9da0aa82c0a 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -26,6 +26,11 @@
 import random
 import os

+from sklearn import svm
+from sklearn.metrics import classification_report
+from sklearn.model_selection import train_test_split
+
+from systemds.scuro.models.model import Model
 from systemds.scuro.dataloader.base_loader import BaseLoader
 from systemds.scuro.dataloader.video_loader import VideoLoader
 from systemds.scuro.dataloader.audio_loader import AudioLoader
@@ -33,6 +38,7 @@
 from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.drsearch.task import Task


 class TestDataLoader(BaseLoader):
@@ -130,7 +136,7 @@ def create_timeseries_data(self, num_instances, sequence_length, num_features=1)
         }
         return data, metadata

-    def create_text_data(self, num_instances):
+    def create_text_data(self, num_instances, num_sentences_per_instance=1):
         subjects = [
             "The cat",
             "A dog",
@@ -172,18 +178,24 @@
             "precisely",
             "methodically",
         ]
+        punctuation = [".", "?", "!"]

         sentences = []
         for _ in range(num_instances):
-            include_adverb = np.random.random() < 0.7
-
-            subject = np.random.choice(subjects)
-            verb = np.random.choice(verbs)
-            obj = np.random.choice(objects)
-            adverb = np.random.choice(adverbs) if include_adverb else ""
-
-            sentence = f"{subject} {adverb} {verb} {obj}"
-
+            sentence = ""
+            for i in range(num_sentences_per_instance):
+                include_adverb = np.random.random() < 0.7
+
+                subject = np.random.choice(subjects)
+                verb = np.random.choice(verbs)
+                obj = np.random.choice(objects)
+                adverb = np.random.choice(adverbs) if include_adverb else ""
+                punct = np.random.choice(punctuation)
+
+                sentence += " " if i > 0 else ""
+                sentence += f"{subject}"
+                sentence += f" {adverb}" if include_adverb else ""
+                sentence += f" {verb} {obj}{punct}"
             sentences.append(sentence)

         metadata = {
@@ -382,3 +394,57 @@
             audio_data = 0.5 * np.sin(2 * np.pi * frequency * t)

             write(path, sample_rate, audio_data)
+
+
+class TestSVM(Model):
+    def __init__(self, name):
+        super().__init__(name)
+
+    def fit(self, X, y, X_test, y_test):
+        if X.ndim > 2:
+            X = X.reshape(X.shape[0], -1)
+        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
+        self.clf = self.clf.fit(X, np.array(y))
+        y_pred = self.clf.predict(X)
+
+        return {
+            "accuracy": classification_report(
+                y, y_pred, output_dict=True, digits=3, zero_division=1
+            )["accuracy"]
+        }, 0
+
+    def test(self, test_X: np.ndarray, test_y: np.ndarray):
+        if test_X.ndim > 2:
+            test_X = test_X.reshape(test_X.shape[0], -1)
+        y_pred = self.clf.predict(np.array(test_X))  # noqa
+
+        return {
+            "accuracy": classification_report(
+                np.array(test_y), y_pred, output_dict=True, digits=3, zero_division=1
+            )["accuracy"]
+        }, 0
+
+
+class TestTask(Task):
+    def __init__(self, name, model_name, num_instances):
+        self.labels = ModalityRandomDataGenerator().create_balanced_labels(
+            num_instances=num_instances
+        )
+        split = train_test_split(
+            np.array(range(num_instances)),
+            self.labels,
+            test_size=0.2,
+            random_state=42,
+            stratify=self.labels,
+        )
+        self.train_indizes, self.val_indizes = [int(i) for i in split[0]], [
+            int(i) for i in split[1]
+        ]
+
+        super().__init__(
+            name,
+            TestSVM(model_name),
+            self.labels,
+            self.train_indizes,
+            self.val_indizes,
+        )
diff --git a/src/main/python/tests/scuro/test_hp_tuner.py b/src/main/python/tests/scuro/test_hp_tuner.py
index 8484a352e44..f163498dab4 100644
--- a/src/main/python/tests/scuro/test_hp_tuner.py
+++ b/src/main/python/tests/scuro/test_hp_tuner.py
@@ -22,17 +22,12 @@
 import unittest

 import numpy as np
-from sklearn import svm
-from sklearn.metrics import classification_report
-from sklearn.model_selection import train_test_split

 from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
 from systemds.scuro.representations.average import Average
 from systemds.scuro.representations.concatenation import Concatenation
 from systemds.scuro.representations.lstm import LSTM
 from systemds.scuro.drsearch.operator_registry import Registry
-from systemds.scuro.models.model import Model
-from systemds.scuro.drsearch.task import Task
 from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer

 from systemds.scuro.representations.spectrogram import Spectrogram
@@ -45,70 +40,15 @@
 from systemds.scuro.representations.bow import BoW
 from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.representations.resnet import ResNet
-from tests.scuro.data_generator import ModalityRandomDataGenerator, TestDataLoader
+from tests.scuro.data_generator import (
+    ModalityRandomDataGenerator,
+    TestDataLoader,
+    TestTask,
+)
 from systemds.scuro.modality.type import ModalityType

 from systemds.scuro.drsearch.hyperparameter_tuner import HyperparameterTuner


-class TestSVM(Model):
-    def __init__(self):
-        super().__init__("TestSVM")
-
-    def fit(self, X, y, X_test, y_test):
-        if X.ndim > 2:
-            X = X.reshape(X.shape[0], -1)
-        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
-        self.clf = self.clf.fit(X, np.array(y))
-        y_pred = self.clf.predict(X)
-
-        return {
-            "accuracy": classification_report(
-                y, y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-    def test(self, test_X: np.ndarray, test_y: np.ndarray):
-        if test_X.ndim > 2:
-            test_X = test_X.reshape(test_X.shape[0], -1)
-        y_pred = self.clf.predict(np.array(test_X))  # noqa]
-
-        return {
-            "accuracy": classification_report(
-                np.array(test_y), y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-
-class TestSVM2(Model):
-    def __init__(self):
-        super().__init__("TestSVM2")
-
-    def fit(self, X, y, X_test, y_test):
-        if X.ndim > 2:
-            X = X.reshape(X.shape[0], -1)
-        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
-        self.clf = self.clf.fit(X, np.array(y))
-        y_pred = self.clf.predict(X)
-
-        return {
-            "accuracy": classification_report(
-                y, y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-    def test(self, test_X: np.ndarray, test_y: np.ndarray):
-        if test_X.ndim > 2:
-            test_X = test_X.reshape(test_X.shape[0], -1)
-        y_pred = self.clf.predict(np.array(test_X))  # noqa
-
-        return {
-            "accuracy": classification_report(
-                np.array(test_y), y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-
 from unittest.mock import patch


@@ -120,36 +60,10 @@ class TestHPTuner(unittest.TestCase):
     def setUpClass(cls):
         cls.num_instances = 10
         cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
-        cls.labels = ModalityRandomDataGenerator().create_balanced_labels(
-            num_instances=cls.num_instances
-        )
         cls.indices = np.array(range(cls.num_instances))
-
-        split = train_test_split(
-            cls.indices,
-            cls.labels,
-            test_size=0.2,
-            random_state=42,
-        )
-        cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [
-            int(i) for i in split[1]
-        ]
-
         cls.tasks = [
-            Task(
-                "UnimodalRepresentationTask1",
-                TestSVM(),
-                cls.labels,
-                cls.train_indizes,
-                cls.val_indizes,
-            ),
-            Task(
-                "UnimodalRepresentationTask2",
-                TestSVM2(),
-                cls.labels,
-                cls.train_indizes,
-                cls.val_indizes,
-            ),
+            TestTask("UnimodalRepresentationTask1", "TestSVM1", cls.num_instances),
+            TestTask("UnimodalRepresentationTask2", "TestSVM2", cls.num_instances),
         ]

     def test_hp_tuner_for_audio_modality(self):
diff --git a/src/main/python/tests/scuro/test_multimodal_fusion.py b/src/main/python/tests/scuro/test_multimodal_fusion.py
index f98824a16a5..e89843afcd7 100644
--- a/src/main/python/tests/scuro/test_multimodal_fusion.py
+++ b/src/main/python/tests/scuro/test_multimodal_fusion.py
@@ -22,8 +22,6 @@
 import unittest

 import numpy as np
-from sklearn import svm
-from sklearn.metrics import classification_report
 from sklearn.model_selection import train_test_split

 from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
@@ -32,7 +30,6 @@
 from systemds.scuro.representations.lstm import LSTM
 from systemds.scuro.representations.average import Average
 from systemds.scuro.drsearch.operator_registry import Registry
-from systemds.scuro.models.model import Model
 from systemds.scuro.drsearch.task import Task

 from systemds.scuro.representations.spectrogram import Spectrogram
@@ -43,70 +40,13 @@
 from tests.scuro.data_generator import (
     TestDataLoader,
     ModalityRandomDataGenerator,
+    TestTask,
 )
 from systemds.scuro.modality.type import ModalityType

 from unittest.mock import patch


-class TestSVM(Model):
-    def __init__(self):
-        super().__init__("TestSVM")
-
-    def fit(self, X, y, X_test, y_test):
-        if X.ndim > 2:
-            X = X.reshape(X.shape[0], -1)
-        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
-        self.clf = self.clf.fit(X, np.array(y))
-        y_pred = self.clf.predict(X)
-
-        return {
-            "accuracy": classification_report(
-                y, y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-    def test(self, test_X: np.ndarray, test_y: np.ndarray):
-        if test_X.ndim > 2:
-            test_X = test_X.reshape(test_X.shape[0], -1)
-        y_pred = self.clf.predict(np.array(test_X))  # noqa
-
-        return {
-            "accuracy": classification_report(
-                np.array(test_y), y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-
-class TestCNN(Model):
-    def __init__(self):
-        super().__init__("TestCNN")
-
-    def fit(self, X, y, X_test, y_test):
-        if X.ndim > 2:
-            X = X.reshape(X.shape[0], -1)
-        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
-        self.clf = self.clf.fit(X, np.array(y))
-        y_pred = self.clf.predict(X)
-
-        return {
-            "accuracy": classification_report(
-                y, y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-    def test(self, test_X: np.ndarray, test_y: np.ndarray):
-        if test_X.ndim > 2:
-            test_X = test_X.reshape(test_X.shape[0], -1)
-        y_pred = self.clf.predict(np.array(test_X))  # noqa
-
-        return {
-            "accuracy": classification_report(
-                np.array(test_y), y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-
 class TestMultimodalRepresentationOptimizer(unittest.TestCase):
     test_file_path = None
     data_generator = None
@@ -116,30 +56,10 @@ def setUpClass(cls):
         cls.num_instances = 10
         cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
-        cls.labels = ModalityRandomDataGenerator().create_balanced_labels(
-            num_instances=cls.num_instances
-        )
         cls.indices = np.array(range(cls.num_instances))

-        split = train_test_split(
-            cls.indices,
-            cls.labels,
-            test_size=0.2,
-            random_state=42,
-            stratify=cls.labels,
-        )
-        cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [
-            int(i) for i in split[1]
-        ]
-
     def test_multimodal_fusion(self):
-        task = Task(
-            "MM_Fusion_Task1",
-            TestSVM(),
-            self.labels,
-            self.train_indizes,
-            self.val_indizes,
-        )
+        task = TestTask("MM_Fusion_Task1", "Test1", self.num_instances)

         audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
             self.num_instances, 1000
@@ -199,13 +119,7 @@
         )

     def test_parallel_multimodal_fusion(self):
-        task = Task(
-            "MM_Fusion_Task1",
-            TestSVM(),
-            self.labels,
-            self.train_indizes,
-            self.val_indizes,
-        )
+        task = TestTask("MM_Fusion_Task1", "Test2", self.num_instances)

         audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
             self.num_instances, 1000
diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py
index c33eb5fcc2b..2edada07396 100644
--- a/src/main/python/tests/scuro/test_operator_registry.py
+++ b/src/main/python/tests/scuro/test_operator_registry.py
@@ -21,7 +21,14 @@

 import unittest

-from systemds.scuro import FrequencyMagnitude
+from systemds.scuro.representations.text_context import (
+    SentenceBoundarySplit,
+    OverlappingSplit,
+)
+from systemds.scuro.representations.text_context_with_indices import (
+    SentenceBoundarySplitIndices,
+    OverlappingSplitIndices,
+)
 from systemds.scuro.representations.covarep_audio_features import (
     ZeroCrossing,
     Spectral,
@@ -124,11 +131,17 @@ def test_text_representations_in_registry(self):

     def test_context_operator_in_registry(self):
         registry = Registry()
-        assert registry.get_context_operators() == [
+        assert registry.get_context_operators(ModalityType.TIMESERIES) == [
             WindowAggregation,
             StaticWindow,
             DynamicWindow,
         ]
+        assert registry.get_context_operators(ModalityType.TEXT) == [
+            SentenceBoundarySplit,
+            OverlappingSplit,
+            SentenceBoundarySplitIndices,
+            OverlappingSplitIndices,
+        ]

     # def test_fusion_operator_in_registry(self):
     #     registry = Registry()
diff --git a/src/main/python/tests/scuro/test_text_context_operators.py b/src/main/python/tests/scuro/test_text_context_operators.py
new file mode 100644
index 00000000000..1f041654076
--- /dev/null
+++ b/src/main/python/tests/scuro/test_text_context_operators.py
@@ -0,0 +1,113 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+
+import unittest
+from systemds.scuro.representations.text_context import (
+    SentenceBoundarySplit,
+    OverlappingSplit,
+)
+from systemds.scuro.representations.text_context_with_indices import (
+    SentenceBoundarySplitIndices,
+    OverlappingSplitIndices,
+)
+from tests.scuro.data_generator import (
+    ModalityRandomDataGenerator,
+    TestDataLoader,
+    TestTask,
+)
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
+from systemds.scuro.modality.type import ModalityType
+
+
+class TestTextContextOperator(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.data_generator = ModalityRandomDataGenerator()
+        cls.data, cls.md = cls.data_generator.create_text_data(10, 50)
+        cls.text_modality = UnimodalModality(
+            TestDataLoader(
+                [i for i in range(0, 10)],
+                None,
+                ModalityType.TEXT,
+                cls.data,
+                str,
+                cls.md,
+            )
+        )
+        cls.text_modality.extract_raw_data()
+        cls.task = TestTask("TextContextTask", "Test1", 10)
+
+    def test_sentence_boundary_split(self):
+        sentence_boundary_split = SentenceBoundarySplit(10, min_words=4)
+        chunks = sentence_boundary_split.execute(self.text_modality)
+        for i in range(0, len(chunks)):
+            for chunk in chunks[i]:
+                assert len(chunk.split(" ")) <= 10 and (
+                    chunk[-1] == "." or chunk[-1] == "!" or chunk[-1] == "?"
+                )
+
+    def test_overlapping_split(self):
+        overlapping_split = OverlappingSplit(40, 0.05)
+        chunks = overlapping_split.execute(self.text_modality)
+        for i in range(len(chunks)):
+            prev_chunk = ""
+            for j, chunk in enumerate(chunks[i]):
+                if j > 0:
+                    prev_words = prev_chunk.split(" ")
+                    curr_words = chunk.split(" ")
+                    assert prev_words[-2:] == curr_words[:2]
+                prev_chunk = chunk
+                assert len(chunk.split(" ")) <= 40
+
+    def test_sentence_boundary_split_indices(self):
+        sentence_boundary_split = SentenceBoundarySplitIndices(10, min_words=4)
+        chunks = sentence_boundary_split.execute(self.text_modality)
+        for i in range(0, len(chunks)):
+            for chunk in chunks[i]:
+                text = self.text_modality.data[i][chunk[0] : chunk[1]].split(" ")
+                assert len(text) <= 10 and (
+                    text[-1][-1] == "." or text[-1][-1] == "!" or text[-1][-1] == "?"
+                )
+
+    def test_overlapping_split_indices(self):
+        overlapping_split = OverlappingSplitIndices(40, 0.1)
+        chunks = overlapping_split.execute(self.text_modality)
+        for i in range(len(chunks)):
+            prev_chunk = (0, 0)
+            for j, chunk in enumerate(chunks[i]):
+                if j > 0:
+                    prev_words = self.text_modality.data[i][
+                        prev_chunk[0] : prev_chunk[1]
+                    ].split(" ")
+                    curr_words = self.text_modality.data[i][chunk[0] : chunk[1]].split(
+                        " "
+                    )
+                    assert prev_words[-4:] == curr_words[:4]
+                prev_chunk = chunk
+                assert (
+                    len(self.text_modality.data[i][chunk[0] : chunk[1]].split(" "))
+                    <= 40
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py
index ca54ee64b1d..0d8ae901778 100644
--- a/src/main/python/tests/scuro/test_unimodal_optimizer.py
+++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py
@@ -23,17 +23,11 @@
 import unittest

 import numpy as np
-from sklearn import svm
-from sklearn.metrics import classification_report
-from sklearn.model_selection import train_test_split
-
 from systemds.scuro.representations.timeseries_representations import (
     Mean,
     ACF,
 )
 from systemds.scuro.drsearch.operator_registry import Registry
-from systemds.scuro.models.model import Model
-from systemds.scuro.drsearch.task import Task
 from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer

 from systemds.scuro.representations.spectrogram import Spectrogram
@@ -44,69 +38,14 @@
 from systemds.scuro.representations.bow import BoW
 from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.representations.resnet import ResNet
-from tests.scuro.data_generator import ModalityRandomDataGenerator, TestDataLoader
+from tests.scuro.data_generator import (
+    ModalityRandomDataGenerator,
+    TestDataLoader,
+    TestTask,
+)
 from systemds.scuro.modality.type import ModalityType


-class TestSVM(Model):
-    def __init__(self):
-        super().__init__("TestSVM")
-
-    def fit(self, X, y, X_test, y_test):
-        if X.ndim > 2:
-            X = X.reshape(X.shape[0], -1)
-        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
-        self.clf = self.clf.fit(X, np.array(y))
-        y_pred = self.clf.predict(X)
-
-        return {
-            "accuracy": classification_report(
-                y, y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-    def test(self, test_X: np.ndarray, test_y: np.ndarray):
-        if test_X.ndim > 2:
-            test_X = test_X.reshape(test_X.shape[0], -1)
-        y_pred = self.clf.predict(np.array(test_X))  # noqa
-
-        return {
-            "accuracy": classification_report(
-                np.array(test_y), y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-
-class TestCNN(Model):
-    def __init__(self):
-        super().__init__("TestCNN")
-
-    def fit(self, X, y, X_test, y_test):
-        if X.ndim > 2:
-            X = X.reshape(X.shape[0], -1)
-        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
-        self.clf = self.clf.fit(X, np.array(y))
-        y_pred = self.clf.predict(X)
-
-        return {
-            "accuracy": classification_report(
-                y, y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-    def test(self, test_X: np.ndarray, test_y: np.ndarray):
-        if test_X.ndim > 2:
-            test_X = test_X.reshape(test_X.shape[0], -1)
-        y_pred = self.clf.predict(np.array(test_X))  # noqa
-
-        return {
-            "accuracy": classification_report(
-                np.array(test_y), y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-
 from unittest.mock import patch

@@ -118,36 +57,12 @@ class TestUnimodalRepresentationOptimizer(unittest.TestCase):
     def setUpClass(cls):
         cls.num_instances = 10
         cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
-        cls.labels = ModalityRandomDataGenerator().create_balanced_labels(
-            num_instances=cls.num_instances
-        )
-        cls.indices = np.array(range(cls.num_instances))
-        split = train_test_split(
-            cls.indices,
-            cls.labels,
-            test_size=0.2,
-            random_state=42,
-        )
-        cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [
-            int(i) for i in split[1]
-        ]
+        cls.indices = np.array(range(cls.num_instances))

         cls.tasks = [
-            Task(
-                "UnimodalRepresentationTask1",
-                TestSVM(),
-                cls.labels,
-                cls.train_indizes,
-                cls.val_indizes,
-            ),
-            Task(
-                "UnimodalRepresentationTask2",
-                TestCNN(),
-                cls.labels,
-                cls.train_indizes,
-                cls.val_indizes,
-            ),
+            TestTask("UnimodalRepresentationTask1", "Test1", cls.num_instances),
+            TestTask("UnimodalRepresentationTask2", "Test2", cls.num_instances),
         ]

     def test_unimodal_optimizer_for_audio_modality(self):
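---
Usage sketch (reviewer note below the `---` separator, ignored by `git apply`): how the
per-modality context-operator registry and the new text splitters introduced above fit
together, assuming a scuro build with this patch applied. The `SimpleNamespace` stand-in
is an illustrative shortcut, not the real API; the actual entry point is a text
`UnimodalModality`, but the context operators only read `modality.data`.

from types import SimpleNamespace

from systemds.scuro.drsearch.operator_registry import Registry
from systemds.scuro.modality.type import ModalityType
from systemds.scuro.representations.text_context import SentenceBoundarySplit

# Context operators are now keyed by modality type instead of one global list.
text_ops = Registry().get_context_operators(ModalityType.TEXT)
print([op.__name__ for op in text_ops])
# e.g. ['SentenceBoundarySplit', 'OverlappingSplit',
#       'SentenceBoundarySplitIndices', 'OverlappingSplitIndices']

# A stand-in object is enough here because execute() only touches `.data`.
modality = SimpleNamespace(
    data=["The cat quietly chased a ball. A dog slowly carried the stick."]
)
splitter = SentenceBoundarySplit(max_words=8, min_words=2)
print(splitter.execute(modality))
# -> one list of chunks per instance, split at sentence boundaries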
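A second sketch, reduced to plain transformers/torch so it runs without scuro: the
forward-hook mechanism that BertFamily.transform relies on to read an intermediate
layer. One caveat worth hedging: BERT encoder layers return a tuple whose first
element is the hidden state, while modules such as the pooler return a bare tensor,
so the hook below unwraps tuples before detaching (a hook that calls
output.detach() directly only works for the tensor-returning modules).

import torch
from transformers import AutoModel, AutoTokenizer

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

captured = {}

def make_hook(name):
    def hook(module, inputs, output):
        # Encoder layers yield a tuple; pooler-style modules yield a tensor.
        tensor = output[0] if isinstance(output, tuple) else output
        captured[name] = tensor.detach().cpu()
    return hook

layer_name = "encoder.layer.11"  # one entry from Bert's layer_name grid above
for name, module in model.named_modules():
    if name == layer_name:
        module.register_forward_hook(make_hook(name))
        break

inputs = tokenizer(
    ["The cat quietly chased a ball."],
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)
with torch.no_grad():
    model(**inputs)

print(captured[layer_name].shape)  # (batch, seq_len, hidden_size)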