From b43a04e05de1291bf5777139a5335517c9dbc143 Mon Sep 17 00:00:00 2001 From: Luiz Henrique Pereira Niero Date: Mon, 31 Jan 2022 12:43:49 -0300 Subject: [PATCH] Adapting preProcessing for some models of HF Some models of HuggingFace, like 'pucpr/bioBERTpt-squad-v1.1-portuguese', use 'None' as self.pad_token. It is a problem because return expects a vector of numbers and receives a vector with numbers (for existing words) and None (for padding words). This fixes it. --- src/NERDA/preprocessing.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/NERDA/preprocessing.py b/src/NERDA/preprocessing.py index 4dd7dae..4d481ae 100644 --- a/src/NERDA/preprocessing.py +++ b/src/NERDA/preprocessing.py @@ -99,6 +99,10 @@ def __getitem__(self, item): # compute padding length if self.pad_sequences: padding_len = self.max_len - len(input_ids) + if self.pad_token_id == None: + input_ids = input_ids + ([0] * padding_len) + else: + input_ids = input_ids + ([self.pad_token_id] * padding_len) input_ids = input_ids + ([self.pad_token_id] * padding_len) masks = masks + ([0] * padding_len) offsets = offsets + ([0] * padding_len)