Thanks for sharing the data! I found some annoying errors that might be caused by the data.
It raises the error below. It seems that the file `data/processed/bee/cmu0000025765.h5` does not contain the `text/bert` part. On further investigation, I found that many of the h5 files in `bee` are missing their `text/bert` part.
G:\PATS\pats\data\dataUtils.py:134: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
self.df = self.df.append(pd.read_csv((Path(self.path2data)/'cmu_intervals_df_transforms.csv').as_posix())) ## file with evil twins
0%| | 0/576 [00:00<?, ?it/s]
data/processed/bee/cmu0000025765.h5 text/bert
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File G:\PATS\pats\data\dataUtils.py:485, in MiniData.__init__(self, path2h5, modalities, fs_new, time, modality_classes, window_hop, style, repeat_text, text_in_modalities, filler, **kwargs)
484 try:
--> 485 data, h5 = self.load(self.path2h5, modality)
486 except:
File G:\PATS\pats\data\common.py:39, in HDF5.load(filename, key)
36 # if key not in h5:
37 # print(f"{key} not in P{filename}!")
38 # return [], h5
---> 39 data = h5[key]
40 return data, h5
File h5py\_objects.pyx:54, in h5py._objects.with_phil.wrapper()
File h5py\_objects.pyx:55, in h5py._objects.with_phil.wrapper()
File ~\anaconda3\envs\pats\lib\site-packages\h5py\_hl\group.py:305, in Group.__getitem__(self, name)
304 elif isinstance(name, (bytes, str)):
--> 305 oid = h5o.open(self.id, self._e(name), lapl=self._lapl)
306 else:
File h5py\_objects.pyx:54, in h5py._objects.with_phil.wrapper()
File h5py\_objects.pyx:55, in h5py._objects.with_phil.wrapper()
File h5py\h5o.pyx:190, in h5py.h5o.open()
KeyError: 'Unable to open object (component not found)'
During handling of the above exception, another exception occurred:
SystemExit Traceback (most recent call last)
[... skipping hidden 1 frame]
Input In [3], in <cell line: 1>()
----> 1 data = Data(**common_kwargs)
File G:\PATS\pats\data\dataUtils.py:155, in Data.__init__(self, path2data, speaker, modalities, fs_new, time, split, batch_size, shuffle, num_workers, window_hop, load_data, style_iters, num_training_sample, sample_all_styles, repeat_text, quantile_sample, quantile_num_training_sample, weighted, filler, num_training_iters)
153 #if self.load_data:
154 ## get train-dev-test split
--> 155 self.datasets = self.tdt_split()
156 self.dataLoader_kwargs = {'batch_size':batch_size,
157 'shuffle':shuffle,
158 'num_workers':num_workers,
159 'pin_memory':False}
File G:\PATS\pats\data\dataUtils.py:292, in Data.tdt_split(self)
290 self.test_intervals = test_intervals
--> 292 dataset_train = ConcatDatasetIndex(self.get_minidata_list(train_intervals))
293 dataset_dev = ConcatDatasetIndex(self.get_minidata_list(dev_intervals))
File G:\PATS\pats\data\dataUtils.py:247, in Data.get_minidata_list(self, intervals)
246 def get_minidata_list(self, intervals):
--> 247 return [MiniData(self.getPath2file(interval_id), style=self.getStyle(interval_id), **self.minidataKwargs)
248 for interval_id in tqdm(intervals)]
File G:\PATS\pats\data\dataUtils.py:247, in <listcomp>(.0)
246 def get_minidata_list(self, intervals):
--> 247 return [MiniData(self.getPath2file(interval_id), style=self.getStyle(interval_id), **self.minidataKwargs)
248 for interval_id in tqdm(intervals)]
File G:\PATS\pats\data\dataUtils.py:491, in MiniData.__init__(self, path2h5, modalities, fs_new, time, modality_classes, window_hop, style, repeat_text, text_in_modalities, filler, **kwargs)
488 # print(f"{modality} not in {self.path2h5}")
489 # return
--> 491 sys.exit(1)
493 self.shapes.append(data.shape)
SystemExit: 1
During handling of the above exception, another exception occurred:
AssertionError Traceback (most recent call last)
[... skipping hidden 1 frame]
File ~\anaconda3\envs\pats\lib\site-packages\IPython\core\interactiveshell.py:1972, in InteractiveShell.showtraceback(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)
1969 if exception_only:
1970 stb = ['An exception has occurred, use %tb to see '
1971 'the full traceback.\n']
-> 1972 stb.extend(self.InteractiveTB.get_exception_only(etype,
1973 value))
1974 else:
1975 try:
1976 # Exception classes can customise their traceback - we
1977 # use this in IPython.parallel for exceptions occurring
1978 # in the engines. This should return a list of strings.
File ~\anaconda3\envs\pats\lib\site-packages\IPython\core\ultratb.py:578, in ListTB.get_exception_only(self, etype, value)
570 def get_exception_only(self, etype, value):
571 """Only print the exception type and message, without a traceback.
572
573 Parameters
(...)
576 value : exception value
577 """
--> 578 return ListTB.structured_traceback(self, etype, value)
File ~\anaconda3\envs\pats\lib\site-packages\IPython\core\ultratb.py:436, in ListTB.structured_traceback(self, etype, evalue, etb, tb_offset, context)
433 chained_exc_ids.add(id(exception[1]))
434 chained_exceptions_tb_offset = 0
435 out_list = (
--> 436 self.structured_traceback(
437 etype, evalue, (etb, chained_exc_ids),
438 chained_exceptions_tb_offset, context)
439 + chained_exception_message
440 + out_list)
442 return out_list
File ~\anaconda3\envs\pats\lib\site-packages\IPython\core\ultratb.py:1105, in AutoFormattedTB.structured_traceback(self, etype, value, tb, tb_offset, number_of_lines_of_context)
1103 else:
1104 self.tb = tb
-> 1105 return FormattedTB.structured_traceback(
1106 self, etype, value, tb, tb_offset, number_of_lines_of_context)
File ~\anaconda3\envs\pats\lib\site-packages\IPython\core\ultratb.py:999, in FormattedTB.structured_traceback(self, etype, value, tb, tb_offset, number_of_lines_of_context)
996 mode = self.mode
997 if mode in self.verbose_modes:
998 # Verbose modes need a full traceback
--> 999 return VerboseTB.structured_traceback(
1000 self, etype, value, tb, tb_offset, number_of_lines_of_context
1001 )
1002 elif mode == 'Minimal':
1003 return ListTB.get_exception_only(self, etype, value)
File ~\anaconda3\envs\pats\lib\site-packages\IPython\core\ultratb.py:852, in VerboseTB.structured_traceback(self, etype, evalue, etb, tb_offset, number_of_lines_of_context)
850 """Return a nice text document describing the traceback."""
851 assert etb is not None
--> 852 formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,
853 tb_offset)
855 colors = self.Colors # just a shorthand + quicker name lookup
856 colorsnormal = colors.Normal # used a lot
File ~\anaconda3\envs\pats\lib\site-packages\IPython\core\ultratb.py:786, in VerboseTB.format_exception_as_a_whole(self, etype, evalue, etb, number_of_lines_of_context, tb_offset)
784 assert isinstance(tb_offset, int)
785 head = self.prepare_header(etype, self.long_header)
--> 786 records = self.get_records(etb, number_of_lines_of_context, tb_offset)
788 frames = []
789 skipped = 0
File ~\anaconda3\envs\pats\lib\site-packages\IPython\core\ultratb.py:840, in VerboseTB.get_records(self, etb, number_of_lines_of_context, tb_offset)
834 options = stack_data.Options(
835 before=before,
836 after=after,
837 pygments_formatter=formatter,
838 )
839 assert etb is not None
--> 840 return list(stack_data.FrameInfo.stack_data(etb, options=options))[tb_offset:]
File ~\anaconda3\envs\pats\lib\site-packages\stack_data\core.py:546, in FrameInfo.stack_data(cls, frame_or_tb, options, collapse_repeated_frames)
530 @classmethod
531 def stack_data(
532 cls,
(...)
536 collapse_repeated_frames: bool = True
537 ) -> Iterator[Union['FrameInfo', RepeatedFrames]]:
538 """
539 An iterator of FrameInfo and RepeatedFrames objects representing
540 a full traceback or stack. Similar consecutive frames are collapsed into RepeatedFrames
(...)
544 and optionally an Options object to configure.
545 """
--> 546 stack = list(iter_stack(frame_or_tb))
548 # Reverse the stack from a frame so that it's in the same order
549 # as the order from a traceback, which is the order of a printed
550 # traceback when read top to bottom (most recent call last)
551 if is_frame(frame_or_tb):
File ~\anaconda3\envs\pats\lib\site-packages\stack_data\utils.py:98, in iter_stack(frame_or_tb)
96 while frame_or_tb:
97 yield frame_or_tb
---> 98 if is_frame(frame_or_tb):
99 frame_or_tb = frame_or_tb.f_back
100 else:
File ~\anaconda3\envs\pats\lib\site-packages\stack_data\utils.py:91, in is_frame(frame_or_tb)
90 def is_frame(frame_or_tb: Union[FrameType, TracebackType]) -> bool:
---> 91 assert_(isinstance(frame_or_tb, (types.FrameType, types.TracebackType)))
92 return isinstance(frame_or_tb, (types.FrameType,))
File ~\anaconda3\envs\pats\lib\site-packages\stack_data\utils.py:172, in assert_(condition, error)
170 if isinstance(error, str):
171 error = AssertionError(error)
--> 172 raise error
AssertionError:
Thanks for sharing the data! I found some annoying errors that might be caused by the data.
When I run this:
It raises the error below. It seems that the file
`data/processed/bee/cmu0000025765.h5` does not contain the `text/bert` part. On further investigation, I found that many of the h5 files in `bee` are missing their `text/bert` part.