diff --git a/docs/special_characters.py b/docs/special_characters.py new file mode 100644 index 0000000..6abb8f1 --- /dev/null +++ b/docs/special_characters.py @@ -0,0 +1,8 @@ +"module with a list of special characters" +import typing as t +# Special characters encounter in sample sheet +special_characters_string: str = "À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ" +special_characters_list: t.List[str] = list(set(special_characters_string)) +special_characters_list.remove(" ") +special_characters_string: str = ("").join(special_characters_list) + diff --git a/sample_sheet/__init__.py b/sample_sheet/__init__.py index 7a322b5..6d25620 100644 --- a/sample_sheet/__init__.py +++ b/sample_sheet/__init__.py @@ -6,6 +6,9 @@ import sys import warnings +# special character import from special_character file +from docs.special_characters import special_characters + from contextlib import ExitStack from itertools import chain, repeat, islice from pathlib import Path @@ -60,7 +63,7 @@ # https://www.illumina.com/content/dam/illumina-marketing/ # documents/products/technotes/ # sequencing-sheet-format-specifications-technical-note-970-2017-004.pdf -VALID_ASCII: Set[str] = set(ascii_letters + digits + punctuation + ' \n\r') +VALID_ASCII: Set[str] = set(ascii_letters + digits + punctuation + ' \n\r' + special_characters) class ReadStructure(object): @@ -397,7 +400,7 @@ class SampleSheet(object): """ - _encoding: str = 'utf8' + _encoding: str = 'ISO-8859-1' _section_header_re = re.compile(r'\[(.*)\]') _whitespace_re = re.compile(r'\s+') @@ -485,8 +488,10 @@ def _parse(self, handle: TextIO) -> None: section_name: str = '' sample_header: Optional[List[str]] = None - lines = list(csv.reader(handle, skipinitialspace=True)) - + with open(path, encoding=self._encoding) as handle: + lines = list(csv.reader(handle, skipinitialspace=True)) + + specChrLineNum = [] for i, line in enumerate(lines): # Skip to next line if this line is empty to support formats of # sample sheets with multiple newlines as section seperators. @@ -501,46 +506,70 @@ def _parse(self, handle: TextIO) -> None: character not in VALID_ASCII for character in set(''.join(line)) ): - raise ValueError( - f'Sample sheet contains invalid characters on line ' - f'{i + 1}: {"".join(line)}' - ) - - header_match = self._section_header_re.match(line[0]) - - # If we enter a section save it's name and continue to next line. - if header_match: - section_name, *_ = header_match.groups() - if ( - section_name not in self._sections - and section_name not in REQUIRED_SECTIONS + specChrLineNum.append(f'{i+1}') + + if len(specChrLineNum) != 0 : + raise ValueError( + f'Sample sheet contains invalid characters on line ' + f'{(", ").join(specChrLineNum)}' + ) + else: + + for i, line in enumerate(lines): + # Skip to next line if this line is empty to support formats of + # sample sheets with multiple newlines as section seperators. + # + # https://github.com/clintval/sample-sheet/issues/46 + # + if not ''.join(line).strip(): + continue + + # Raise exception if we encounter invalid characters. + if any( + character not in VALID_ASCII + for character in set(''.join(line)) ): - self.add_section(section_name) - continue - - # [Reads] - vertical list of integers. - if section_name == 'Reads': - self.Reads.append(int(line[0])) - continue - - # [Data] - delimited data with the first line a header. - elif section_name == 'Data': - if sample_header is not None: - self.add_sample(Sample(dict(zip(sample_header, line)))) - elif any(key == '' for key in line): raise ValueError( - f'Header for [Data] section is not allowed to ' - f'have empty fields: {line}' + f'Sample sheet contains invalid characters on line ' + f'{i + 1}: {"".join(line)}' ) - else: - sample_header = line - continue - # [] - keys in first column and values in second column. - elif len(line) >= 2: - key, value = (line[0], line[1]) - section: Section = getattr(self, section_name) - section[key] = value + + header_match = self._section_header_re.match(line[0]) + + # If we enter a section save it's name and continue to next line. + if header_match: + section_name, *_ = header_match.groups() + if ( + section_name not in self._sections + and section_name not in REQUIRED_SECTIONS + ): + self.add_section(section_name) + continue + + # [Reads] - vertical list of integers. + if section_name == 'Reads': + self.Reads.append(int(line[0])) + continue + + # [Data] - delimited data with the first line a header. + elif section_name == 'Data': + if sample_header is not None: + self.add_sample(Sample(dict(zip(sample_header, line)))) + elif any(key == '' for key in line): + raise ValueError( + f'Header for [Data] section is not allowed to ' + f'have empty fields: {line}' + ) + else: + sample_header = line + continue + + # [] - keys in first column and values in second column. + elif len(line) >= 2: + key, value = (line[0], line[1]) + section: Section = getattr(self, section_name) + section[key] = value def add_sample(self, sample: Sample) -> None: """Add a :class:`Sample` to this :class:`SampleSheet`. @@ -706,7 +735,7 @@ def to_json(self, **kwargs: Mapping) -> str: 'Data': [sample.to_json() for sample in self.samples], **{title: dict(getattr(self, title)) for title in self._sections}, } - return json.dumps(content, **kwargs) # type: ignore + return json.dumps(content, **kwargs), content # type: ignore def to_picard_basecalling_params( self,