From 42fd53254652efb86561aa2fef35ae145ae2c439 Mon Sep 17 00:00:00 2001 From: Gaurav Srivastava Date: Tue, 23 Nov 2021 14:06:58 +0530 Subject: [PATCH 1/2] (66) special characters imported from the special_characters file and added to VALID_ASCII. (403) _encoding changed from utf8 to ISO-8859-1. (489-512) added a code check for special characters not currently present in VALID_ASCII. (733) to_json also returns the content variable, which is the JSON in its real (object) form; the first returned value is the JSON in str format. Issue1: special character. Fixes #112 --- docs/special_characters.py | 7 +++ sample_sheet/__init__.py | 108 +++++++++++++++++++++++-------------- 2 files changed, 75 insertions(+), 40 deletions(-) create mode 100644 docs/special_characters.py diff --git a/docs/special_characters.py b/docs/special_characters.py new file mode 100644 index 0000000..b82995e --- /dev/null +++ b/docs/special_characters.py @@ -0,0 +1,7 @@ + +# Special characters encounter in sample sheet +special_characters = 'À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ' +special_characters = list(set(special_characters)) +special_characters.remove(" ") +special_characters = ("").join(special_characters) + diff --git a/sample_sheet/__init__.py b/sample_sheet/__init__.py index 18ec4c9..e2eec81 100644 --- a/sample_sheet/__init__.py +++ b/sample_sheet/__init__.py @@ -6,6 +6,9 @@ import sys import warnings +# special character import from special_character file +from docs.special_characters import special_characters + from contextlib import ExitStack from itertools import chain, repeat, islice from pathlib import Path @@ -60,7 +63,7 @@ # https://www.illumina.com/content/dam/illumina-marketing/ # documents/products/technotes/ # sequencing-sheet-format-specifications-technical-note-970-2017-004.pdf -VALID_ASCII: Set[str] = set(ascii_letters + digits + punctuation + ' 
\n\r' + special_characters) class ReadStructure(object): @@ -397,7 +400,7 @@ class SampleSheet(object): """ - _encoding: str = 'utf8' + _encoding: str = 'ISO-8859-1' _section_header_re = re.compile(r'\[(.*)\]') _whitespace_re = re.compile(r'\s+') @@ -482,7 +485,8 @@ def _parse(self, path: Union[Path, str]) -> None: with open(path, encoding=self._encoding) as handle: lines = list(csv.reader(handle, skipinitialspace=True)) - + + specChrLineNum = [] for i, line in enumerate(lines): # Skip to next line if this line is empty to support formats of # sample sheets with multiple newlines as section seperators. @@ -497,46 +501,70 @@ def _parse(self, path: Union[Path, str]) -> None: character not in VALID_ASCII for character in set(''.join(line)) ): - raise ValueError( - f'Sample sheet contains invalid characters on line ' - f'{i + 1}: {"".join(line)}' - ) - - header_match = self._section_header_re.match(line[0]) - - # If we enter a section save it's name and continue to next line. - if header_match: - section_name, *_ = header_match.groups() - if ( - section_name not in self._sections - and section_name not in REQUIRED_SECTIONS + specChrLineNum.append(f'{i+1}') + + if len(specChrLineNum) != 0 : + raise ValueError( + f'Sample sheet contains invalid characters on line ' + f'{(", ").join(specChrLineNum)}' + ) + else: + + for i, line in enumerate(lines): + # Skip to next line if this line is empty to support formats of + # sample sheets with multiple newlines as section seperators. + # + # https://github.com/clintval/sample-sheet/issues/46 + # + if not ''.join(line).strip(): + continue + + # Raise exception if we encounter invalid characters. + if any( + character not in VALID_ASCII + for character in set(''.join(line)) ): - self.add_section(section_name) - continue - - # [Reads] - vertical list of integers. - if section_name == 'Reads': - self.Reads.append(int(line[0])) - continue - - # [Data] - delimited data with the first line a header. 
- elif section_name == 'Data': - if sample_header is not None: - self.add_sample(Sample(dict(zip(sample_header, line)))) - elif any(key == '' for key in line): raise ValueError( - f'Header for [Data] section is not allowed to ' - f'have empty fields: {line}' + f'Sample sheet contains invalid characters on line ' + f'{i + 1}: {"".join(line)}' ) - else: - sample_header = line - continue - # [] - keys in first column and values in second column. - elif len(line) >= 2: - key, value = (line[0], line[1]) - section: Section = getattr(self, section_name) - section[key] = value + + header_match = self._section_header_re.match(line[0]) + + # If we enter a section save it's name and continue to next line. + if header_match: + section_name, *_ = header_match.groups() + if ( + section_name not in self._sections + and section_name not in REQUIRED_SECTIONS + ): + self.add_section(section_name) + continue + + # [Reads] - vertical list of integers. + if section_name == 'Reads': + self.Reads.append(int(line[0])) + continue + + # [Data] - delimited data with the first line a header. + elif section_name == 'Data': + if sample_header is not None: + self.add_sample(Sample(dict(zip(sample_header, line)))) + elif any(key == '' for key in line): + raise ValueError( + f'Header for [Data] section is not allowed to ' + f'have empty fields: {line}' + ) + else: + sample_header = line + continue + + # [] - keys in first column and values in second column. + elif len(line) >= 2: + key, value = (line[0], line[1]) + section: Section = getattr(self, section_name) + section[key] = value def add_sample(self, sample: Sample) -> None: """Add a :class:`Sample` to this :class:`SampleSheet`. 
@@ -702,7 +730,7 @@ def to_json(self, **kwargs: Mapping) -> str: 'Data': [sample.to_json() for sample in self.samples], **{title: dict(getattr(self, title)) for title in self._sections}, } - return json.dumps(content, **kwargs) # type: ignore + return json.dumps(content, **kwargs), content # type: ignore def to_picard_basecalling_params( self, From 7314271d984f2cbb05631543f94f3b2d926c2f1b Mon Sep 17 00:00:00 2001 From: Gaurav Srivastava <91732923+gaurav-basesolve@users.noreply.github.com> Date: Fri, 20 Jan 2023 17:21:23 +0530 Subject: [PATCH 2/2] resolving pylance --- docs/special_characters.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/special_characters.py b/docs/special_characters.py index b82995e..6abb8f1 100644 --- a/docs/special_characters.py +++ b/docs/special_characters.py @@ -1,7 +1,8 @@ - +"module with a list of special characters" +import typing as t # Special characters encounter in sample sheet -special_characters = 'À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ' -special_characters = list(set(special_characters)) -special_characters.remove(" ") -special_characters = ("").join(special_characters) +special_characters_string: str = "À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ" +special_characters_list: t.List[str] = list(set(special_characters_string)) +special_characters_list.remove(" ") +special_characters_string: str = ("").join(special_characters_list)