Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/special_characters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""Module with a list of special characters."""
import typing as t

# Special characters encountered in sample sheets, written out separated by
# single spaces (code points U+00C0 through U+00FF).
_SPACE_SEPARATED_CHARACTERS: str = "À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ"

# Unique characters with the separator dropped. Sorting the set gives a
# stable, code-point order; iterating a bare set of str would yield a
# different order on every interpreter run because of hash randomization.
special_characters_list: t.List[str] = sorted(
    set(_SPACE_SEPARATED_CHARACTERS) - {" "}
)

# The same characters joined into a single string with no separators.
special_characters_string: str = "".join(special_characters_list)

# Alias under the name that `sample_sheet/__init__.py` imports from this
# module (`from docs.special_characters import special_characters`).
special_characters: str = special_characters_string

111 changes: 70 additions & 41 deletions sample_sheet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
import sys
import warnings

# Special characters imported from docs/special_characters.py
from docs.special_characters import special_characters

from contextlib import ExitStack
from itertools import chain, repeat, islice
from pathlib import Path
Expand Down Expand Up @@ -60,7 +63,7 @@
# https://www.illumina.com/content/dam/illumina-marketing/
# documents/products/technotes/
# sequencing-sheet-format-specifications-technical-note-970-2017-004.pdf
VALID_ASCII: Set[str] = set(ascii_letters + digits + punctuation + ' \n\r')
VALID_ASCII: Set[str] = set(ascii_letters + digits + punctuation + ' \n\r' + special_characters)


class ReadStructure(object):
Expand Down Expand Up @@ -397,7 +400,7 @@ class SampleSheet(object):

"""

_encoding: str = 'utf8'
_encoding: str = 'ISO-8859-1'
_section_header_re = re.compile(r'\[(.*)\]')
_whitespace_re = re.compile(r'\s+')

Expand Down Expand Up @@ -485,8 +488,10 @@ def _parse(self, handle: TextIO) -> None:
section_name: str = ''
sample_header: Optional[List[str]] = None

lines = list(csv.reader(handle, skipinitialspace=True))

with open(path, encoding=self._encoding) as handle:
lines = list(csv.reader(handle, skipinitialspace=True))

specChrLineNum = []
for i, line in enumerate(lines):
# Skip to next line if this line is empty to support formats of
# sample sheets with multiple newlines as section separators.
Expand All @@ -501,46 +506,70 @@ def _parse(self, handle: TextIO) -> None:
character not in VALID_ASCII
for character in set(''.join(line))
):
raise ValueError(
f'Sample sheet contains invalid characters on line '
f'{i + 1}: {"".join(line)}'
)

header_match = self._section_header_re.match(line[0])

# If we enter a section, save its name and continue to the next line.
if header_match:
section_name, *_ = header_match.groups()
if (
section_name not in self._sections
and section_name not in REQUIRED_SECTIONS
specChrLineNum.append(f'{i+1}')

if len(specChrLineNum) != 0 :
raise ValueError(
f'Sample sheet contains invalid characters on line '
f'{(", ").join(specChrLineNum)}'
)
else:

for i, line in enumerate(lines):
# Skip to next line if this line is empty to support formats of
# sample sheets with multiple newlines as section separators.
#
# https://github.com/clintval/sample-sheet/issues/46
#
if not ''.join(line).strip():
continue

# Raise exception if we encounter invalid characters.
if any(
character not in VALID_ASCII
for character in set(''.join(line))
):
self.add_section(section_name)
continue

# [Reads] - vertical list of integers.
if section_name == 'Reads':
self.Reads.append(int(line[0]))
continue

# [Data] - delimited data with the first line a header.
elif section_name == 'Data':
if sample_header is not None:
self.add_sample(Sample(dict(zip(sample_header, line))))
elif any(key == '' for key in line):
raise ValueError(
f'Header for [Data] section is not allowed to '
f'have empty fields: {line}'
f'Sample sheet contains invalid characters on line '
f'{i + 1}: {"".join(line)}'
)
else:
sample_header = line
continue

# [<Other>] - keys in first column and values in second column.
elif len(line) >= 2:
key, value = (line[0], line[1])
section: Section = getattr(self, section_name)
section[key] = value

header_match = self._section_header_re.match(line[0])

# If we enter a section, save its name and continue to the next line.
if header_match:
section_name, *_ = header_match.groups()
if (
section_name not in self._sections
and section_name not in REQUIRED_SECTIONS
):
self.add_section(section_name)
continue

# [Reads] - vertical list of integers.
if section_name == 'Reads':
self.Reads.append(int(line[0]))
continue

# [Data] - delimited data with the first line a header.
elif section_name == 'Data':
if sample_header is not None:
self.add_sample(Sample(dict(zip(sample_header, line))))
elif any(key == '' for key in line):
raise ValueError(
f'Header for [Data] section is not allowed to '
f'have empty fields: {line}'
)
else:
sample_header = line
continue

# [<Other>] - keys in first column and values in second column.
elif len(line) >= 2:
key, value = (line[0], line[1])
section: Section = getattr(self, section_name)
section[key] = value

def add_sample(self, sample: Sample) -> None:
"""Add a :class:`Sample` to this :class:`SampleSheet`.
Expand Down Expand Up @@ -706,7 +735,7 @@ def to_json(self, **kwargs: Mapping) -> str:
'Data': [sample.to_json() for sample in self.samples],
**{title: dict(getattr(self, title)) for title in self._sections},
}
return json.dumps(content, **kwargs) # type: ignore
return json.dumps(content, **kwargs), content # type: ignore

def to_picard_basecalling_params(
self,
Expand Down