Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ create-samplesheet [ARGS]
- `-j --json`: Output JSON formatted samplesheet
- `-y --yaml`: Output YAML formatted samplesheet
- `-m --msa-dir`: Directory to search for corresponding MSA files in (Only accessible in yaml output)
- `--yaml-rfaa`: Output YAML formatted sequence files with a samplesheet.csv file

#### `--msa-dir`
When using the YAML output mode (`-y`, `--yaml`), you can provide a path to a directory containing samples' pre-computed multiple sequence alignment files (`.a3m` files). In order for these files to be automatically associated with their corresponding samples, the filenames must follow this format:
Expand Down
47 changes: 44 additions & 3 deletions samplesheetutils/binaries/create_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
# Output modes. Each value is the bit pattern create_samplesheet() assembles from the parsed arguments:
#   bit 0 (1) = args.dir is set, bit 1 (2) = args.json, bit 2 (4) = args.yaml, bit 3 (8) = args.yaml_rfaa.
# "STRING" modes have bit 0 clear; "DIR" modes have it set.
MODE_DIR_JSON = 3
MODE_STRING_YAML = 4
MODE_DIR_YAML = 5
MODE_STRING_YAML_RFAA = 8
MODE_DIR_YAML_RFAA = 9

# Set up logging
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -40,6 +42,7 @@ def create_samplesheet():
parser.add_argument('-f', '--fasta-header', help='Column name for fasta path', default='fasta', dest='fasta_header')
parser.add_argument('-j', '--json', help='Output json format instead of csv', action='store_true', dest='json')
parser.add_argument('-y', '--yaml', help='Output yaml format instead of csv', action='store_true', dest='yaml')
parser.add_argument('--yaml-rfaa', help='Output yaml format for RFAA', action='store_true', dest='yaml_rfaa')
parser.add_argument('-t', '--fasta-dir', help='Output directory for temporary fasta files', default=os.getcwd(), dest='fasta_dir')
parser.add_argument('-r', '--fasta-match', help='Regex to match for fasta files in directory mode', default='.*\.(fa(a)?(sta)?|y(a)?ml).*$', dest='fasta_regex')
parser.add_argument('--monomer', help='Create a samplesheet entry for each sample in a fasta file', default=False, action='store_true', dest='monomer')
Expand Down Expand Up @@ -68,14 +71,15 @@ def create_samplesheet():
if (type(args.seq_chars) is not int):
raise ValueError("seq_chars is not a number")

if (args.json and args.yaml):
raise ValueError("Invaid mode combination. You cannot set --json and --yaml at the same time")
if (sum([args.json, args.yaml, args.yaml_rfaa]) > 1):
raise ValueError("Invaid mode combination. You cannot set --json, --yaml and --yaml-rfaa at the same time")

# Mode
mode = 0
mode |= bool(args.dir)
mode |= args.json << 1
mode |= args.yaml << 2
mode |= args.yaml_rfaa << 3
logger.debug(f"mode: {mode}")
logger.debug("Will attempt to locate MSAs" if args.msa_dir else "Will NOT attempt to locate MSAs")

Expand Down Expand Up @@ -189,9 +193,46 @@ def create_samplesheet():
args.output_file = args.output_file.replace(".csv", ".yaml")
samplesheet_path = args.output_file

samplesheet_path = args.output_file
with open(samplesheet_path, "w") as ss_fp:
create_yaml_boltz(sample_data, ss_fp)

if mode == MODE_STRING_YAML_RFAA:
aa_sample_name = sample_name(args.aa_string, seq_chars=args.seq_chars)
# overwrite defaults
aa_sample_file_name = file_name(aa_sample_name, prefix=args.aa_prefix, suffix=args.aa_suffix, extension="yaml")
aa_path = args.fasta_dir + "/" + aa_sample_file_name

# Create the fasta file
sample_data = Sample(aa_sample_name, aa_path, args.aa_string)
with open(aa_sample_file_name, "w") as s_fp:
create_sample_rfaa(sample_data, s_fp)

samplesheet_path = args.output_file

with open(samplesheet_path, "w") as ss_fp:
create_yaml_boltz(sample_data, ss_fp)
create_csv([sample_data], args.seq_header, args.fasta_header, ss_fp)
if mode == MODE_DIR_YAML_RFAA:
logger.debug(f"Checking {args.dir} for fasta files")
file_list = [os.path.join(args.dir, f) for f in os.listdir(args.dir) if os.path.isfile(os.path.join(args.dir, f))]
logger.debug(f"Fasta files: {file_list}")
file_list = [i for i in file_list if re.search(args.fasta_regex, i)]
logger.debug(f"File list aginst regex: {file_list}")
sample_data = []

for i_file_name in file_list:
with open(i_file_name, "r") as fp:
fasta_data = read_fasta(fp, read_data=True, single_line=False)
for i in range(len(fasta_data)):
fasta_data[i].path = file_name(sanitize_input(fasta_data[i].name), prefix=args.aa_prefix, suffix=args.aa_suffix, extension="yaml")
# Attempt to find MSA if the MSA directory flag is set
sample_data.extend(fasta_data)
logger.debug(f"Added sample {i_file_name}, {fasta_data}")
logger.debug(f"Sample data {sample_data[-1].data}")

for sample in sample_data:
with open(sample.path, "w") as s_fp:
create_sample_rfaa(sample, s_fp)

with open(args.output_file, "w") as ss_fp:
create_csv(sample_data, args.seq_header, args.fasta_header, ss_fp)
54 changes: 54 additions & 0 deletions samplesheetutils/tests/test_create_yaml_rfaa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import unittest, yaml, os
from samplesheetutils.utils.output import *
from samplesheetutils.utils.sample import *

class TestCreateYAML(unittest.TestCase):
    """Tests for create_yaml_rfaa() and for the samplesheet CSV written next to its output."""

    # Block-style YAML for a single protein whose id is "TEST". PyYAML sorts the
    # top-level keys and indents the mapping nested under the "- protein:" list
    # item by four spaces.
    EXPECTED_YAML = 'sequences:\n- protein:\n    id: TEST\n    sequence: AAAAAA\nversion: 1\n'

    @staticmethod
    def _remove_if_present(path):
        """Delete *path*, ignoring the case where it was never created."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def _scratch(self, path):
        """Register *path* for removal once the test ends (pass or fail) and return it."""
        self.addCleanup(self._remove_if_present, path)
        return path

    @staticmethod
    def _read(path):
        """Return the full text content of *path*."""
        with open(path, "r") as fp:
            return fp.read()

    def test_create_yaml_single_sample(self):
        yaml_path = self._scratch(".tmp.yaml")
        sample_input = Sample("TEST", yaml_path, "AAAAAA")

        with open(yaml_path, "w") as fp:
            create_yaml_rfaa(sample_input, fp)

        self.assertEqual(self._read(yaml_path), self.EXPECTED_YAML)

    def test_create_yaml_multiple_sample(self):
        sample_input = [
            Sample("TEST1", ".tmp1.yaml", "AAAAAA"),
            Sample("TEST2", ".tmp2.yaml", "AAAAAA"),
            Sample("TEST3", ".tmp3.yaml", "AAAAAA"),
        ]
        yaml_paths = [self._scratch(f".tmp.{i}.yaml") for i in range(len(sample_input))]

        for sample, yaml_path in zip(sample_input, yaml_paths):
            with open(yaml_path, "w") as fp:
                create_yaml_rfaa(sample, fp)

        # create_yaml_rfaa() keeps only the first four characters of the name,
        # so TEST1, TEST2 and TEST3 all serialize with the id "TEST".
        for yaml_path in yaml_paths:
            self.assertEqual(self._read(yaml_path), self.EXPECTED_YAML)

    def test_create_samplesheet_csv_single_sample(self):
        yaml_path = self._scratch(".tmp.yaml")
        csv_path = self._scratch(".tmp.csv")
        sample_input = Sample("TEST", yaml_path, "AAAAAA")

        with open(yaml_path, "w") as fp:
            create_yaml_rfaa(sample_input, fp)

        with open(csv_path, "w") as fp:
            create_csv([sample_input], "id", "fasta", fp)

        self.assertEqual(self._read(csv_path), "id,fasta\nTEST,.tmp.yaml\n")
2 changes: 1 addition & 1 deletion samplesheetutils/utils/input.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
def sanitize_input(input_str, disallowed_chars = [',', ' ', '<', '>', '.', "'", '"', ';', ':', '(', ')'], replcement='_'):
def sanitize_input(input_str, disallowed_chars=('|', '[', ']', ',', ' ', '<', '>', '.', "'", '"', ';', ':', '(', ')'), replcement='_'):
    """Return *input_str* with every disallowed substring replaced.

    Args:
        input_str: Text to clean.
        disallowed_chars: Iterable of substrings to replace. The default is an
            immutable tuple so it cannot be altered between calls.
        replcement: Text substituted for each disallowed substring. The
            parameter name keeps its historical spelling so existing keyword
            callers continue to work.

    Returns:
        The cleaned string; an empty input yields an empty string.
    """
    for token in disallowed_chars:
        # Honour the caller's replacement instead of a hard-coded '_'.
        input_str = input_str.replace(token, replcement)
    return input_str
Expand Down
16 changes: 16 additions & 0 deletions samplesheetutils/utils/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,22 @@ def create_yaml_boltz(data, fp):

yaml.dump(output_data, fp, default_flow_style=False)

def create_yaml_rfaa(data, fp):
    """Dump one sample to *fp* as a block-style YAML document for RFAA.

    The document has a ``version`` of 1 and a ``sequences`` list holding a
    single ``protein`` entry. Its ``id`` is the first four characters of
    ``data.name`` (the whole name when shorter) and its ``sequence`` is
    ``data.data``.
    """
    # A slice never overruns, so [:4] also covers names shorter than four characters.
    protein_entry = {
        "id": data.name[:4],
        "sequence": data.data,
    }
    document = {
        "version": 1,
        "sequences": [{"protein": protein_entry}],
    }

    yaml.dump(document, fp, default_flow_style=False)

def create_json(data, fp):
dict_data = {"entities": []}

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name="samplesheetutils",
version="1.1.2",
version="1.2",
author="Nathan Glades",
author_email="n.glades@unsw.edu.au",
packages=find_packages(),
Expand Down