From 3dea3d15bb6cadb86eb7b7bb4cc2ab0c7ad97283 Mon Sep 17 00:00:00 2001 From: nbtm-sh Date: Mon, 28 Apr 2025 14:02:15 +1000 Subject: [PATCH 1/5] feat(rfaa-yaml): Added yaml rfaa --- .../binaries/create_samplesheet.py | 47 +++++++++++++++++-- samplesheetutils/utils/output.py | 16 +++++++ 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/samplesheetutils/binaries/create_samplesheet.py b/samplesheetutils/binaries/create_samplesheet.py index a67ff1b..04ba340 100644 --- a/samplesheetutils/binaries/create_samplesheet.py +++ b/samplesheetutils/binaries/create_samplesheet.py @@ -11,6 +11,8 @@ MODE_DIR_JSON = 3 MODE_STRING_YAML = 4 MODE_DIR_YAML = 5 +MODE_STRING_YAML_RFAA = 8 +MODE_DIR_YAML_RFAA = 9 # Set up logging logger = logging.getLogger(__name__) @@ -40,6 +42,7 @@ def create_samplesheet(): parser.add_argument('-f', '--fasta-header', help='Column name for fasta path', default='fasta', dest='fasta_header') parser.add_argument('-j', '--json', help='Output json format instead of csv', action='store_true', dest='json') parser.add_argument('-y', '--yaml', help='Output yaml format instead of csv', action='store_true', dest='yaml') + parser.add_argument('--yaml-rfaa', help='Output yaml format for RFAA', action='store_true', dest='yaml_rfaa') parser.add_argument('-t', '--fasta-dir', help='Output directory for temporary fasta files', default=os.getcwd(), dest='fasta_dir') parser.add_argument('-r', '--fasta-match', help='Regex to match for fasta files in directory mode', default='.*\.(fa(a)?(sta)?|y(a)?ml).*$', dest='fasta_regex') parser.add_argument('--monomer', help='Create a samplesheet entry for each sample in a fasta file', default=False, action='store_true', dest='monomer') @@ -68,14 +71,15 @@ def create_samplesheet(): if (type(args.seq_chars) is not int): raise ValueError("seq_chars is not a number") - if (args.json and args.yaml): - raise ValueError("Invaid mode combination. 
You cannot set --json and --yaml at the same time") + if (sum([args.json, args.yaml, args.yaml_rfaa]) > 1): + raise ValueError("Invaid mode combination. You cannot set --json, --yaml and --yaml-rfaa at the same time") # Mode mode = 0 mode |= bool(args.dir) mode |= args.json << 1 mode |= args.yaml << 2 + mode |= args.yaml_rfaa << 3 logger.debug(f"mode: {mode}") logger.debug("Will attempt to locate MSAs" if args.msa_dir else "Will NOT attempt to locate MSAs") @@ -189,9 +193,46 @@ def create_samplesheet(): args.output_file = args.output_file.replace(".csv", ".yaml") samplesheet_path = args.output_file + samplesheet_path = args.output_file + with open(samplesheet_path, "w") as ss_fp: + create_yaml_boltz(sample_data, ss_fp) + if mode == MODE_STRING_YAML_RFAA: + aa_sample_name = sample_name(args.aa_string, seq_chars=args.seq_chars) + # overwrite defaults + aa_sample_file_name = file_name(aa_sample_name, prefix=args.aa_prefix, suffix=args.aa_suffix, extension="yaml") + aa_path = args.fasta_dir + "/" + aa_sample_file_name + + # Create the fasta file + sample_data = Sample(aa_sample_name, aa_path, args.aa_string) + with open(aa_sample_file_name, "w") as s_fp: + create_yaml_sample(sample_data, s_fp) samplesheet_path = args.output_file + with open(samplesheet_path, "w") as ss_fp: - create_yaml_boltz(sample_data, ss_fp) + create_csv([sample_data], args.seq_header, args.fasta_header, ss_fp) + if mode == MODE_DIR_YAML_RFAA: + logger.debug(f"Checking {args.dir} for fasta files") + file_list = [os.path.join(args.dir, f) for f in os.listdir(args.dir) if os.path.isfile(os.path.join(args.dir, f))] + logger.debug(f"Fasta files: {file_list}") + file_list = [i for i in file_list if re.search(args.fasta_regex, i)] + logger.debug(f"File list aginst regex: {file_list}") + sample_data = [] + + for i_file_name in file_list: + with open(i_file_name, "r") as fp: + fasta_data = read_fasta(fp, read_data=True, single_line=False) + for i in range(len(fasta_data)): + fasta_data[i].path = 
file_name(sanitize_input(fasta_data[i].name), prefix=args.aa_prefix, suffix=args.aa_suffix, extension="yaml") + # Attempt to find MSA if the MSA directory flag is set + sample_data.extend(fasta_data) + logger.debug(f"Added sample {i_file_name}, {fasta_data}") + logger.debug(f"Sample data {sample_data[-1].data}") + for sample in sample_data: + with open(sample.path, "w") as s_fp: + create_yaml_sample(sample, s_fp) + + with open(args.output_file, "w") as ss_fp: + create_csv(sample_data, args.seq_header, args.fasta_header, ss_fp) diff --git a/samplesheetutils/utils/output.py b/samplesheetutils/utils/output.py index 7e4c917..5eec918 100644 --- a/samplesheetutils/utils/output.py +++ b/samplesheetutils/utils/output.py @@ -27,6 +27,22 @@ def create_yaml_boltz(data, fp): yaml.dump(output_data, fp, default_flow_style=False) +def create_yaml_sample(data, fp): + # Used for RFAA + output_data = { + "version": 1, + "sequences": [] + } + + output_data["sequences"].append({ + "protein": { + "id": data.name[:min(4,len(data.name))], + "sequence": data.data + } + }) + + yaml.dump(output_data, fp, default_flow_style=False) + def create_json(data, fp): dict_data = {"entities": []} From 6b39a5bad901038dec6d48341e90df77a4656627 Mon Sep 17 00:00:00 2001 From: nbtm-sh Date: Mon, 28 Apr 2025 14:02:41 +1000 Subject: [PATCH 2/5] feat(blacklist): Added chars to default blacklist --- samplesheetutils/utils/input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samplesheetutils/utils/input.py b/samplesheetutils/utils/input.py index 3dac69c..dc0dcbe 100644 --- a/samplesheetutils/utils/input.py +++ b/samplesheetutils/utils/input.py @@ -1,4 +1,4 @@ -def sanitize_input(input_str, disallowed_chars = [',', ' ', '<', '>', '.', "'", '"', ';', ':', '(', ')'], replcement='_'): +def sanitize_input(input_str, disallowed_chars = ['|', '[', ']', ',', ' ', '<', '>', '.', "'", '"', ';', ':', '(', ')'], replcement='_'): for i in disallowed_chars: input_str = input_str.replace(i, '_') return
input_str From 6d529bb87c77d412af52d967e40e505e7b103fd6 Mon Sep 17 00:00:00 2001 From: nbtm-sh Date: Mon, 28 Apr 2025 14:31:44 +1000 Subject: [PATCH 3/5] feat(tests): Add tests --- .../binaries/create_samplesheet.py | 4 +- .../tests/test_create_yaml_rfaa.py | 54 +++++++++++++++++++ samplesheetutils/utils/output.py | 2 +- 3 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 samplesheetutils/tests/test_create_yaml_rfaa.py diff --git a/samplesheetutils/binaries/create_samplesheet.py b/samplesheetutils/binaries/create_samplesheet.py index 04ba340..a21d36d 100644 --- a/samplesheetutils/binaries/create_samplesheet.py +++ b/samplesheetutils/binaries/create_samplesheet.py @@ -206,7 +206,7 @@ def create_samplesheet(): # Create the fasta file sample_data = Sample(aa_sample_name, aa_path, args.aa_string) with open(aa_sample_file_name, "w") as s_fp: - create_yaml_sample(sample_data, s_fp) + create_sample_rfaa(sample_data, s_fp) samplesheet_path = args.output_file @@ -232,7 +232,7 @@ def create_samplesheet(): for sample in sample_data: with open(sample.path, "w") as s_fp: - create_yaml_sample(sample, s_fp) + create_sample_rfaa(sample, s_fp) with open(args.output_file, "w") as ss_fp: create_csv(sample_data, args.seq_header, args.fasta_header, ss_fp) diff --git a/samplesheetutils/tests/test_create_yaml_rfaa.py b/samplesheetutils/tests/test_create_yaml_rfaa.py new file mode 100644 index 0000000..1010881 --- /dev/null +++ b/samplesheetutils/tests/test_create_yaml_rfaa.py @@ -0,0 +1,54 @@ +import unittest, yaml, os +from samplesheetutils.utils.output import * +from samplesheetutils.utils.sample import * + +class TestCreateYAML(unittest.TestCase): + def test_create_yaml_single_sample(self): + sample_input = Sample("TEST", ".tmp.yaml", "AAAAAA") + + with open(".tmp.yaml", "w") as fp: + create_yaml_rfaa(sample_input, fp) + + fp = open(".tmp.yaml", "r") + fp_data = fp.read() + fp.close() + + self.assertEqual(fp_data, 'sequences:\n- protein:\n id: TEST\n sequence: 
AAAAAA\nversion: 1\n') + os.remove(".tmp.yaml") + + def test_create_yaml_multiple_sample(self): + sample_input = [] + sample_input.append(Sample("TEST1", ".tmp1.yaml", "AAAAAA")) + sample_input.append(Sample("TEST2", ".tmp2.yaml", "AAAAAA")) + sample_input.append(Sample("TEST3", ".tmp3.yaml", "AAAAAA")) + + for i in range(len(sample_input)): + with open(f".tmp.{str(i)}.yaml", "w") as fp: + create_yaml_rfaa(sample_input[i], fp) + + for i in range(len(sample_input)): + fp = open(f".tmp.{str(i)}.yaml", "r") + fp_data = fp.read() + fp.close() + + self.assertEqual(fp_data, 'sequences:\n- protein:\n id: TEST\n sequence: AAAAAA\nversion: 1\n') + for i in range(len(sample_input)): + os.remove(f".tmp.{str(i)}.yaml") + + def test_create_samplesheet_csv_single_sample(self): + sample_input = Sample("TEST", ".tmp.yaml", "AAAAAA") + + with open(".tmp.yaml", "w") as fp: + create_yaml_rfaa(sample_input, fp) + + with open(".tmp.csv", "w") as fp: + create_csv([sample_input], "id", "fasta", fp) + + fp = open(".tmp.csv") + fp_data = fp.read() + fp.close() + + self.assertEqual(fp_data, "id,fasta\nTEST,.tmp.yaml\n") + + os.remove(".tmp.yaml") + os.remove(".tmp.csv") diff --git a/samplesheetutils/utils/output.py b/samplesheetutils/utils/output.py index 5eec918..4b14662 100644 --- a/samplesheetutils/utils/output.py +++ b/samplesheetutils/utils/output.py @@ -27,7 +27,7 @@ def create_yaml_boltz(data, fp): yaml.dump(output_data, fp, default_flow_style=False) -def create_yaml_sample(data, fp): +def create_yaml_rfaa(data, fp): # Used for RFAA output_data = { "version": 1, From 371fa605db3b8d5c2be32a07ca581b91577a3c25 Mon Sep 17 00:00:00 2001 From: nbtm-sh Date: Mon, 28 Apr 2025 14:33:57 +1000 Subject: [PATCH 4/5] feat(readme): Update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e2d44ba..f4a80c8 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ create-samplesheet [ARGS] - `-j --json`: Ouptut JSON formatted samplesheet - `-y --yaml`: 
Output YAML formatted samplesheet - `-m --msa-dir`: Directory to search for corresponding MSA files in (Only accessible in yaml output) +- `--yaml-rfaa`: Output YAML formatted sequence files with a samplesheet.csv file #### `--msa-dir` When using the YAML output mode (`-y`, `--yaml`), you can provide a path to a directory containg sample's pre-computed multiple sequence alignment files (`.a3m` files). In order for these files to automatically be associated with it's corresponding sample, the filenames must follow the following format: From 33caf2d7e6d2414a9cfce1d8df1e6bb00b936413 Mon Sep 17 00:00:00 2001 From: nbtm-sh Date: Mon, 28 Apr 2025 14:37:24 +1000 Subject: [PATCH 5/5] feat(version): Updated version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fa2dfef..060cf2c 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="samplesheetutils", - version="1.1.2", + version="1.2", author="Nathan Glades", author_email="n.glades@unsw.edu.au", packages=find_packages(),