nbtm-sh · nbtm-sh · Apr 28, 2025 · Apr 28, 2025 · Apr 28, 2025 · Apr 28, 2025
diff --git a/README.md b/README.md
@@ -36,6 +36,7 @@ create-samplesheet [ARGS]
 - `-j --json`: Output JSON formatted samplesheet
 - `-y --yaml`: Output YAML formatted samplesheet
 - `-m --msa-dir`: Directory to search for corresponding MSA files in (Only accessible in yaml output)
+- `--yaml-rfaa`: Output YAML formatted sequence files with a samplesheet.csv file
 
 #### `--msa-dir`
 When using the YAML output mode (`-y`, `--yaml`), you can provide a path to a directory containing the samples' pre-computed multiple sequence alignment files (`.a3m` files). In order for each file to be associated automatically with its corresponding sample, the filenames must follow this format:

diff --git a/samplesheetutils/binaries/create_samplesheet.py b/samplesheetutils/binaries/create_samplesheet.py
@@ -11,6 +11,8 @@
 MODE_DIR_JSON = 3
 MODE_STRING_YAML = 4
 MODE_DIR_YAML = 5
+MODE_STRING_YAML_RFAA = 8
+MODE_DIR_YAML_RFAA = 9
 
 # Set up logging
 logger = logging.getLogger(__name__)
@@ -40,6 +42,7 @@ def create_samplesheet():
     parser.add_argument('-f', '--fasta-header', help='Column name for fasta path', default='fasta', dest='fasta_header')
     parser.add_argument('-j', '--json', help='Output json format instead of csv', action='store_true', dest='json')
     parser.add_argument('-y', '--yaml', help='Output yaml format instead of csv', action='store_true', dest='yaml')
+    parser.add_argument('--yaml-rfaa', help='Output yaml format for RFAA', action='store_true', dest='yaml_rfaa')
     parser.add_argument('-t', '--fasta-dir', help='Output directory for temporary fasta files', default=os.getcwd(), dest='fasta_dir')
     parser.add_argument('-r', '--fasta-match', help='Regex to match for fasta files in directory mode', default='.*\.(fa(a)?(sta)?|y(a)?ml).*$', dest='fasta_regex')
     parser.add_argument('--monomer', help='Create a samplesheet entry for each sample in a fasta file', default=False, action='store_true', dest='monomer')
@@ -68,14 +71,15 @@ def create_samplesheet():
     if (type(args.seq_chars) is not int):
         raise ValueError("seq_chars is not a number")
 
-    if (args.json and args.yaml):
-        raise ValueError("Invaid mode combination. You cannot set --json and --yaml at the same time")
+    if (sum([args.json, args.yaml, args.yaml_rfaa]) > 1):
+        raise ValueError("Invaid mode combination. You cannot set --json, --yaml and --yaml-rfaa at the same time")
 
     # Mode
     mode = 0
     mode |= bool(args.dir)
     mode |= args.json << 1
     mode |= args.yaml << 2
+    mode |= args.yaml_rfaa << 3
     logger.debug(f"mode: {mode}")
     logger.debug("Will attempt to locate MSAs" if args.msa_dir else "Will NOT attempt to locate MSAs")
 
@@ -189,9 +193,46 @@ def create_samplesheet():
             args.output_file = args.output_file.replace(".csv", ".yaml")
         samplesheet_path = args.output_file
 
+        samplesheet_path = args.output_file
+        with open(samplesheet_path, "w") as ss_fp:
+            create_yaml_boltz(sample_data, ss_fp)
 
+    if mode == MODE_STRING_YAML_RFAA:
+        aa_sample_name = sample_name(args.aa_string, seq_chars=args.seq_chars)
+        # overwrite defaults
+        aa_sample_file_name = file_name(aa_sample_name, prefix=args.aa_prefix, suffix=args.aa_suffix, extension="yaml")
+        aa_path = args.fasta_dir + "/" + aa_sample_file_name
+
+        # Create the fasta file
+        sample_data = Sample(aa_sample_name, aa_path, args.aa_string)
+        with open(aa_sample_file_name, "w") as s_fp:
+            create_sample_rfaa(sample_data, s_fp)
 
         samplesheet_path = args.output_file
+
         with open(samplesheet_path, "w") as ss_fp:
-            create_yaml_boltz(sample_data, ss_fp)
+            create_csv([sample_data], args.seq_header, args.fasta_header, ss_fp)
+    if mode == MODE_DIR_YAML_RFAA:
+        logger.debug(f"Checking {args.dir} for fasta files")
+        file_list = [os.path.join(args.dir, f) for f in os.listdir(args.dir) if os.path.isfile(os.path.join(args.dir, f))]
+        logger.debug(f"Fasta files: {file_list}")
+        file_list = [i for i in file_list if re.search(args.fasta_regex, i)]
+        logger.debug(f"File list aginst regex: {file_list}")
+        sample_data = []
+
+        for i_file_name in file_list:
+            with open(i_file_name, "r") as fp:
+                fasta_data = read_fasta(fp, read_data=True, single_line=False)
+                for i in range(len(fasta_data)):
+                    fasta_data[i].path = file_name(sanitize_input(fasta_data[i].name), prefix=args.aa_prefix, suffix=args.aa_suffix, extension="yaml")
+                # Attempt to find MSA if the MSA directory flag is set
+                sample_data.extend(fasta_data)
+                logger.debug(f"Added sample {i_file_name}, {fasta_data}")
+                logger.debug(f"Sample data {sample_data[-1].data}")
 
+        for sample in sample_data:
+            with open(sample.path, "w") as s_fp:
+                create_sample_rfaa(sample, s_fp)
+
+        with open(args.output_file, "w") as ss_fp:
+            create_csv(sample_data, args.seq_header, args.fasta_header, ss_fp)
diff --git a/samplesheetutils/tests/test_create_yaml_rfaa.py b/samplesheetutils/tests/test_create_yaml_rfaa.py
@@ -0,0 +1,54 @@
+import unittest, yaml, os
+from samplesheetutils.utils.output import *
+from samplesheetutils.utils.sample import *
+
+class TestCreateYAML(unittest.TestCase):
+    """Tests for create_yaml_rfaa and the CSV samplesheet that lists its output."""
+
+    # Document create_yaml_rfaa writes for a sample whose id comes out as "TEST".
+    EXPECTED_YAML = 'sequences:\n- protein:\n    id: TEST\n    sequence: AAAAAA\nversion: 1\n'
+
+    def _write_yaml(self, sample, path):
+        """Write ``sample`` to ``path`` with create_yaml_rfaa and return the file text."""
+        with open(path, "w") as fp:
+            create_yaml_rfaa(sample, fp)
+        # Register removal straight away so a failing assertion cannot leak the file.
+        self.addCleanup(os.remove, path)
+        with open(path, "r") as fp:
+            return fp.read()
+
+    def test_create_yaml_single_sample(self):
+        """A single sample is written as one protein entry."""
+        sample_input = Sample("TEST", ".tmp.yaml", "AAAAAA")
+
+        fp_data = self._write_yaml(sample_input, ".tmp.yaml")
+
+        self.assertEqual(fp_data, self.EXPECTED_YAML)
+
+    def test_create_yaml_multiple_sample(self):
+        """Each sample in a list gets its own YAML file."""
+        sample_input = [
+            Sample("TEST1", ".tmp1.yaml", "AAAAAA"),
+            Sample("TEST2", ".tmp2.yaml", "AAAAAA"),
+            Sample("TEST3", ".tmp3.yaml", "AAAAAA"),
+        ]
+
+        for i, sample in enumerate(sample_input):
+            fp_data = self._write_yaml(sample, f".tmp.{i}.yaml")
+            # create_yaml_rfaa keeps only the first four characters of the name.
+            self.assertEqual(fp_data, self.EXPECTED_YAML)
+
+    def test_create_samplesheet_csv_single_sample(self):
+        """The CSV samplesheet lists the sample name and its YAML path."""
+        sample_input = Sample("TEST", ".tmp.yaml", "AAAAAA")
+        self._write_yaml(sample_input, ".tmp.yaml")
+
+        with open(".tmp.csv", "w") as fp:
+            create_csv([sample_input], "id", "fasta", fp)
+        # Removal is queued before the assertion so a failure still cleans up.
+        self.addCleanup(os.remove, ".tmp.csv")
+
+        with open(".tmp.csv") as fp:
+            fp_data = fp.read()
+
+        self.assertEqual(fp_data, "id,fasta\nTEST,.tmp.yaml\n")
diff --git a/samplesheetutils/utils/input.py b/samplesheetutils/utils/input.py
@@ -1,4 +1,15 @@
-def sanitize_input(input_str, disallowed_chars = [',', ' ', '<', '>', '.', "'", '"', ';', ':', '(', ')'], replcement='_'):
+def sanitize_input(input_str, disallowed_chars=('|', '[', ']', ',', ' ', '<', '>', '.', "'", '"', ';', ':', '(', ')'), replcement='_'):
+    """Replace each disallowed character in ``input_str`` with ``replcement``.
+
+    Args:
+        input_str: Text to sanitise.
+        disallowed_chars: Iterable of substrings to replace.
+        replcement: Text substituted for every match (spelling kept so
+            existing keyword callers keep working).
+
+    Returns:
+        The sanitised string.
+    """
     for i in disallowed_chars:
-        input_str = input_str.replace(i, '_')
+        input_str = input_str.replace(i, replcement)
     return input_str

diff --git a/samplesheetutils/utils/output.py b/samplesheetutils/utils/output.py
@@ -27,6 +27,22 @@ def create_yaml_boltz(data, fp):
 
     yaml.dump(output_data, fp, default_flow_style=False)
 
+def create_yaml_rfaa(data, fp):
+    """Dump one sample to ``fp`` as a YAML sequence document (used for RFAA).
+
+    The document holds ``version: 1`` and a single ``protein`` entry whose
+    ``id`` is the first four characters of ``data.name`` and whose
+    ``sequence`` is ``data.data``.
+    """
+    # Slicing never overruns, so names shorter than four characters pass through whole.
+    protein_entry = {
+        "id": data.name[:4],
+        "sequence": data.data,
+    }
+    document = {"version": 1, "sequences": [{"protein": protein_entry}]}
+
+    yaml.dump(document, fp, default_flow_style=False)
+
 def create_json(data, fp):
     dict_data = {"entities": []}
 

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="samplesheetutils",
-    version="1.1.2",
+    version="1.2",
     author="Nathan Glades",
     author_email="n.glades@unsw.edu.au",
     packages=find_packages(),