diff --git a/README.md b/README.md index e2d44ba..f4a80c8 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ create-samplesheet [ARGS] - `-j --json`: Ouptut JSON formatted samplesheet - `-y --yaml`: Output YAML formatted samplesheet - `-m --msa-dir`: Directory to search for corresponding MSA files in (Only accessible in yaml output) +- `--yaml-rfaa`: Output YAML formatted sequence files with a samplesheet.csv file #### `--msa-dir` When using the YAML output mode (`-y`, `--yaml`), you can provide a path to a directory containg sample's pre-computed multiple sequence alignment files (`.a3m` files). In order for these files to automatically be associated with it's corresponding sample, the filenames must follow the following format: diff --git a/samplesheetutils/binaries/create_samplesheet.py b/samplesheetutils/binaries/create_samplesheet.py index a67ff1b..a21d36d 100644 --- a/samplesheetutils/binaries/create_samplesheet.py +++ b/samplesheetutils/binaries/create_samplesheet.py @@ -11,6 +11,8 @@ MODE_DIR_JSON = 3 MODE_STRING_YAML = 4 MODE_DIR_YAML = 5 +MODE_STRING_YAML_RFAA = 8 +MODE_DIR_YAML_RFAA = 9 # Set up logging logger = logging.getLogger(__name__) @@ -40,6 +42,7 @@ def create_samplesheet(): parser.add_argument('-f', '--fasta-header', help='Column name for fasta path', default='fasta', dest='fasta_header') parser.add_argument('-j', '--json', help='Output json format instead of csv', action='store_true', dest='json') parser.add_argument('-y', '--yaml', help='Output yaml format instead of csv', action='store_true', dest='yaml') + parser.add_argument('--yaml-rfaa', help='Output yaml format for RFAA', action='store_true', dest='yaml_rfaa') parser.add_argument('-t', '--fasta-dir', help='Output directory for temporary fasta files', default=os.getcwd(), dest='fasta_dir') parser.add_argument('-r', '--fasta-match', help='Regex to match for fasta files in directory mode', default='.*\.(fa(a)?(sta)?|y(a)?ml).*$', dest='fasta_regex') parser.add_argument('--monomer', help='Create a samplesheet entry for each sample in a fasta file', default=False, action='store_true', dest='monomer') @@ -68,14 +71,15 @@ def create_samplesheet(): if (type(args.seq_chars) is not int): raise ValueError("seq_chars is not a number") - if (args.json and args.yaml): - raise ValueError("Invaid mode combination. You cannot set --json and --yaml at the same time") + if (sum([args.json, args.yaml, args.yaml_rfaa]) > 1): + raise ValueError("Invaid mode combination. You cannot set --json, --yaml and --yaml-rfaa at the same time") # Mode mode = 0 mode |= bool(args.dir) mode |= args.json << 1 mode |= args.yaml << 2 + mode |= args.yaml_rfaa << 3 logger.debug(f"mode: {mode}") logger.debug("Will attempt to locate MSAs" if args.msa_dir else "Will NOT attempt to locate MSAs") @@ -189,9 +193,46 @@ def create_samplesheet(): args.output_file = args.output_file.replace(".csv", ".yaml") samplesheet_path = args.output_file + samplesheet_path = args.output_file + with open(samplesheet_path, "w") as ss_fp: + create_yaml_boltz(sample_data, ss_fp) + if mode == MODE_STRING_YAML_RFAA: + aa_sample_name = sample_name(args.aa_string, seq_chars=args.seq_chars) + # overwrite defaults + aa_sample_file_name = file_name(aa_sample_name, prefix=args.aa_prefix, suffix=args.aa_suffix, extension="yaml") + aa_path = args.fasta_dir + "/" + aa_sample_file_name + + # Create the fasta file + sample_data = Sample(aa_sample_name, aa_path, args.aa_string) + with open(aa_sample_file_name, "w") as s_fp: + create_sample_rfaa(sample_data, s_fp) samplesheet_path = args.output_file + with open(samplesheet_path, "w") as ss_fp: - create_yaml_boltz(sample_data, ss_fp) + create_csv([sample_data], args.seq_header, args.fasta_header, ss_fp) + if mode == MODE_DIR_YAML_RFAA: + logger.debug(f"Checking {args.dir} for fasta files") + file_list = [os.path.join(args.dir, f) for f in os.listdir(args.dir) if os.path.isfile(os.path.join(args.dir, f))] + logger.debug(f"Fasta files: {file_list}") + file_list = [i for i in file_list if re.search(args.fasta_regex, i)] + logger.debug(f"File list aginst regex: {file_list}") + sample_data = [] + + for i_file_name in file_list: + with open(i_file_name, "r") as fp: + fasta_data = read_fasta(fp, read_data=True, single_line=False) + for i in range(len(fasta_data)): + fasta_data[i].path = file_name(sanitize_input(fasta_data[i].name), prefix=args.aa_prefix, suffix=args.aa_suffix, extension="yaml") + # Attempt to find MSA if the MSA directory flag is set + sample_data.extend(fasta_data) + logger.debug(f"Added sample {i_file_name}, {fasta_data}") + logger.debug(f"Sample data {sample_data[-1].data}") + for sample in sample_data: + with open(sample.path, "w") as s_fp: + create_sample_rfaa(sample, s_fp) + + with open(args.output_file, "w") as ss_fp: + create_csv(sample_data, args.seq_header, args.fasta_header, ss_fp) diff --git a/samplesheetutils/tests/test_create_yaml_rfaa.py b/samplesheetutils/tests/test_create_yaml_rfaa.py new file mode 100644 index 0000000..1010881 --- /dev/null +++ b/samplesheetutils/tests/test_create_yaml_rfaa.py @@ -0,0 +1,54 @@ +import unittest, yaml, os +from samplesheetutils.utils.output import * +from samplesheetutils.utils.sample import * + +class TestCreateYAML(unittest.TestCase): + def test_create_yaml_single_sample(self): + sample_input = Sample("TEST", ".tmp.yaml", "AAAAAA") + + with open(".tmp.yaml", "w") as fp: + create_yaml_rfaa(sample_input, fp) + + fp = open(".tmp.yaml", "r") + fp_data = fp.read() + fp.close() + + self.assertEqual(fp_data, 'sequences:\n- protein:\n id: TEST\n sequence: AAAAAA\nversion: 1\n') + os.remove(".tmp.yaml") + + def test_create_yaml_multiple_sample(self): + sample_input = [] + sample_input.append(Sample("TEST1", ".tmp1.yaml", "AAAAAA")) + sample_input.append(Sample("TEST2", ".tmp2.yaml", "AAAAAA")) + sample_input.append(Sample("TEST3", ".tmp3.yaml", "AAAAAA")) + + for i in range(len(sample_input)): + with open(f".tmp.{str(i)}.yaml", "w") as fp: + create_yaml_rfaa(sample_input[i], fp) + + for i in range(len(sample_input)): + fp = open(f".tmp.{str(i)}.yaml", "r") + fp_data = fp.read() + fp.close() + + self.assertEqual(fp_data, 'sequences:\n- protein:\n id: TEST\n sequence: AAAAAA\nversion: 1\n') + for i in range(len(sample_input)): + os.remove(f".tmp.{str(i)}.yaml") + + def test_create_samplesheet_csv_single_sample(self): + sample_input = Sample("TEST", ".tmp.yaml", "AAAAAA") + + with open(".tmp.yaml", "w") as fp: + create_yaml_rfaa(sample_input, fp) + + with open(".tmp.csv", "w") as fp: + create_csv([sample_input], "id", "fasta", fp) + + fp = open(".tmp.csv") + fp_data = fp.read() + fp.close() + + self.assertEqual(fp_data, "id,fasta\nTEST,.tmp.yaml\n") + + os.remove(".tmp.yaml") + os.remove(".tmp.csv") diff --git a/samplesheetutils/utils/input.py b/samplesheetutils/utils/input.py index 3dac69c..dc0dcbe 100644 --- a/samplesheetutils/utils/input.py +++ b/samplesheetutils/utils/input.py @@ -1,4 +1,4 @@ -def sanitize_input(input_str, disallowed_chars = [',', ' ', '<', '>', '.', "'", '"', ';', ':', '(', ')'], replcement='_'): +def sanitize_input(input_str, disallowed_chars = ['|', '[', ']', ',', ' ', '<', '>', '.', "'", '"', ';', ':', '(', ')'], replcement='_'): for i in disallowed_chars: input_str = input_str.replace(i, '_') return input_str diff --git a/samplesheetutils/utils/output.py b/samplesheetutils/utils/output.py index 7e4c917..4b14662 100644 --- a/samplesheetutils/utils/output.py +++ b/samplesheetutils/utils/output.py @@ -27,6 +27,22 @@ def create_yaml_boltz(data, fp): yaml.dump(output_data, fp, default_flow_style=False) +def create_yaml_rfaa(data, fp): + # Used for RFAA + output_data = { + "version": 1, + "sequences": [] + } + + output_data["sequences"].append({ + "protein": { + "id": data.name[:min(4,len(data.name))], + "sequence": data.data + } + }) + + yaml.dump(output_data, fp, default_flow_style=False) + def create_json(data, fp): dict_data = {"entities": []} diff --git a/setup.py b/setup.py index fa2dfef..060cf2c 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="samplesheetutils", - version="1.1.2", + version="1.2", author="Nathan Glades", author_email="n.glades@unsw.edu.au", packages=find_packages(),