diff --git a/README.md b/README.md index 6e9e617..fae5736 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # BaseSpace Invaders This contains a collection of scripts (one for now!) that I found useful to -retrieve files from Illumina's BaseSpace. Please run a script with no options +retrieve files from Illumina's BaseSpace. Please run a script with --help to see its usage. _ __ ## ## _ __ @@ -18,6 +18,7 @@ to see its usage. ### Python BaseSpace SDK Please download and install the [python basespace sdk](http://github.com/basespace/basespace-python-sdk). +It is recommended to use a virtual environment for this (e.g. via [conda](http://continuum.io/downloads#all)). ### Illumina's BaseSpace Developer Credentials @@ -35,7 +36,7 @@ Access Token". You will need to provide the credentials for your app either via the command line (security risk) or with a master config file (preferred). -To create a master config file, create a file named ~/.basespace.cfg with the following content, +To create a master config file, create a file named ~/.basespacepy.cfg with the following content, filling in the clientKey, clientSecret, and accessToken (optionally appSessionId):
 [DEFAULT]
@@ -44,14 +45,14 @@ clientKey =
 clientSecret = 
 accessToken = 
 appSessionId =
-apiServer = https://api.cloud-hoth.illumina.com/
+apiServer = https://api.basespace.illumina.com/
 apiVersion = v1pre3
 
You can put in '' for appSessionId if you do not have one. ## Get sample files -The samples2files.py script downloads +The download_files.py script downloads the sample-level files from BaseSpace. The user can specify project Id, project name, sample Id, and sample Name. Project Id and project name should not be specified together; similarly sample Id and sample name should not be diff --git a/src/scripts/download_files.py b/src/scripts/download_files.py new file mode 100644 index 0000000..e45c8a2 --- /dev/null +++ b/src/scripts/download_files.py @@ -0,0 +1,223 @@ +################################################################################ +# Copyright 2014 Nils Homer +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +################################################################################ +# This tool was adapted with permission from Mayank Tyagi +# and subsequently updated by Mario Giovacchini https://github.com/mariogiov +################################################################################ +from __future__ import print_function + + +import argparse +import os +import re +import sys + +from BaseSpacePy.api.BaseSpaceAPI import BaseSpaceAPI +from BaseSpacePy.model.QueryParameters import QueryParameters as qp +from ConfigParser import ConfigParser +from functools import partial + + +## TODO use QueryParameters to filter? 
## TODO let user pick Run, File by name/id
## TODO Implement separate script for listing project trees
## TODO Consider implementing separate download_basespace_ fns for projects, samples, etc.

# Convenience wrapper: print() that writes to standard error.
print_stderr = partial(print, file=sys.stderr)

# User-supplied project/sample ids are checked against this before lookup.
_NUMERIC_ID_PATTERN = re.compile(r'^\d+$')


def download_basespace_files(config_file_path=None, client_key=None, client_secret=None, access_token=None,
                             project_id_list=None, project_name_list=None, sample_id_list=None, sample_name_list=None,
                             dry_run=False, output_directory=None, recreate_basespace_dir_tree=True):
    """Download every file belonging to the selected BaseSpace samples.

    Credentials passed as arguments take precedence; any that are missing are
    read from the defaults of the config file (keys ``clientkey``,
    ``clientsecret`` and ``accesstoken``).  The optional ``appsessionid``,
    ``apiserver`` and ``apiversion`` settings come from the config file only.

    Projects are chosen by name and/or id; when neither is given, all projects
    returned by ``getProjectByUser`` (limit 1024) are used.  Samples are chosen
    the same way from the samples of the chosen projects.  All files of the
    chosen samples are then downloaded.

    :param config_file_path: path of an INI-style configuration file (optional)
    :param client_key: the developer app client key
    :param client_secret: the developer app client secret
    :param access_token: the developer app access token
    :param project_id_list: project ids (strings of digits) to restrict to
    :param project_name_list: project names to restrict to
    :param sample_id_list: sample ids (strings of digits) to restrict to
    :param sample_name_list: sample names to restrict to
    :param dry_run: if true, report what would be downloaded but fetch nothing
    :param output_directory: root output directory (default: current directory)
    :param recreate_basespace_dir_tree: passed to ``downloadFile`` as ``createBsDir``
    :raises SystemExit: with status 1 when a required credential is missing
    """
    # Check input parameters / load from config file / defaults
    project_id_list = project_id_list or []
    project_name_list = project_name_list or []
    sample_id_list = sample_id_list or []
    sample_name_list = sample_name_list or []
    if not output_directory:
        output_directory = os.getcwd()
        print_stderr("Output directory not specified; using current directory ({})".format(output_directory))
    else:
        output_directory = os.path.abspath(output_directory)
        if not dry_run:
            safe_makedir(output_directory)

    config_dict = {}
    if config_file_path:
        config_parser = ConfigParser()
        # read() silently skips files that do not exist, leaving the defaults empty.
        config_parser.read(config_file_path)
        config_dict = config_parser.defaults()
    client_key = client_key or config_dict.get('clientkey')
    client_secret = client_secret or config_dict.get('clientsecret')
    access_token = access_token or config_dict.get('accesstoken')
    missing_params = [name for name, value in (("client_key", client_key),
                                               ("client_secret", client_secret),
                                               ("access_token", access_token))
                      if not value]
    if missing_params:
        print_stderr(_missing_credentials_message(config_file_path, missing_params))
        sys.exit(1)
    app_session_id = config_dict.get("appsessionid") or ""
    api_server = config_dict.get("apiserver") or "https://api.basespace.illumina.com"
    api_version = config_dict.get("apiversion") or "v1pre3"

    # Get the API connection object
    myAPI = BaseSpaceAPI(clientKey=client_key, clientSecret=client_secret,
                         apiServer=api_server, version=api_version,
                         appSessionId=app_session_id, AccessToken=access_token)
    basespace_projects = myAPI.getProjectByUser(qp({'Limit' : 1024}))
    user = myAPI.getUserById('current')

    # Narrow down to the user-specified projects (all projects if unspecified)
    project_objects = _select_by_name_or_id(name_list=project_name_list,
                                            id_list=project_id_list,
                                            search_list=basespace_projects,
                                            obj_type="project",
                                            user=user)

    # Narrow down to the user-specified samples (all samples if unspecified)
    basespace_samples = []
    for project_obj in project_objects:
        basespace_samples.extend(project_obj.getSamples(myAPI))
    sample_objects = _select_by_name_or_id(name_list=sample_name_list,
                                           id_list=sample_id_list,
                                           search_list=basespace_samples,
                                           obj_type="sample",
                                           user=user)

    files_to_download = []
    for sample_obj in sample_objects:
        files_to_download.extend(sample_obj.getFiles(myAPI))

    if not files_to_download:
        print_stderr("Error: no files found to download.")
        return

    total = len(files_to_download)
    print_stderr("Found {} files to download: ".format(total))
    for file_obj in files_to_download:
        print_stderr("\t- {}".format(file_obj))
    print_stderr('Downloading files to output directory {}'.format(output_directory))
    if recreate_basespace_dir_tree:
        print_stderr("Recreating BaseSpace project directory tree for file.")
    if dry_run:
        print_stderr("-> Dry run: not downloading any data.")
    for position, file_obj in enumerate(files_to_download, 1):
        print_stderr('[{}/{}] Downloading file "{}"'.format(position, total, file_obj))
        if not dry_run:
            file_obj.downloadFile(api=myAPI, localDir=output_directory,
                                  createBsDir=recreate_basespace_dir_tree)
    if dry_run:
        # Nothing was fetched, so do not claim that files are on disk.
        print_stderr('Dry run completed; no files were downloaded.')
    else:
        print_stderr('Download completed; files are located in "{}"'.format(output_directory))


def _missing_credentials_message(config_file_path, missing_params):
    """Return the error text naming the config file and every missing credential."""
    return ('Error: Required parameters not supplied either in config '
            'file ({}) or via arguments: {}'.format(config_file_path,
                                                    ', '.join(missing_params)))


def _filter_numeric_ids(id_list, obj_type):
    """Return the strictly numeric entries of id_list, reporting the others on stderr."""
    valid_ids = []
    for candidate in id_list:
        if _NUMERIC_ID_PATTERN.match(candidate):
            valid_ids.append(candidate)
        else:
            print_stderr('Error: Invalid format for user-specified {obj_type} id '
                         '"{value}": {obj_type} ids are strictly numeric. Did you mean '
                         'to pass this as a {obj_type} name?'.format(obj_type=obj_type,
                                                                     value=candidate))
    return valid_ids


def _select_by_name_or_id(name_list, id_list, search_list, obj_type, user):
    """Return the objects matching the names and/or ids; search_list itself if no filter is given."""
    if not (name_list or id_list):
        return search_list
    selected = []
    if name_list:
        selected.extend(_select_from_object(filter_list=name_list,
                                            search_list=search_list,
                                            key_attr="Name",
                                            obj_type=obj_type,
                                            user=user))
    if id_list:
        selected.extend(_select_from_object(filter_list=_filter_numeric_ids(id_list, obj_type),
                                            search_list=search_list,
                                            key_attr="Id",
                                            obj_type=obj_type,
                                            user=user))
    return selected


def _select_from_object(filter_list, search_list, key_attr, obj_type=None, user=None):
    """Pick the objects of search_list whose ``key_attr`` attribute is in filter_list.

    Results follow the order of filter_list.  A value with no matching object
    produces a warning on stderr and is otherwise skipped.

    :param filter_list: attribute values requested by the user
    :param search_list: objects to search
    :param key_attr: name of the attribute to match on (e.g. "Name" or "Id")
    :param obj_type: label used in warnings; defaults to the class name of the
                     first object, or "object" when search_list is empty
    :param user: if given, mentioned in warnings
    :returns: list of matching objects
    """
    # If several objects share a key value, the last one in search_list wins.
    object_attr_dict = { getattr(obj, key_attr): obj for obj in search_list }
    if not obj_type:
        # Guard the empty case: indexing search_list[0] would raise IndexError.
        obj_type = type(search_list[0]).__name__ if search_list else "object"
    user_string = 'for user "{}"'.format(user) if user else ""
    object_attr_list = []
    for search_value in filter_list:
        try:
            object_attr_list.append(object_attr_dict[search_value])
        except KeyError:
            print_stderr('Warning: user-specified {obj_type} {key_attr} "{user_value}" '
                         'not found in {obj_type}s {user_string}'.format(obj_type=obj_type,
                                                                         key_attr=key_attr.lower(),
                                                                         user_value=search_value,
                                                                         user_string=user_string))
    return object_attr_list


def safe_makedir(dname, mode=0o777):
    """Make a directory (tree) if it doesn't exist, handling concurrent race
    conditions.

    :param dname: path of the directory to create
    :param mode: permission bits passed to os.makedirs
    :returns: dname
    :raises OSError: if creation fails and dname is still not a directory
    """
    if not os.path.exists(dname):
        try:
            os.makedirs(dname, mode=mode)
        except OSError:
            # Another process may have created it in the meantime; only
            # re-raise when the directory really is not there.
            if not os.path.isdir(dname):
                raise
    return dname


def _build_argument_parser():
    """Build the command-line parser.

    Every option's destination matches a keyword argument of
    download_basespace_files, so the parsed namespace can be passed with ``**``.
    """
    # The text must be passed as description=; the first positional parameter
    # of ArgumentParser is prog, which would replace the program name in usage.
    parser = argparse.ArgumentParser(
        description="Navigate the byzantine corridors of Basespace and download your files to win")

    cred_group = parser.add_argument_group("Credential options (note that specifying these via '-K', '-S', and '-A' is not secure;\n\t\t you are recommended to pass a configuration file with '-c')")
    cred_group.add_argument('-c', '--config', dest="config_file_path", default=os.path.expandvars('$HOME/.basespacepy.cfg'),
                            help='the path to the configuration file (default $HOME/.basespacepy.cfg)')
    cred_group.add_argument('-K', '--client-key', help='the developer.basespace.illumina.com client key')
    cred_group.add_argument('-S', '--client-secret', help='the developer.basespace.illumina.com client token')
    cred_group.add_argument('-A', '--access-token', help='the developer.basespace.illumina.com access token')

    query_group = parser.add_argument_group("Query arguments")
    query_group.add_argument('-s', '--sample-id', action="append", dest="sample_id_list",
                             help='the sample identifier (optional); specify multiple times for multiple samples')
    query_group.add_argument('-x', '--sample-name', action="append", dest="sample_name_list",
                             help='the sample name (optional); specify multiple times for multiple samples')
    query_group.add_argument('-p', '--project-id', action="append", dest="project_id_list",
                             help='the project identifier (optional); specify multiple times for multiple projects')
    query_group.add_argument('-y', '--project-name', action="append", dest="project_name_list",
                             help='the project name (optional); specify multiple times for multiple projects')
    ## Add RunId
    ## Add FileId
    ## Add User

    misc_group = parser.add_argument_group("Miscellaneous arguments")
    misc_group.add_argument('-d', '--dry-run', action='store_true', help='dry run; don\'t download any files')
    misc_group.add_argument('-o', '--output-directory', default=os.getcwd(), help='the directory in which to store the files')
    # store_false: the tree is recreated by default and passing -b turns that
    # off, so the help text has to say so.
    misc_group.add_argument('-b', '--recreate-basespace-dir-tree', action="store_false",
                            help='do not recreate the basespace directory structure in the output directory '
                                 '(it is recreated by default)')
    return parser


if __name__ == '__main__':
    args = vars(_build_argument_parser().parse_args())
    download_basespace_files(**args)
-################################################################################ - -################################################################################ -# This tool was adapted with permission from Mayank Tyagi -################################################################################ - -import os, sys -from optparse import OptionParser, OptionGroup -from urllib2 import Request, urlopen, URLError -from BaseSpacePy.api.BaseSpaceAPI import BaseSpaceAPI -from BaseSpacePy.model.QueryParameters import QueryParameters as qp -import logging - -class Samples: - - logging.basicConfig() - - @staticmethod - def __get_files_to_download(myAPI, projectId, sampleId, sampleName, sampleLimit=1024, sampleFileLmit=1024): - filesToDownload = [] - samples = myAPI.getSamplesByProject(Id=projectId, queryPars=qp({'Limit' : sampleLimit})) - for sample in samples: - if None != sampleId and sampleId != sample.Id: - continue - elif None != sampleName and sampleName != sample.Name: - continue - sampleFiles = myAPI.getSampleFilesById(Id=sample.Id, queryPars=qp({'Limit' : sampleFileLmit})) - for sampleFile in sampleFiles: - filesToDownload.append(sampleFile) - return filesToDownload - - @staticmethod - def download(clientKey=None, clientSecret=None, accessToken=None, sampleId=None, projectId=None, sampleName=None, projectName=None, outputDirectory='\.', createBsDir=True): - ''' - Downloads sample-level files. - - Project Id and project name should - not be specified together; similarly sample Id and sample name should not be - specified together. - - 1. If only a project Id or only a project name is given, all files for all - samples will be downloaded within that project. If additionally a sample Id or - sample name is given, then only the first matching sample within the project - will be downloaded. - 2. If only a sample Id is given, then all files for that sample will be downloaded. - 3. 
If only a sample name is given, then all files within the first project - containing a sample with matching name will be downloaded. - - :param clientKey the Illumina developer app client key - :param clientSecret the Illumina developer app client secret - :param accessToken the Illumina developer app access token - :param sampleId the BaseSpace sample identifier - :param projectId the BaseSpace project identifier - :param sampleName the BaseSpace sample name - :param projectName the BaseSpace project name - :param outputDirectory the root output directory - :param createBsDir true to recreate the path structure within BaseSpace, false otherwise - ''' - appSessionId = '' - apiServer = 'https://api.basespace.illumina.com/' # or 'https://api.cloud-hoth.illumina.com/' - apiVersion = 'v1pre3' - projectLimit = 100 - sampleLimit = 1024 - sampleFileLimit = 1024 - - # init the API - if None != clientKey: - myAPI = BaseSpaceAPI(clientKey, clientSecret, apiServer, apiVersion, appSessionId, accessToken) - else: - myAPI = BaseSpaceAPI(profile='DEFAULT') - - # get the current user - user = myAPI.getUserById('current') - - filesToDownload = [] - if None != projectId: - filesToDownload = Samples.__get_files_to_download(myAPI, projectId, sampleId, sampleName, sampleLimit, sampleFileLimit) - else: - myProjects = myAPI.getProjectByUser(qp({'Limit' : projectLimit})) - for project in myProjects: - projectId = project.Id - if None != projectName and project.Name != projectName: - continue - filesToDownload = Samples.__get_files_to_download(myAPI, projectId, sampleId, sampleName, sampleLimit, sampleFileLimit) - if 0 < len(filesToDownload): - break - print "Will download %d files." 
% len(filesToDownload) - for i in range(len(filesToDownload)): - sampleFile = filesToDownload[i] - print 'Downloading (%d/%d): %s' % ((i+1), len(filesToDownload), str(sampleFile)) - print "File Path: %s" % sampleFile.Path - if not options.dryRun: - sampleFile.downloadFile(myAPI, outputDirectory, createBsDir=createBsDir) - print "Download complete." - -if __name__ == '__main__': - - def check_option(parser, value, name): - if None == value: - print 'Option ' + name + ' required.\n' - parser.print_help() - sys.exit(1) - - parser = OptionParser() - - group = OptionGroup(parser, "Credential options") - group.add_option('-K', '--client-key', help='the developer.basespace.illumina.com client key', dest='clientKey', default=None) - group.add_option('-S', '--client-secret', help='the developer.basespace.illumina.com client token', dest='clientSecret', default=None) - group.add_option('-A', '--access-token', help='the developer.basespace.illumina.com access token', dest='accessToken', default=None) - parser.add_option_group(group) - - group = OptionGroup(parser, "Query options") - group.add_option('-s', '--sample-id', help='the sample identifier (optional)', dest='sampleId', default=None) - group.add_option('-x', '--sample-name', help='the sample name (optional)', dest='sampleName', default=None) - group.add_option('-p', '--project-id', help='the project identifier (optional)', dest='projectId', default=None) - group.add_option('-y', '--project-name', help='the project name (optional)', dest='projectName', default=None) - parser.add_option_group(group) - - group = OptionGroup(parser, "Miscellaneous options") - group.add_option('-d', '--dry-run', help='dry run; do not download the files', dest='dryRun', action='store_true', default=False) - group.add_option('-o', '--output-directory', help='the output directory', dest='outputDirectory', default='./') - group.add_option('-b', '--create-basespace-directory-structure', help='recreate the basespace directory structure in the 
output directory', \ - dest='createBsDir', action='store_false', default=True) - parser.add_option_group(group) - - if len(sys.argv[1:]) < 1: - parser.print_help() - sys.exit(1) - - options, args = parser.parse_args() - if None != options.clientKey: - #check_option(parser, options.clientKey, '-K') - check_option(parser, options.clientSecret, '-S') - check_option(parser, options.accessToken, '-A') - if None == options.projectId and None == options.sampleId and None == options.projectName and None == options.sampleName: - print 'One of the query options must be given.\n' - parser.print_help() - sys.exit(1) - if None != options.sampleId and None != options.sampleName: - print 'Both -s or -y may not be given together.\n' - parser.print_help() - sys.exit(1) - if None != options.projectId and None != options.projectName: - print 'Both -p or -x may not be given together.\n' - parser.print_help() - sys.exit(1) - - Samples.download(options.clientKey, options.clientSecret, options.accessToken, \ - sampleId=options.sampleId, projectId=options.projectId, \ - sampleName=options.sampleName, projectName=options.projectName, \ - outputDirectory=options.outputDirectory, createBsDir=options.createBsDir)