Mercurial > repos > devteam > dgidb_annotator
comparison dgidb_annotator.py @ 0:28d72b995c6b draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/dgidb_annotator commit 5a4e0ca9992af3a6e5ed2b533f04bb82ce761e0b
| author | devteam |
|---|---|
| date | Mon, 09 Nov 2015 11:29:28 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:28d72b995c6b |
|---|---|
| 1 ''' | |
| 2 Annotates a tabular file with information from the Drug-Gene Interaction (DGI) database. | |
| 3 ''' | |
| 4 | |
| 5 import optparse, json, urllib2, sys | |
| 6 | |
def _gene_name_from_line(line, gene_name_col):
    """Return the normalized gene name held in one column of a tabular line.

    Annotations may look like ``<gene_name>(<splicing_info>)`` or
    ``<gene_name>;<gene_name>(<splicing_info>)``; only the first gene name
    is kept.  An empty string is returned when the line has too few columns
    (e.g. a blank line) so callers can treat it as "no gene".

    line          -- one raw, tab-separated input line.
    gene_name_col -- zero-based index of the gene name column.
    """
    try:
        entry = line.split('\t')[gene_name_col]
    except IndexError:
        return ''
    return entry.strip().split(';')[0].split('(')[0].strip()


def _split_gene_queries(genes, max_query_size):
    """Group gene names into comma-joined query strings of bounded length.

    Genes are never cut in half.  Each returned string is at most
    max_query_size characters long, unless a single gene name alone is
    longer than that, in which case it forms a query of its own.  An empty
    gene list yields an empty list (no query at all).
    """
    queries = []
    current = []
    current_len = 0
    for gene in genes:
        # A comma is only needed when the chunk already holds a gene.
        added_len = len(gene) + (1 if current else 0)
        if current and current_len + added_len > max_query_size:
            queries.append(','.join(current))
            current = []
            current_len = 0
            added_len = len(gene)
        current.append(gene)
        current_len += added_len
    if current:
        queries.append(','.join(current))
    return queries


def __main__():
    """Annotate a tabular file with drug-gene interactions from DGIdb.

    Command line: ``-g COL`` (1-based gene name column, required), ``-a`` to
    also print lines without any interaction, ``-e`` to restrict results to
    expert curated sources, plus an optional input path (stdin otherwise).
    For every interaction found for a line's gene, the line is written to
    stdout followed by gene name, long name, categories, interaction type,
    drug name and source.
    """
    # Local, version-tolerant imports so the block runs on Python 2 and 3.
    try:
        from urllib.parse import quote
        from urllib.request import urlopen
    except ImportError:
        from urllib import quote
        from urllib2 import urlopen

    # -- Parse command line. --
    parser = optparse.OptionParser()
    parser.add_option('-g', '--gene-name-col', dest='gene_name_col', help='column of gene names')
    parser.add_option('-a', '--print-all', dest='print_all', action='store_true', help='print all lines, even though without a result')
    parser.add_option('-e', '--expert-curated', dest='expert_curated', action='store_true', help='use only expert curated results')
    (options, args) = parser.parse_args()
    if options.gene_name_col is None:
        parser.error('the gene name column (-g/--gene-name-col) is required')
    try:
        gene_name_col = int(options.gene_name_col) - 1
    except ValueError:
        parser.error('the gene name column must be an integer')

    # -- Read input, remembering each line's gene name. --
    # The gene is extracted from the raw line (before stripping) so that
    # empty leading fields cannot shift the column index.
    input_file = open(args[0], 'r') if args else sys.stdin
    try:
        records = [
            (_gene_name_from_line(line, gene_name_col), line.strip())
            for line in input_file
        ]
    finally:
        if input_file is not sys.stdin:
            input_file.close()

    # Unique, non-empty gene names in first-seen order (deterministic queries).
    seen = set()
    unique_genes = []
    for gene, _ in records:
        if gene and gene not in seen:
            seen.add(gene)
            unique_genes.append(gene)

    # -- Set up gene list queries. --
    # Keep queries to ~8K because this is near the max HTTP request length.
    MAX_QUERY_SIZE = 8000
    queries = _split_gene_queries(
        [quote(gene, safe='') for gene in unique_genes], MAX_QUERY_SIZE
    )

    # -- Query and process results. --
    # NOTE(review): endpoint reproduced unchanged from the original script;
    # confirm it is still the service to query.
    matched_terms = []
    for genes in queries:
        query_str = 'http://dgidb.genome.wustl.edu/api/v1/interactions.json?genes=%s' % genes
        if options.expert_curated:
            query_str += '&source_trust_levels=Expert%20curated'
        response = urlopen(query_str)
        try:
            raw_results = response.read()
        finally:
            response.close()
        matched_terms.extend(json.loads(raw_results.decode('utf-8'))['matchedTerms'])

    # Map each search term to its tab-joined annotation rows.  A dedicated
    # dict is used so response keys can never collide with gene names.
    annotations = {}
    for term in matched_terms:
        gene_fields = [term['geneName'], term['geneLongName'], ','.join(term['geneCategories'])]
        annotations[term['searchTerm']] = [
            '\t'.join(gene_fields + [
                interaction['interactionType'], interaction['drugName'], interaction['source']
            ])
            for interaction in term['interactions']
        ]

    # -- Annotate input file and produce output. --
    write = sys.stdout.write
    for gene, line in records:
        rows = annotations.get(gene)
        if rows:
            for row in rows:
                write(line + '\t' + row + '\n')
        elif options.print_all:
            # Also covers matched genes that have no interactions.
            write(line + '\n')
| 86 | |
# Run the annotator only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()
