Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/tmhmm2.py @ 0:a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
| author | peterjc |
|---|---|
| date | Tue, 07 Jun 2011 17:37:26 -0400 |
| parents | |
| children | 9a8a7f680dd6 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a2eeeaa6f75e |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """Wrapper for TMHMM v2.0 for use in Galaxy. | |
| 3 | |
| 4 This script takes exactly two command line arguments - an input protein FASTA | |
| 5 filename and an output tabular filename. It then calls the standalone TMHMM | |
| 6 v2.0 program (not the webservice) requesting the short output (one line per | |
| 7 protein). | |
| 8 | |
| 9 First major feature is cleaning up the tabular output. The raw output from | |
| 10 TMHMM v2.0 looks like this (six columns tab separated): | |
| 11 | |
| 12 gi|2781234|pdb|1JLY|B len=304 ExpAA=0.01 First60=0.00 PredHel=0 Topology=o | |
| 13 gi|4959044|gb|AAD34209.1|AF069992_1 len=600 ExpAA=0.00 First60=0.00 PredHel=0 Topology=o | |
| 14 gi|671626|emb|CAA85685.1| len=473 ExpAA=0.19 First60=0.00 PredHel=0 Topology=o | |
| 15 gi|3298468|dbj|BAA31520.1| len=107 ExpAA=59.37 First60=31.17 PredHel=3 Topology=o23-45i52-74o89-106i | |
| 16 | |
| 17 In order to make it easier to use in Galaxy, this wrapper script simplifies | |
| 18 this to remove the redundant tags, and instead adds a comment line at the | |
| 19 top with the column names: | |
| 20 | |
| 21 #ID len ExpAA First60 PredHel Topology | |
| 22 gi|2781234|pdb|1JLY|B 304 0.01 60 0.00 0 o | |
| 23 gi|4959044|gb|AAD34209.1|AF069992_1 600 0.00 0 0.00 0 o | |
| 24 gi|671626|emb|CAA85685.1| 473 0.19 0.00 0 o | |
| 25 gi|3298468|dbj|BAA31520.1| 107 59.37 31.17 3 o23-45i52-74o89-106i | |
| 26 | |
| 27 The second major potential feature is taking advantage of multiple cores | |
| 28 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file | |
| 29 into chunks and running multiple copies of TMHMM in parallel. I would normally | |
| 30 use Python's multiprocessing library in this situation but it requires at | |
| 31 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. | |
| 32 """ | |
| 33 import sys | |
| 34 import os | |
| 35 from seq_analysis_utils import stop_err, split_fasta, run_jobs | |
| 36 | |
| 37 FASTA_CHUNK = 500 | |
| 38 | |
| 39 if len(sys.argv) != 4: | |
| 40 stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") | |
| 41 try: | |
| 42 num_threads = int(sys.argv[1]) | |
| 43 except: | |
| 44 num_threads = 0 | |
| 45 if num_threads < 1: | |
| 46 stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) | |
| 47 fasta_file = sys.argv[2] | |
| 48 tabular_file = sys.argv[3] | |
| 49 | |
| 50 def clean_tabular(raw_handle, out_handle): | |
| 51 """Clean up tabular TMHMM output.""" | |
| 52 for line in raw_handle: | |
| 53 if not line: | |
| 54 continue | |
| 55 parts = line.rstrip("\r\n").split("\t") | |
| 56 try: | |
| 57 identifier, length, expAA, first60, predhel, topology = parts | |
| 58 except: | |
| 59 assert len(parts)!=6 | |
| 60 stop_err("Bad line: %r" % line) | |
| 61 assert length.startswith("len="), line | |
| 62 length = length[4:] | |
| 63 assert expAA.startswith("ExpAA="), line | |
| 64 expAA = expAA[6:] | |
| 65 assert first60.startswith("First60="), line | |
| 66 first60 = first60[8:] | |
| 67 assert predhel.startswith("PredHel="), line | |
| 68 predhel = predhel[8:] | |
| 69 assert topology.startswith("Topology="), line | |
| 70 topology = topology[9:] | |
| 71 out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" \ | |
| 72 % (identifier, length, expAA, first60, predhel, topology)) | |
| 73 | |
| 74 fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK) | |
| 75 temp_files = [f+".out" for f in fasta_files] | |
| 76 jobs = ["tmhmm %s > %s" % (fasta, temp) | |
| 77 for fasta, temp in zip(fasta_files, temp_files)] | |
| 78 | |
| 79 def clean_up(file_list): | |
| 80 for f in file_list: | |
| 81 if os.path.isfile(f): | |
| 82 os.remove(f) | |
| 83 | |
| 84 if len(jobs) > 1 and num_threads > 1: | |
| 85 #A small "info" message for Galaxy to show the user. | |
| 86 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | |
| 87 results = run_jobs(jobs, num_threads) | |
| 88 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | |
| 89 error_level = results[cmd] | |
| 90 if error_level: | |
| 91 try: | |
| 92 output = open(temp).readline() | |
| 93 except IOError: | |
| 94 output = "" | |
| 95 clean_up(fasta_files) | |
| 96 clean_up(temp_files) | |
| 97 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | |
| 98 error_level) | |
| 99 del results | |
| 100 del jobs | |
| 101 | |
| 102 out_handle = open(tabular_file, "w") | |
| 103 out_handle.write("#ID\tlen\tExpAA\tFirst60\tPredHel\tTopology\n") | |
| 104 for temp in temp_files: | |
| 105 data_handle = open(temp) | |
| 106 clean_tabular(data_handle, out_handle) | |
| 107 data_handle.close() | |
| 108 out_handle.close() | |
| 109 | |
| 110 clean_up(fasta_files) | |
| 111 clean_up(temp_files) |
