Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/seq_analysis_utils.py @ 29:3cb02adf4326 draft
v0.2.9 Python style improvements
author | peterjc |
---|---|
date | Wed, 01 Feb 2017 09:46:14 -0500 |
parents | 20139cb4c844 |
children | 6d9d7cdf00fc |
comparison
equal
deleted
inserted
replaced
28:22e71e53f534 | 29:3cb02adf4326 |
---|---|
10 import sys | 10 import sys |
11 import os | 11 import os |
12 import subprocess | 12 import subprocess |
13 from time import sleep | 13 from time import sleep |
14 | 14 |
15 __version__ = "0.0.1" | 15 __version__ = "0.0.2" |
16 | |
17 def sys_exit(msg, error_level=1): | |
18 """Print error message to stderr and quit with given error level.""" | |
19 sys.stderr.write("%s\n" % msg) | |
20 sys.exit(error_level) | |
21 | 16 |
22 try: | 17 try: |
23 from multiprocessing import cpu_count | 18 from multiprocessing import cpu_count |
24 except ImportError: | 19 except ImportError: |
25 #Must be under Python 2.5, this is copied from multiprocessing: | 20 # Must be under Python 2.5, this is copied from multiprocessing: |
26 def cpu_count(): | 21 def cpu_count(): |
27 """Returns the number of CPUs in the system.""" | 22 """Returns the number of CPUs in the system.""" |
28 if sys.platform == 'win32': | 23 if sys.platform == 'win32': |
29 try: | 24 try: |
30 num = int(os.environ['NUMBER_OF_PROCESSORS']) | 25 num = int(os.environ['NUMBER_OF_PROCESSORS']) |
52 | 47 |
53 | 48 |
54 def thread_count(command_line_arg, default=1): | 49 def thread_count(command_line_arg, default=1): |
55 try: | 50 try: |
56 num = int(command_line_arg) | 51 num = int(command_line_arg) |
57 except: | 52 except ValueError: |
58 num = default | 53 num = default |
59 if num < 1: | 54 if num < 1: |
60 sys_exit("Threads argument %r is not a positive integer" % command_line_arg) | 55 sys.exit("Threads argument %r is not a positive integer" % command_line_arg) |
61 #Cap this with the physical limit of the machine, | 56 # Cap this with the physical limit of the machine, |
62 try: | 57 try: |
63 num = min(num, cpu_count()) | 58 num = min(num, cpu_count()) |
64 except NotImplementedError: | 59 except NotImplementedError: |
65 pass | 60 pass |
66 #For debugging, | 61 # For debugging, |
67 #hostname = os.environ.get("HOSTNAME", "this machine") | 62 # hostname = os.environ.get("HOSTNAME", "this machine") |
68 #sys.stderr.write("Using %i cores on %s\n" % (num, hostname)) | 63 # sys.stderr.write("Using %i cores on %s\n" % (num, hostname)) |
69 return num | 64 return num |
70 | 65 |
71 | 66 |
72 def fasta_iterator(filename, max_len=None, truncate=None): | 67 def fasta_iterator(filename, max_len=None, truncate=None): |
73 """Simple FASTA parser yielding tuples of (title, sequence) strings.""" | 68 """Simple FASTA parser yielding tuples of (title, sequence) strings.""" |
77 if line.startswith(">"): | 72 if line.startswith(">"): |
78 if title: | 73 if title: |
79 if truncate: | 74 if truncate: |
80 seq = seq[:truncate] | 75 seq = seq[:truncate] |
81 if max_len and len(seq) > max_len: | 76 if max_len and len(seq) > max_len: |
82 raise ValueError("Sequence %s is length %i, max length %i" \ | 77 raise ValueError("Sequence %s is length %i, max length %i" |
83 % (title.split()[0], len(seq), max_len)) | 78 % (title.split()[0], len(seq), max_len)) |
84 yield title, seq | 79 yield title, seq |
85 title = line[1:].rstrip() | 80 title = line[1:].rstrip() |
86 seq = "" | 81 seq = "" |
87 elif title: | 82 elif title: |
88 seq += line.strip() | 83 seq += line.strip() |
89 elif not line.strip() or line.startswith("#"): | 84 elif not line.strip() or line.startswith("#"): |
90 #Ignore blank lines, and any comment lines | 85 # Ignore blank lines, and any comment lines |
91 #between records (starting with hash). | 86 # between records (starting with hash). |
92 pass | 87 pass |
93 else: | 88 else: |
94 handle.close() | 89 handle.close() |
95 raise ValueError("Bad FASTA line %r" % line) | 90 raise ValueError("Bad FASTA line %r" % line) |
96 handle.close() | 91 handle.close() |
97 if title: | 92 if title: |
98 if truncate: | 93 if truncate: |
99 seq = seq[:truncate] | 94 seq = seq[:truncate] |
100 if max_len and len(seq) > max_len: | 95 if max_len and len(seq) > max_len: |
101 raise ValueError("Sequence %s is length %i, max length %i" \ | 96 raise ValueError("Sequence %s is length %i, max length %i" |
102 % (title.split()[0], len(seq), max_len)) | 97 % (title.split()[0], len(seq), max_len)) |
103 yield title, seq | 98 yield title, seq |
104 raise StopIteration | 99 raise StopIteration |
100 | |
105 | 101 |
106 def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None): | 102 def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None): |
107 """Split FASTA file into sub-files each of at most n sequences. | 103 """Split FASTA file into sub-files each of at most n sequences. |
108 | 104 |
109 Returns a list of the filenames used (based on the input filename). | 105 Returns a list of the filenames used (based on the input filename). |
130 handle = open(new_filename, "w") | 126 handle = open(new_filename, "w") |
131 if keep_descr: | 127 if keep_descr: |
132 for title, seq in records: | 128 for title, seq in records: |
133 handle.write(">%s\n" % title) | 129 handle.write(">%s\n" % title) |
134 for i in range(0, len(seq), 60): | 130 for i in range(0, len(seq), 60): |
135 handle.write(seq[i:i+60] + "\n") | 131 handle.write(seq[i:i + 60] + "\n") |
136 else: | 132 else: |
137 for title, seq in records: | 133 for title, seq in records: |
138 handle.write(">%s\n" % title.split()[0]) | 134 handle.write(">%s\n" % title.split()[0]) |
139 for i in range(0, len(seq), 60): | 135 for i in range(0, len(seq), 60): |
140 handle.write(seq[i:i+60] + "\n") | 136 handle.write(seq[i:i + 60] + "\n") |
141 handle.close() | 137 handle.close() |
142 files.append(new_filename) | 138 files.append(new_filename) |
143 #print "%i records in %s" % (len(records), new_filename) | 139 # print "%i records in %s" % (len(records), new_filename) |
144 except ValueError, err: | 140 except ValueError, err: |
145 #Max length failure from parser - clean up | 141 # Max length failure from parser - clean up |
146 try: | 142 try: |
147 handle.close() | 143 handle.close() |
148 except: | 144 except Exception: |
149 pass | 145 pass |
150 for f in files: | 146 for f in files: |
151 if os.path.isfile(f): | 147 if os.path.isfile(f): |
152 os.remove(f) | 148 os.remove(f) |
153 raise err | 149 raise err |
154 for f in files: | 150 for f in files: |
155 assert os.path.isfile(f), "Missing split file %r (!??)" % f | 151 assert os.path.isfile(f), "Missing split file %r (!??)" % f |
156 return files | 152 return files |
157 | 153 |
154 | |
158 def run_jobs(jobs, threads, pause=10, verbose=False): | 155 def run_jobs(jobs, threads, pause=10, verbose=False): |
159 """Takes list of cmd strings, returns dict with error levels.""" | 156 """Takes list of cmd strings, returns dict with error levels.""" |
160 pending = jobs[:] | 157 pending = jobs[:] |
161 running = [] | 158 running = [] |
162 results = {} | 159 results = {} |
163 if threads == 1: | 160 if threads == 1: |
164 #Special case this for speed, don't need the waits | 161 # Special case this for speed, don't need the waits |
165 for cmd in jobs: | 162 for cmd in jobs: |
166 results[cmd] = subprocess.call(cmd, shell=True) | 163 results[cmd] = subprocess.call(cmd, shell=True) |
167 return results | 164 return results |
168 while pending or running: | 165 while pending or running: |
169 #See if any have finished | 166 # See if any have finished |
170 for (cmd, process) in running: | 167 for (cmd, process) in running: |
171 return_code = process.poll() #non-blocking | 168 return_code = process.poll() # non-blocking |
172 if return_code is not None: | 169 if return_code is not None: |
173 results[cmd] = return_code | 170 results[cmd] = return_code |
174 running = [(cmd, process) for (cmd, process) in running \ | 171 running = [(cmd, process) for (cmd, process) in running |
175 if cmd not in results] | 172 if cmd not in results] |
176 if verbose: | 173 if verbose: |
177 print "%i jobs pending, %i running, %i completed" \ | 174 print "%i jobs pending, %i running, %i completed" \ |
178 % (len(pending), len(running), len(results)) | 175 % (len(pending), len(running), len(results)) |
179 #See if we can start any new threads | 176 # See if we can start any new threads |
180 while pending and len(running) < threads: | 177 while pending and len(running) < threads: |
181 cmd = pending.pop(0) | 178 cmd = pending.pop(0) |
182 if verbose: | 179 if verbose: |
183 print cmd | 180 print cmd |
184 process = subprocess.Popen(cmd, shell=True) | 181 process = subprocess.Popen(cmd, shell=True) |
185 running.append((cmd, process)) | 182 running.append((cmd, process)) |
186 #Loop... | 183 # Loop... |
187 sleep(pause) | 184 sleep(pause) |
188 if verbose: | 185 if verbose: |
189 print "%i jobs completed" % len(results) | 186 print "%i jobs completed" % len(results) |
190 assert set(jobs) == set(results) | 187 assert set(jobs) == set(results) |
191 return results | 188 return results |