comparison tools/protein_analysis/seq_analysis_utils.py @ 29:3cb02adf4326 draft

v0.2.9 Python style improvements
author peterjc
date Wed, 01 Feb 2017 09:46:14 -0500
parents 20139cb4c844
children 6d9d7cdf00fc
comparison
equal deleted inserted replaced
28:22e71e53f534 29:3cb02adf4326
10 import sys 10 import sys
11 import os 11 import os
12 import subprocess 12 import subprocess
13 from time import sleep 13 from time import sleep
14 14
15 __version__ = "0.0.1" 15 __version__ = "0.0.2"
16
17 def sys_exit(msg, error_level=1):
18 """Print error message to stderr and quit with given error level."""
19 sys.stderr.write("%s\n" % msg)
20 sys.exit(error_level)
21 16
22 try: 17 try:
23 from multiprocessing import cpu_count 18 from multiprocessing import cpu_count
24 except ImportError: 19 except ImportError:
25 #Must be under Python 2.5, this is copied from multiprocessing: 20 # Must be under Python 2.5, this is copied from multiprocessing:
26 def cpu_count(): 21 def cpu_count():
27 """Returns the number of CPUs in the system.""" 22 """Returns the number of CPUs in the system."""
28 if sys.platform == 'win32': 23 if sys.platform == 'win32':
29 try: 24 try:
30 num = int(os.environ['NUMBER_OF_PROCESSORS']) 25 num = int(os.environ['NUMBER_OF_PROCESSORS'])
52 47
53 48
54 def thread_count(command_line_arg, default=1): 49 def thread_count(command_line_arg, default=1):
55 try: 50 try:
56 num = int(command_line_arg) 51 num = int(command_line_arg)
57 except: 52 except ValueError:
58 num = default 53 num = default
59 if num < 1: 54 if num < 1:
60 sys_exit("Threads argument %r is not a positive integer" % command_line_arg) 55 sys.exit("Threads argument %r is not a positive integer" % command_line_arg)
61 #Cap this with the physical limit of the machine, 56 # Cap this with the physical limit of the machine,
62 try: 57 try:
63 num = min(num, cpu_count()) 58 num = min(num, cpu_count())
64 except NotImplementedError: 59 except NotImplementedError:
65 pass 60 pass
66 #For debugging, 61 # For debugging,
67 #hostname = os.environ.get("HOSTNAME", "this machine") 62 # hostname = os.environ.get("HOSTNAME", "this machine")
68 #sys.stderr.write("Using %i cores on %s\n" % (num, hostname)) 63 # sys.stderr.write("Using %i cores on %s\n" % (num, hostname))
69 return num 64 return num
70 65
71 66
72 def fasta_iterator(filename, max_len=None, truncate=None): 67 def fasta_iterator(filename, max_len=None, truncate=None):
73 """Simple FASTA parser yielding tuples of (title, sequence) strings.""" 68 """Simple FASTA parser yielding tuples of (title, sequence) strings."""
77 if line.startswith(">"): 72 if line.startswith(">"):
78 if title: 73 if title:
79 if truncate: 74 if truncate:
80 seq = seq[:truncate] 75 seq = seq[:truncate]
81 if max_len and len(seq) > max_len: 76 if max_len and len(seq) > max_len:
82 raise ValueError("Sequence %s is length %i, max length %i" \ 77 raise ValueError("Sequence %s is length %i, max length %i"
83 % (title.split()[0], len(seq), max_len)) 78 % (title.split()[0], len(seq), max_len))
84 yield title, seq 79 yield title, seq
85 title = line[1:].rstrip() 80 title = line[1:].rstrip()
86 seq = "" 81 seq = ""
87 elif title: 82 elif title:
88 seq += line.strip() 83 seq += line.strip()
89 elif not line.strip() or line.startswith("#"): 84 elif not line.strip() or line.startswith("#"):
90 #Ignore blank lines, and any comment lines 85 # Ignore blank lines, and any comment lines
91 #between records (starting with hash). 86 # between records (starting with hash).
92 pass 87 pass
93 else: 88 else:
94 handle.close() 89 handle.close()
95 raise ValueError("Bad FASTA line %r" % line) 90 raise ValueError("Bad FASTA line %r" % line)
96 handle.close() 91 handle.close()
97 if title: 92 if title:
98 if truncate: 93 if truncate:
99 seq = seq[:truncate] 94 seq = seq[:truncate]
100 if max_len and len(seq) > max_len: 95 if max_len and len(seq) > max_len:
101 raise ValueError("Sequence %s is length %i, max length %i" \ 96 raise ValueError("Sequence %s is length %i, max length %i"
102 % (title.split()[0], len(seq), max_len)) 97 % (title.split()[0], len(seq), max_len))
103 yield title, seq 98 yield title, seq
104 raise StopIteration 99 raise StopIteration
100
105 101
106 def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None): 102 def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None):
107 """Split FASTA file into sub-files each of at most n sequences. 103 """Split FASTA file into sub-files each of at most n sequences.
108 104
109 Returns a list of the filenames used (based on the input filename). 105 Returns a list of the filenames used (based on the input filename).
130 handle = open(new_filename, "w") 126 handle = open(new_filename, "w")
131 if keep_descr: 127 if keep_descr:
132 for title, seq in records: 128 for title, seq in records:
133 handle.write(">%s\n" % title) 129 handle.write(">%s\n" % title)
134 for i in range(0, len(seq), 60): 130 for i in range(0, len(seq), 60):
135 handle.write(seq[i:i+60] + "\n") 131 handle.write(seq[i:i + 60] + "\n")
136 else: 132 else:
137 for title, seq in records: 133 for title, seq in records:
138 handle.write(">%s\n" % title.split()[0]) 134 handle.write(">%s\n" % title.split()[0])
139 for i in range(0, len(seq), 60): 135 for i in range(0, len(seq), 60):
140 handle.write(seq[i:i+60] + "\n") 136 handle.write(seq[i:i + 60] + "\n")
141 handle.close() 137 handle.close()
142 files.append(new_filename) 138 files.append(new_filename)
143 #print "%i records in %s" % (len(records), new_filename) 139 # print "%i records in %s" % (len(records), new_filename)
144 except ValueError, err: 140 except ValueError, err:
145 #Max length failure from parser - clean up 141 # Max length failure from parser - clean up
146 try: 142 try:
147 handle.close() 143 handle.close()
148 except: 144 except Exception:
149 pass 145 pass
150 for f in files: 146 for f in files:
151 if os.path.isfile(f): 147 if os.path.isfile(f):
152 os.remove(f) 148 os.remove(f)
153 raise err 149 raise err
154 for f in files: 150 for f in files:
155 assert os.path.isfile(f), "Missing split file %r (!??)" % f 151 assert os.path.isfile(f), "Missing split file %r (!??)" % f
156 return files 152 return files
157 153
154
158 def run_jobs(jobs, threads, pause=10, verbose=False): 155 def run_jobs(jobs, threads, pause=10, verbose=False):
159 """Takes list of cmd strings, returns dict with error levels.""" 156 """Takes list of cmd strings, returns dict with error levels."""
160 pending = jobs[:] 157 pending = jobs[:]
161 running = [] 158 running = []
162 results = {} 159 results = {}
163 if threads == 1: 160 if threads == 1:
164 #Special case this for speed, don't need the waits 161 # Special case this for speed, don't need the waits
165 for cmd in jobs: 162 for cmd in jobs:
166 results[cmd] = subprocess.call(cmd, shell=True) 163 results[cmd] = subprocess.call(cmd, shell=True)
167 return results 164 return results
168 while pending or running: 165 while pending or running:
169 #See if any have finished 166 # See if any have finished
170 for (cmd, process) in running: 167 for (cmd, process) in running:
171 return_code = process.poll() #non-blocking 168 return_code = process.poll() # non-blocking
172 if return_code is not None: 169 if return_code is not None:
173 results[cmd] = return_code 170 results[cmd] = return_code
174 running = [(cmd, process) for (cmd, process) in running \ 171 running = [(cmd, process) for (cmd, process) in running
175 if cmd not in results] 172 if cmd not in results]
176 if verbose: 173 if verbose:
177 print "%i jobs pending, %i running, %i completed" \ 174 print "%i jobs pending, %i running, %i completed" \
178 % (len(pending), len(running), len(results)) 175 % (len(pending), len(running), len(results))
179 #See if we can start any new threads 176 # See if we can start any new threads
180 while pending and len(running) < threads: 177 while pending and len(running) < threads:
181 cmd = pending.pop(0) 178 cmd = pending.pop(0)
182 if verbose: 179 if verbose:
183 print cmd 180 print cmd
184 process = subprocess.Popen(cmd, shell=True) 181 process = subprocess.Popen(cmd, shell=True)
185 running.append((cmd, process)) 182 running.append((cmd, process))
186 #Loop... 183 # Loop...
187 sleep(pause) 184 sleep(pause)
188 if verbose: 185 if verbose:
189 print "%i jobs completed" % len(results) 186 print "%i jobs completed" % len(results)
190 assert set(jobs) == set(results) 187 assert set(jobs) == set(results)
191 return results 188 return results