0
|
1 #!/usr/bin/env python3
|
|
2 """
|
|
3 Reconstructs germline sequences from alignment data
|
|
4 """
|
|
5 # Info
|
|
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
|
|
7 from changeo import __version__, __date__
|
|
8
|
|
9 # Imports
|
|
10 import os
|
|
11 import sys
|
|
12 from argparse import ArgumentParser
|
|
13 from collections import OrderedDict
|
|
14 from textwrap import dedent
|
|
15 from time import time
|
|
16
|
|
17 # Presto and change imports
|
|
18 from presto.Defaults import default_out_args
|
|
19 from presto.IO import getOutputHandle, printLog, printProgress
|
|
20 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
|
|
21 from changeo.IO import getDbWriter, readDbFile, countDbFile, getRepo
|
|
22 from changeo.Receptor import allele_regex, parseAllele
|
|
23
|
|
24 # Defaults
|
|
25 default_germ_types = 'dmask'
|
|
26 default_v_field = 'V_CALL'
|
|
27 default_seq_field = 'SEQUENCE_IMGT'
|
|
28
|
|
29
|
|
30 def joinGermline(align, repo_dict, germ_types, v_field, seq_field):
|
|
31 """
|
|
32 Join gapped germline sequences aligned with sample sequences
|
|
33
|
|
34 Arguments:
|
|
35 align = iterable yielding dictionaries of sample sequence data
|
|
36 repo_dict = dictionary of IMGT gapped germline sequences
|
|
37 germ_types = types of germline sequences to be output
|
|
38 (full germline, D-region masked, only V-region germline)
|
|
39 v_field = field in which to look for V call
|
|
40 seq_field = field in which to look for sequence
|
|
41
|
|
42 Returns:
|
|
43 dictionary of germline_type: germline_sequence
|
|
44 """
|
|
45 j_field = 'J_CALL'
|
|
46 germlines = {'full': '', 'dmask': '', 'vonly': ''}
|
|
47 result_log = OrderedDict()
|
|
48 result_log['ID'] = align['SEQUENCE_ID']
|
|
49
|
|
50 # Find germline V-region gene
|
|
51 if v_field == 'V_CALL_GENOTYPED':
|
|
52 vgene = parseAllele(align[v_field], allele_regex, 'list')
|
|
53 vkey = vgene
|
|
54 else:
|
|
55 vgene = parseAllele(align[v_field], allele_regex, 'first')
|
|
56 vkey = (vgene, )
|
|
57
|
|
58 # Build V-region germline
|
|
59 if vgene is not None:
|
|
60 result_log['V_CALL'] = ','.join(vkey)
|
|
61 if vkey in repo_dict:
|
|
62 vseq = repo_dict[vkey]
|
|
63 # Germline start
|
|
64 try: vstart = int(align['V_GERM_START_IMGT']) - 1
|
|
65 except (TypeError, ValueError): vstart = 0
|
|
66 # Germline length
|
|
67 try: vlen = int(align['V_GERM_LENGTH_IMGT'])
|
|
68 except (TypeError, ValueError): vlen = 0
|
|
69 # TODO: not sure what this line is doing here. it no make no sense.
|
|
70 vpad = vlen - len(vseq[vstart:])
|
|
71 if vpad < 0: vpad = 0
|
|
72 germ_vseq = vseq[vstart:(vstart + vlen)] + ('N' * vpad)
|
|
73 else:
|
|
74 result_log['ERROR'] = 'Germline %s not in repertoire' % ','.join(vkey)
|
|
75 return result_log, germlines
|
|
76 else:
|
|
77 result_log['V_CALL'] = None
|
|
78 try: vlen = int(align['V_GERM_LENGTH_IMGT'])
|
|
79 except (TypeError, ValueError): vlen = 0
|
|
80 germ_vseq = 'N' * vlen
|
|
81
|
|
82 # Find germline D-region gene
|
|
83 dgene = parseAllele(align['D_CALL'], allele_regex, 'first')
|
|
84
|
|
85 # Build D-region germline
|
|
86 if dgene is not None:
|
|
87 result_log['D_CALL'] = dgene
|
|
88 dkey = (dgene, )
|
|
89 if dkey in repo_dict:
|
|
90 dseq = repo_dict[dkey]
|
|
91 # Germline start
|
|
92 try: dstart = int(align['D_GERM_START']) - 1
|
|
93 except (TypeError, ValueError): dstart = 0
|
|
94 # Germline length
|
|
95 try: dlen = int(align['D_GERM_LENGTH'])
|
|
96 except (TypeError, ValueError): dlen = 0
|
|
97 germ_dseq = repo_dict[dkey][dstart:(dstart + dlen)]
|
|
98 else:
|
|
99 result_log['ERROR'] = 'Germline %s not in repertoire' % dgene
|
|
100 return result_log, germlines
|
|
101 else:
|
|
102 result_log['D_CALL'] = None
|
|
103 germ_dseq = ''
|
|
104
|
|
105 # Find germline J-region gene
|
|
106 jgene = parseAllele(align[j_field], allele_regex, 'first')
|
|
107
|
|
108 # Build D-region germline
|
|
109 if jgene is not None:
|
|
110 result_log['J_CALL'] = jgene
|
|
111 jkey = (jgene, )
|
|
112 if jkey in repo_dict:
|
|
113 jseq = repo_dict[jkey]
|
|
114 # Germline start
|
|
115 try: jstart = int(align['J_GERM_START']) - 1
|
|
116 except (TypeError, ValueError): jstart = 0
|
|
117 # Germline length
|
|
118 try: jlen = int(align['J_GERM_LENGTH'])
|
|
119 except (TypeError, ValueError): jlen = 0
|
|
120 # TODO: not sure what this line is doing either
|
|
121 jpad = jlen - len(jseq[jstart:])
|
|
122 if jpad < 0: jpad = 0
|
|
123 germ_jseq = jseq[jstart:(jstart + jlen)] + ('N' * jpad)
|
|
124 else:
|
|
125 result_log['ERROR'] = 'Germline %s not in repertoire' % jgene
|
|
126 return result_log, germlines
|
|
127 else:
|
|
128 result_log['J_CALL'] = None
|
|
129 try: jlen = int(align['J_GERM_LENGTH'])
|
|
130 except (TypeError, ValueError): jlen = 0
|
|
131 germ_jseq = 'N' * jlen
|
|
132
|
|
133 # Assemble pieces starting with V-region
|
|
134 germ_seq = germ_vseq
|
|
135 regions = 'V' * len(germ_vseq)
|
|
136
|
|
137 # Nucleotide additions before D (before J for light chains)
|
|
138 try: n1_len = int(align['N1_LENGTH'])
|
|
139 except (TypeError, ValueError): n1_len = 0
|
|
140 if n1_len < 0:
|
|
141 result_log['ERROR'] = 'N1_LENGTH is negative'
|
|
142 return result_log, germlines
|
|
143
|
|
144 germ_seq += 'N' * n1_len
|
|
145 regions += 'N' * n1_len
|
|
146
|
|
147 # Add D-region
|
|
148 germ_seq += germ_dseq
|
|
149 regions += 'D' * len(germ_dseq)
|
|
150 #print 'VD>', germ_seq, '\nVD>', regions
|
|
151
|
|
152 # Nucleotide additions after D (heavy chains only)
|
|
153 try: n2_len = int(align['N2_LENGTH'])
|
|
154 except (TypeError, ValueError): n2_len = 0
|
|
155 if n2_len < 0:
|
|
156 result_log['ERROR'] = 'N2_LENGTH is negative'
|
|
157 return result_log, germlines
|
|
158
|
|
159 germ_seq += 'N' * n2_len
|
|
160 regions += 'N' * n2_len
|
|
161
|
|
162 # Add J-region
|
|
163 germ_seq += germ_jseq
|
|
164 regions += 'J' * len(germ_jseq)
|
|
165
|
|
166 # Define return germlines
|
|
167 germlines['full'] = germ_seq
|
|
168 germlines['regions'] = regions
|
|
169 if 'dmask' in germ_types:
|
|
170 germlines['dmask'] = germ_seq[:len(germ_vseq)] + \
|
|
171 'N' * (len(germ_seq) - len(germ_vseq) - len(germ_jseq)) + \
|
|
172 germ_seq[-len(germ_jseq):]
|
|
173 if 'vonly' in germ_types:
|
|
174 germlines['vonly'] = germ_vseq
|
|
175
|
|
176 # Check that input and germline sequence match
|
|
177 if len(align[seq_field]) == 0:
|
|
178 result_log['ERROR'] = 'Sequence is missing from %s column' % seq_field
|
|
179 elif len(germlines['full']) != len(align[seq_field]):
|
|
180 result_log['ERROR'] = 'Germline sequence is %d nucleotides longer than input sequence' % \
|
|
181 (len(germlines['full']) - len(align[seq_field]))
|
|
182
|
|
183 # Convert to uppercase
|
|
184 for k, v in germlines.items(): germlines[k] = v.upper()
|
|
185
|
|
186 return result_log, germlines
|
|
187
|
|
188
|
|
189 def assembleEachGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args):
|
|
190 """
|
|
191 Write germline sequences to tab-delimited database file
|
|
192
|
|
193 Arguments:
|
|
194 db_file = input tab-delimited database file
|
|
195 repo = folder with germline repertoire files
|
|
196 germ_types = types of germline sequences to be output
|
|
197 (full germline, D-region masked, only V-region germline)
|
|
198 v_field = field in which to look for V call
|
|
199 seq_field = field in which to look for sequence
|
|
200 out_args = arguments for output preferences
|
|
201
|
|
202 Returns:
|
|
203 None
|
|
204 """
|
|
205 # Print parameter info
|
|
206 log = OrderedDict()
|
|
207 log['START'] = 'CreateGermlines'
|
|
208 log['DB_FILE'] = os.path.basename(db_file)
|
|
209 log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
|
|
210 log['CLONED'] = 'False'
|
|
211 log['V_FIELD'] = v_field
|
|
212 log['SEQ_FIELD'] = seq_field
|
|
213 printLog(log)
|
|
214
|
|
215 # Get repertoire and open Db reader
|
|
216 repo_dict = getRepo(repo)
|
|
217 reader = readDbFile(db_file, ig=False)
|
|
218
|
|
219 # Exit if V call field does not exist in reader
|
|
220 if v_field not in reader.fieldnames:
|
|
221 sys.exit('Error: V field does not exist in input database file.')
|
|
222
|
|
223 # Define log handle
|
|
224 if out_args['log_file'] is None:
|
|
225 log_handle = None
|
|
226 else:
|
|
227 log_handle = open(out_args['log_file'], 'w')
|
|
228
|
|
229 add_fields = []
|
|
230 seq_type = seq_field.split('_')[-1]
|
|
231 if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type]
|
|
232 if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
|
|
233 if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION']
|
|
234
|
|
235 # Create output file handle and Db writer
|
|
236 pass_handle = getOutputHandle(db_file, 'germ-pass',
|
|
237 out_dir=out_args['out_dir'],
|
|
238 out_name=out_args['out_name'],
|
|
239 out_type=out_args['out_type'])
|
|
240 pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)
|
|
241
|
|
242 if out_args['failed']:
|
|
243 fail_handle = getOutputHandle(db_file, 'germ-fail',
|
|
244 out_dir=out_args['out_dir'],
|
|
245 out_name=out_args['out_name'],
|
|
246 out_type=out_args['out_type'])
|
|
247 fail_writer = getDbWriter(fail_handle, db_file, add_fields=add_fields)
|
|
248 else:
|
|
249 fail_handle = None
|
|
250 fail_writer = None
|
|
251
|
|
252 # Initialize time and total count for progress bar
|
|
253 start_time = time()
|
|
254 rec_count = countDbFile(db_file)
|
|
255 pass_count = fail_count = 0
|
|
256 # Iterate over rows
|
|
257 for i,row in enumerate(reader):
|
|
258 # Print progress
|
|
259 printProgress(i, rec_count, 0.05, start_time)
|
|
260
|
|
261 result_log, germlines = joinGermline(row, repo_dict, germ_types, v_field, seq_field)
|
|
262
|
|
263 # Add germline field(s) to dictionary
|
|
264 if 'full' in germ_types: row['GERMLINE_' + seq_type] = germlines['full']
|
|
265 if 'dmask' in germ_types: row['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask']
|
|
266 if 'vonly' in germ_types: row['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly']
|
|
267
|
|
268 # Write row to pass or fail file
|
|
269 if 'ERROR' in result_log:
|
|
270 fail_count += 1
|
|
271 if fail_writer is not None: fail_writer.writerow(row)
|
|
272 else:
|
|
273 result_log['SEQUENCE'] = row[seq_field]
|
|
274 result_log['GERMLINE'] = germlines['full']
|
|
275 result_log['REGIONS'] = germlines['regions']
|
|
276
|
|
277 pass_count += 1
|
|
278 pass_writer.writerow(row)
|
|
279 printLog(result_log, handle=log_handle)
|
|
280
|
|
281 # Print log
|
|
282 printProgress(i+1, rec_count, 0.05, start_time)
|
|
283 log = OrderedDict()
|
|
284 log['OUTPUT'] = os.path.basename(pass_handle.name)
|
|
285 log['RECORDS'] = rec_count
|
|
286 log['PASS'] = pass_count
|
|
287 log['FAIL'] = fail_count
|
|
288 log['END'] = 'CreateGermlines'
|
|
289 printLog(log)
|
|
290
|
|
291 # Close file handles
|
|
292 pass_handle.close()
|
|
293 if fail_handle is not None: fail_handle.close()
|
|
294 if log_handle is not None: log_handle.close()
|
|
295
|
|
296
|
|
297 def makeCloneGermline(clone, clone_dict, repo_dict, germ_types, v_field, seq_field, counts, writers, out_args):
|
|
298 """
|
|
299 Determine consensus clone sequence and create germline for clone
|
|
300
|
|
301 Arguments:
|
|
302 clone = clone ID
|
|
303 clone_dict = iterable yielding dictionaries of sequence data from clone
|
|
304 repo_dict = dictionary of IMGT gapped germline sequences
|
|
305 germ_types = types of germline sequences to be output
|
|
306 (full germline, D-region masked, only V-region germline)
|
|
307 v_field = field in which to look for V call
|
|
308 seq_field = field in which to look for sequence
|
|
309 counts = dictionary of pass counter and fail counter
|
|
310 writers = dictionary with pass and fail DB writers
|
|
311 out_args = arguments for output preferences
|
|
312
|
|
313 Returns:
|
|
314 None
|
|
315 """
|
|
316 seq_type = seq_field.split('_')[-1]
|
|
317 j_field = 'J_CALL'
|
|
318
|
|
319 # Create dictionaries to count observed V/J calls
|
|
320 v_dict = OrderedDict()
|
|
321 j_dict = OrderedDict()
|
|
322
|
|
323 # Find longest sequence in clone
|
|
324 max_length = 0
|
|
325 for val in clone_dict.values():
|
|
326 v = val[v_field]
|
|
327 v_dict[v] = v_dict.get(v,0) + 1
|
|
328 j = val[j_field]
|
|
329 j_dict[j] = j_dict.get(j,0) + 1
|
|
330 if len(val[seq_field]) > max_length: max_length = len(val[seq_field])
|
|
331
|
|
332 # Consensus V and J having most observations
|
|
333 v_cons = [k for k in list(v_dict.keys()) if v_dict[k] == max(v_dict.values())]
|
|
334 j_cons = [k for k in list(j_dict.keys()) if j_dict[k] == max(j_dict.values())]
|
|
335 # Consensus sequence(s) with consensus V/J calls and longest sequence
|
|
336 cons = [val for val in list(clone_dict.values()) if val.get(v_field,'') in v_cons and \
|
|
337 val.get(j_field,'') in j_cons and \
|
|
338 len(val[seq_field])==max_length]
|
|
339 # Sequence(s) with consensus V/J are not longest
|
|
340 if not cons:
|
|
341 # Sequence(s) with consensus V/J (not longest)
|
|
342 cons = [val for val in list(clone_dict.values()) if val.get(v_field,'') in v_cons and val.get(j_field,'') in j_cons]
|
|
343
|
|
344 # No sequence has both consensus V and J call
|
|
345 if not cons:
|
|
346 result_log = OrderedDict()
|
|
347 result_log['ID'] = clone
|
|
348 result_log['V_CALL'] = ','.join(v_cons)
|
|
349 result_log['J_CALL'] = ','.join(j_cons)
|
|
350 result_log['ERROR'] = 'No consensus sequence for clone found'
|
|
351 else:
|
|
352 # Pad end of consensus sequence with gaps to make it the max length
|
|
353 cons = cons[0]
|
|
354 cons['J_GERM_LENGTH'] = str(int(cons['J_GERM_LENGTH'] or 0) + max_length - len(cons[seq_field]))
|
|
355 cons[seq_field] += '.'*(max_length - len(cons[seq_field]))
|
|
356 result_log, germlines = joinGermline(cons, repo_dict, germ_types, v_field, seq_field)
|
|
357 result_log['ID'] = clone
|
|
358 result_log['CONSENSUS'] = cons['SEQUENCE_ID']
|
|
359 else:
|
|
360 cons = cons[0]
|
|
361 result_log, germlines = joinGermline(cons, repo_dict, germ_types, v_field, seq_field)
|
|
362 result_log['ID'] = clone
|
|
363 result_log['CONSENSUS'] = cons['SEQUENCE_ID']
|
|
364
|
|
365 # Write sequences of clone
|
|
366 for val in clone_dict.values():
|
|
367 if 'ERROR' not in result_log:
|
|
368 # Update lengths padded to longest sequence in clone
|
|
369 val['J_GERM_LENGTH'] = str(int(val['J_GERM_LENGTH'] or 0) + max_length - len(val[seq_field]))
|
|
370 val[seq_field] += '.'*(max_length - len(val[seq_field]))
|
|
371
|
|
372 # Add column(s) to tab-delimited database file
|
|
373 if 'full' in germ_types: val['GERMLINE_' + seq_type] = germlines['full']
|
|
374 if 'dmask' in germ_types: val['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask']
|
|
375 if 'vonly' in germ_types: val['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly']
|
|
376
|
|
377 result_log['SEQUENCE'] = cons[seq_field]
|
|
378 result_log['GERMLINE'] = germlines['full']
|
|
379 result_log['REGIONS'] = germlines['regions']
|
|
380
|
|
381 # Write to pass file
|
|
382 counts['pass'] += 1
|
|
383 writers['pass'].writerow(val)
|
|
384 else:
|
|
385 # Write to fail file
|
|
386 counts['fail'] += 1
|
|
387 if writers['fail'] is not None: writers['fail'].writerow(val)
|
|
388 # Return log
|
|
389 return result_log
|
|
390
|
|
391
|
|
392 def assembleCloneGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args):
|
|
393 """
|
|
394 Assemble one germline sequence for each clone in a tab-delimited database file
|
|
395
|
|
396 Arguments:
|
|
397 db_file = input tab-delimited database file
|
|
398 repo = folder with germline repertoire files
|
|
399 germ_types = types of germline sequences to be output
|
|
400 (full germline, D-region masked, only V-region germline)
|
|
401 v_field = field in which to look for V call
|
|
402 seq_field = field in which to look for sequence
|
|
403 out_args = arguments for output preferences
|
|
404
|
|
405 Returns:
|
|
406 None
|
|
407 """
|
|
408 # Print parameter info
|
|
409 log = OrderedDict()
|
|
410 log['START'] = 'CreateGermlines'
|
|
411 log['DB_FILE'] = os.path.basename(db_file)
|
|
412 log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
|
|
413 log['CLONED'] = 'True'
|
|
414 log['V_FIELD'] = v_field
|
|
415 log['SEQ_FIELD'] = seq_field
|
|
416 printLog(log)
|
|
417
|
|
418 # Get repertoire and open Db reader
|
|
419 repo_dict = getRepo(repo)
|
|
420 reader = readDbFile(db_file, ig=False)
|
|
421
|
|
422 # Exit if V call field does not exist in reader
|
|
423 if v_field not in reader.fieldnames:
|
|
424 sys.exit('Error: V field does not exist in input database file.')
|
|
425
|
|
426 # Define log handle
|
|
427 if out_args['log_file'] is None:
|
|
428 log_handle = None
|
|
429 else:
|
|
430 log_handle = open(out_args['log_file'], 'w')
|
|
431
|
|
432 add_fields = []
|
|
433 seq_type = seq_field.split('_')[-1]
|
|
434 if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type]
|
|
435 if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
|
|
436 if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION']
|
|
437
|
|
438 # Create output file handle and Db writer
|
|
439 writers = {}
|
|
440 pass_handle = getOutputHandle(db_file, 'germ-pass', out_dir=out_args['out_dir'],
|
|
441 out_name=out_args['out_name'], out_type=out_args['out_type'])
|
|
442 writers['pass'] = getDbWriter(pass_handle, db_file, add_fields=add_fields)
|
|
443
|
|
444 if out_args['failed']:
|
|
445 fail_handle = getOutputHandle(db_file, 'germ-fail', out_dir=out_args['out_dir'],
|
|
446 out_name=out_args['out_name'], out_type=out_args['out_type'])
|
|
447 writers['fail'] = getDbWriter(fail_handle, db_file, add_fields=add_fields)
|
|
448 else:
|
|
449 fail_handle = None
|
|
450 writers['fail'] = None
|
|
451
|
|
452 # Initialize time and total count for progress bar
|
|
453 start_time = time()
|
|
454 rec_count = countDbFile(db_file)
|
|
455 counts = {}
|
|
456 clone_count = counts['pass'] = counts['fail'] = 0
|
|
457 # Iterate over rows
|
|
458 clone = 'initial'
|
|
459 clone_dict = OrderedDict()
|
|
460 for i,row in enumerate(reader):
|
|
461 # Print progress
|
|
462 printProgress(i, rec_count, 0.05, start_time)
|
|
463
|
|
464 # Clone isn't over yet
|
|
465 if row.get('CLONE','') == clone:
|
|
466 clone_dict[row["SEQUENCE_ID"]] = row
|
|
467 # Clone just finished
|
|
468 elif clone_dict:
|
|
469 clone_count += 1
|
|
470 result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types,
|
|
471 v_field, seq_field, counts, writers, out_args)
|
|
472 printLog(result_log, handle=log_handle)
|
|
473 # Now deal with current row (first of next clone)
|
|
474 clone = row['CLONE']
|
|
475 clone_dict = OrderedDict([(row['SEQUENCE_ID'],row)])
|
|
476 # Last case is only for first row of file
|
|
477 else:
|
|
478 clone = row['CLONE']
|
|
479 clone_dict = OrderedDict([(row['SEQUENCE_ID'],row)])
|
|
480 clone_count += 1
|
|
481 result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types, v_field,
|
|
482 seq_field, counts, writers, out_args)
|
|
483 printLog(result_log, handle=log_handle)
|
|
484
|
|
485 # Print log
|
|
486 printProgress(i+1, rec_count, 0.05, start_time)
|
|
487 log = OrderedDict()
|
|
488 log['OUTPUT'] = os.path.basename(pass_handle.name)
|
|
489 log['CLONES'] = clone_count
|
|
490 log['RECORDS'] = rec_count
|
|
491 log['PASS'] = counts['pass']
|
|
492 log['FAIL'] = counts['fail']
|
|
493 log['END'] = 'CreateGermlines'
|
|
494 printLog(log)
|
|
495
|
|
496 # Close file handles
|
|
497 pass_handle.close()
|
|
498 if fail_handle is not None: fail_handle.close()
|
|
499 if log_handle is not None: log_handle.close()
|
|
500
|
|
501
|
|
502 def getArgParser():
|
|
503 """
|
|
504 Defines the ArgumentParser
|
|
505
|
|
506 Arguments:
|
|
507 None
|
|
508
|
|
509 Returns:
|
|
510 an ArgumentParser object
|
|
511 """
|
|
512 # Define input and output field help message
|
|
513 fields = dedent(
|
|
514 '''
|
|
515 output files:
|
|
516 germ-pass
|
|
517 database with assigned germline sequences.
|
|
518 germ-fail
|
|
519 database with records failing germline assignment.
|
|
520
|
|
521 required fields:
|
|
522 SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ or SEQUENCE_IMGT,
|
|
523 V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL,
|
|
524 V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_IMGT, V_GERM_LENGTH_IMGT,
|
|
525 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH,
|
|
526 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH
|
|
527
|
|
528 optional fields:
|
|
529 CLONE
|
|
530
|
|
531 output fields:
|
|
532 GERMLINE_VDJ, GERMLINE_VDJ_D_MASK, GERMLINE_VDJ_V_REGION,
|
|
533 GERMLINE_IMGT, GERMLINE_IMGT_D_MASK, GERMLINE_IMGT_V_REGION
|
|
534 ''')
|
|
535
|
|
536 # Parent parser
|
|
537 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True,
|
|
538 annotation=False)
|
|
539 # Define argument parser
|
|
540 parser = ArgumentParser(description=__doc__, epilog=fields,
|
|
541 parents=[parser_parent],
|
|
542 formatter_class=CommonHelpFormatter)
|
|
543 parser.add_argument('--version', action='version',
|
|
544 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
|
|
545
|
|
546 parser.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
|
|
547 help='List of folders and/or fasta files with germline sequences.')
|
|
548 parser.add_argument('-g', action='store', dest='germ_types', default=default_germ_types,
|
|
549 nargs='+', choices=('full', 'dmask', 'vonly'),
|
|
550 help='Specify type(s) of germlines to include full germline, \
|
|
551 germline with D-region masked, or germline for V region only.')
|
|
552 parser.add_argument('--cloned', action='store_true', dest='cloned',
|
|
553 help='Specify to create only one germline per clone \
|
|
554 (assumes input file is sorted by clone column)')
|
|
555 parser.add_argument('--vf', action='store', dest='v_field', default=default_v_field,
|
|
556 help='Specify field to use for germline V call')
|
|
557 parser.add_argument('--sf', action='store', dest='seq_field', default=default_seq_field,
|
|
558 help='Specify field to use for sequence')
|
|
559
|
|
560 return parser
|
|
561
|
|
562
|
|
563 if __name__ == "__main__":
|
|
564 """
|
|
565 Parses command line arguments and calls main
|
|
566 """
|
|
567
|
|
568 # Parse command line arguments
|
|
569 parser = getArgParser()
|
|
570 args = parser.parse_args()
|
|
571 args_dict = parseCommonArgs(args)
|
|
572 del args_dict['db_files']
|
|
573 del args_dict['cloned']
|
|
574 args_dict['v_field'] = args_dict['v_field'].upper()
|
|
575 args_dict['seq_field'] = args_dict['seq_field'].upper()
|
|
576
|
|
577 for f in args.__dict__['db_files']:
|
|
578 args_dict['db_file'] = f
|
|
579 if args.__dict__['cloned']:
|
|
580 assembleCloneGermline(**args_dict)
|
|
581 else:
|
|
582 assembleEachGermline(**args_dict)
|