comparison ParseDb.py @ 0:dda9b2e72e2b draft

Uploaded
author davidvanzessen
date Tue, 03 May 2016 09:52:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dda9b2e72e2b
1 #!/usr/bin/env python3
2 """
3 Parses tab delimited database files
4 """
5 # Info
6 __author__ = 'Jason Anthony Vander Heiden'
7 from changeo import __version__, __date__
8
9 # Imports
10 import csv
11 import os
12 import re
13 from argparse import ArgumentParser
14 from collections import OrderedDict
15
16 from textwrap import dedent
17 from time import time
18 from Bio import SeqIO
19 from Bio.Seq import Seq
20 from Bio.SeqRecord import SeqRecord
21 from Bio.Alphabet import IUPAC
22
23 # Presto and changeo imports
24 from presto.Defaults import default_delimiter, default_out_args
25 from presto.Annotation import flattenAnnotation
26 from presto.IO import getOutputHandle, printLog, printProgress, printMessage
27 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
28 from changeo.IO import getDbWriter, readDbFile, countDbFile
29
30 # Defaults
31 default_id_field = 'SEQUENCE_ID'
32 default_seq_field = 'SEQUENCE_IMGT'
33 default_germ_field = 'GERMLINE_IMGT_D_MASK'
34 default_index_field = 'INDEX'
35
36 # TODO: convert SQL-ish operations to modify_func() as per ParseHeaders
37
def getDbSeqRecord(db_record, id_field, seq_field, meta_fields=None,
                   delimiter=default_delimiter):
    """
    Parses a database record into a SeqRecord

    Arguments:
      db_record = a dictionary containing a database record
      id_field = the field containing identifiers
      seq_field = the field containing sequences
      meta_fields = a list of fields to add to sequence annotations
      delimiter = a tuple of delimiters for (fields, values, value lists)

    Returns:
      a SeqRecord, or None if the ID or sequence field is empty or absent
    """
    # Return None if ID or sequence fields are empty or missing.
    # Using .get avoids a KeyError on records that lack the requested
    # columns, matching the documented "return None" contract.
    if not db_record.get(id_field) or not db_record.get(seq_field):
        return None

    # Create description string from the identifier plus any requested
    # annotation fields that are present in the record
    desc_dict = OrderedDict([('ID', db_record[id_field])])
    if meta_fields is not None:
        desc_dict.update([(f, db_record[f]) for f in meta_fields if f in db_record])
    desc_str = flattenAnnotation(desc_dict, delimiter=delimiter)

    # Create SeqRecord with an empty description so only the id is written
    seq_record = SeqRecord(Seq(db_record[seq_field], IUPAC.ambiguous_dna),
                           id=desc_str, name=desc_str, description='')

    return seq_record
68
69
def splitDbFile(db_file, field, num_split=None, out_args=default_out_args):
    """
    Divides a tab-delimited database file into segments by description tags

    Arguments:
      db_file = filename of the tab-delimited database file to split
      field = the field name by which to split db_file
      num_split = the numerical threshold by which to group sequences;
                  if None treat field as textual
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      a list of output file names
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'split'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUM_SPLIT'] = num_split
    printLog(log)

    # Open IgRecord reader iter object
    reader = readDbFile(db_file, ig=False)

    # Determine total numbers of records
    rec_count = countDbFile(db_file)

    start_time = time()
    count = 0
    # Sort records into files based on textual field
    if num_split is None:
        # Create set of unique field tags (requires a second pass over the file)
        tmp_iter = readDbFile(db_file, ig=False)
        tag_list = list(set([row[field] for row in tmp_iter]))

        # Forbidden filename characters and their replacements.
        # BUG FIX: the original mapping used '\/', '\%' and '\|' literals;
        # those are unrecognized escapes that keep the backslash, so '/',
        # '%' and '|' in a tag were never replaced. Corrected to the bare
        # characters here.
        noGood = {'/':'f', '\\':'b', '?':'q', '%':'p', '*':'s', ':':'c',
                  '|':'pi', '"':'dq', '\'':'sq', '<':'gt', '>':'lt', ' ':'_'}
        # Map each tag to a filesystem-safe label
        tag_dict = {}
        for tag in tag_list:
            label = tag
            for c, r in noGood.items():
                label = label.replace(c, r)
            tag_dict[tag] = label

        # Create one output handle per unique tag
        handles_dict = {tag:getOutputHandle(db_file,
                                            '%s-%s' % (field, label),
                                            out_type=out_args['out_type'],
                                            out_name=out_args['out_name'],
                                            out_dir=out_args['out_dir'])
                        for tag, label in tag_dict.items()}

        # Create Db writer instances
        writers_dict = {tag:getDbWriter(handles_dict[tag], db_file)
                        for tag in tag_dict}

        # Route each record to the file for its tag value
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            tag = row[field]
            writers_dict[tag].writerow(row)

    # Sort records into files based on numeric num_split
    else:
        num_split = float(num_split)

        # Create output handles: values < num_split go to 'under',
        # values >= num_split go to 'atleast'
        handles_dict = {'under':getOutputHandle(db_file,
                                                'under-%.1f' % num_split,
                                                out_type=out_args['out_type'],
                                                out_name=out_args['out_name'],
                                                out_dir=out_args['out_dir']),
                        'atleast':getOutputHandle(db_file,
                                                  'atleast-%.1f' % num_split,
                                                  out_type=out_args['out_type'],
                                                  out_name=out_args['out_name'],
                                                  out_dir=out_args['out_dir'])}

        # Create Db writer instances
        writers_dict = {'under':getDbWriter(handles_dict['under'], db_file),
                        'atleast':getDbWriter(handles_dict['atleast'], db_file)}

        # Route each record by comparing its numeric field to the threshold
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            tag = row[field]
            tag = 'under' if float(tag) < num_split else 'atleast'
            writers_dict[tag].writerow(row)

    # Write log
    printProgress(count, rec_count, 0.05, start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['RECORDS'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'ParseDb'
    printLog(log)

    # Close output file handles
    for t in handles_dict: handles_dict[t].close()

    return [handles_dict[t].name for t in handles_dict]
178
179
180 # TODO: SHOULD ALLOW FOR UNSORTED CLUSTER COLUMN
181 # TODO: SHOULD ALLOW FOR GROUPING FIELDS
def convertDbClip(db_file, id_field=default_id_field, seq_field=default_seq_field,
                  germ_field=default_germ_field, cluster_field=None,
                  meta_fields=None, out_args=default_out_args):
    """
    Builds clip-fasta files from database records, writing a germline
    sequence (with a '>>' header) before the reads it belongs to

    Arguments:
      db_file = the database file name
      id_field = the field containing identifiers
      seq_field = the field containing sample sequences
      germ_field = the field containing germline sequences
      cluster_field = the field containing clonal groupings
                      if None write the germline for each record
      meta_fields = a list of fields to add to sequence annotations
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'fasta'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    log['GERM_FIELD'] = germ_field
    log['CLUSTER_FIELD'] = cluster_field
    if meta_fields is not None: log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='clip')
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = germ_count = pass_count = fail_count = 0
    cluster_last = None
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Update cluster ID
        cluster = rec.get(cluster_field, None)

        # Get germline SeqRecord when needed: on every record when no
        # cluster field is given, otherwise only when the cluster changes
        if cluster_field is None:
            germ = getDbSeqRecord(rec, id_field, germ_field, meta_fields,
                                  delimiter=out_args['delimiter'])
        elif cluster != cluster_last:
            germ = getDbSeqRecord(rec, cluster_field, germ_field,
                                  delimiter=out_args['delimiter'])
        else:
            germ = None

        # BUG FIX: getDbSeqRecord returns None when the germline field is
        # empty; the original prefixed germ.id unconditionally and crashed
        # with AttributeError. Prefixing with '>' makes the fasta header
        # '>>', which marks germline records in the clip format.
        if germ is not None:
            germ.id = '>' + germ.id

        # Get read SeqRecord
        seq = getDbSeqRecord(rec, id_field, seq_field, meta_fields,
                             delimiter=out_args['delimiter'])

        # Write germline
        if germ is not None:
            germ_count += 1
            SeqIO.write(germ, pass_handle, 'fasta')

        # Write sequences
        if seq is not None:
            pass_count += 1
            SeqIO.write(seq, pass_handle, 'fasta')
        else:
            fail_count += 1

        # Set last cluster ID
        cluster_last = cluster

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['GERMLINES'] = germ_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
277
278
def convertDbFasta(db_file, id_field=default_id_field, seq_field=default_seq_field,
                   meta_fields=None, out_args=default_out_args):
    """
    Builds fasta files from database records

    Arguments:
      db_file = the database file name
      id_field = the field containing identifiers
      seq_field = the field containing sequences
      meta_fields = a list of fields to add to sequence annotations
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Console log header
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'fasta'),
                       ('FILE', os.path.basename(db_file)),
                       ('ID_FIELD', id_field),
                       ('SEQ_FIELD', seq_field)])
    if meta_fields is not None:
        log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open reader and fasta output handle
    out_type = 'fasta'
    reader = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type=out_type)
    # Total record count for progress reporting
    total_count = countDbFile(db_file)

    # Convert each record to a SeqRecord and write it out
    start_time = time()
    rec_count, pass_count, fail_count = 0, 0, 0
    for row in reader:
        printProgress(rec_count, total_count, 0.05, start_time)
        rec_count += 1

        # None indicates an empty identifier or sequence field
        record = getDbSeqRecord(row, id_field, seq_field, meta_fields, out_args['delimiter'])
        if record is None:
            fail_count += 1
        else:
            pass_count += 1
            SeqIO.write(record, pass_handle, out_type)

    # Final progress and summary log
    printProgress(rec_count, total_count, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(pass_handle.name)),
                       ('RECORDS', rec_count),
                       ('PASS', pass_count),
                       ('FAIL', fail_count),
                       ('END', 'ParseDb')])
    printLog(log)

    pass_handle.close()

    return pass_handle.name
343
344
def addDbFile(db_file, fields, values, out_args=default_out_args):
    """
    Adds field and value pairs to a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to add
      values = a list of values to assign to all rows of each field
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Console log header
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'add'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELDS', ','.join(fields)),
                       ('VALUES', ','.join(values))])
    printLog(log)

    # Open reader, output handle and writer with the new columns appended
    reader = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-add', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=fields)
    # Total record count for progress reporting
    total_count = countDbFile(db_file)

    # Only add columns that do not already exist in the input
    add_dict = {field:value for field, value in zip(fields, values)
                if field not in reader.fieldnames}

    # Append the constant values to every record
    start_time = time()
    rec_count = 0
    for row in reader:
        printProgress(rec_count, total_count, 0.05, start_time)
        rec_count += 1
        row.update(add_dict)
        pass_writer.writerow(row)

    # Final progress and summary log
    printProgress(rec_count, total_count, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(pass_handle.name)),
                       ('RECORDS', rec_count),
                       ('END', 'ParseDb')])
    printLog(log)

    pass_handle.close()

    return pass_handle.name
400
401
def indexDbFile(db_file, field=default_index_field, out_args=default_out_args):
    """
    Adds an index column to a database file

    Arguments:
      db_file = the database file name
      field = the name of the index field to add
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Console log header
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'index'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELD', field)])
    printLog(log)

    # Open reader, output handle and writer with the index column appended
    reader = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-index', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=field)
    # Total record count for progress reporting
    total_count = countDbFile(db_file)

    # Assign a 1-based running index to every record
    start_time = time()
    rec_count = 0
    for row in reader:
        printProgress(rec_count, total_count, 0.05, start_time)
        rec_count += 1
        row[field] = rec_count
        pass_writer.writerow(row)

    # Final progress and summary log
    printProgress(rec_count, total_count, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(pass_handle.name)),
                       ('RECORDS', rec_count),
                       ('END', 'ParseDb')])
    printLog(log)

    pass_handle.close()

    return pass_handle.name
453
454
def dropDbFile(db_file, fields, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to drop
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    # BUG FIX: the command was logged as 'add' (copy-paste error)
    log['COMMAND'] = 'drop'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open file handles; the writer excludes the dropped columns, so rows
    # can be passed through unchanged
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-drop', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, exclude_fields=fields)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # Write row
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
504
505
def deleteDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Deletes records from a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to check for deletion criteria
      values = a list of values defining deletion targets
      logic = one of 'any' or 'all' defining whether one or all fields must have a match.
      regex = if False do exact full string matches; if True allow partial regex matches.
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name

    Raises:
      ValueError = if logic is not one of 'any' or 'all'
    """
    # Define string match function
    if regex:
        def _match_func(x, patterns): return any([re.search(p, x) for p in patterns])
    else:
        def _match_func(x, patterns): return x in patterns

    # Define logic function.
    # BUG FIX: an invalid logic value previously left _logic_func unbound
    # and caused a NameError at the first record; fail early instead.
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all
    else:
        raise ValueError("logic must be one of 'any' or 'all'")

    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'delete'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-delete', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Check for deletion values in all fields.
        # NOTE(review): missing fields yield False here, which never matches
        # string values; with regex=True a missing field would raise a
        # TypeError inside re.search — confirm fields always exist upstream.
        delete = _logic_func([_match_func(rec.get(f, False), values) for f in fields])

        # Write only records that do not match the deletion criteria
        if not delete:
            pass_count += 1
            pass_writer.writerow(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['KEPT'] = pass_count
    log['DELETED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
582
583
def renameDbFile(db_file, fields, names, out_args=default_out_args):
    """
    Renames fields in a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to rename
      names = a list of new names for each field
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Console log header
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'rename'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELDS', ','.join(fields)),
                       ('NAMES', ','.join(names))])
    printLog(log)

    # Open reader and output handle
    reader = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-rename', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')

    # Build the output header by substituting new names in place,
    # preserving the original column order
    header = (readDbFile(db_file, ig=False)).fieldnames
    for old_name, new_name in zip(fields, names):
        header[header.index(old_name)] = new_name

    # Open writer and emit the renamed header
    # TODO: should modify getDbWriter to take a list of fields
    pass_writer = csv.DictWriter(pass_handle, fieldnames=header, dialect='excel-tab')
    pass_writer.writeheader()

    # Total record count for progress reporting
    total_count = countDbFile(db_file)

    # Copy each record, moving values from old keys to new keys
    start_time = time()
    rec_count = 0
    for row in reader:
        printProgress(rec_count, total_count, 0.05, start_time)
        rec_count += 1
        # TODO: per-row renaming is redundant; a non-dict reader/writer
        # in DbCore would avoid it
        for old_name, new_name in zip(fields, names):
            row[new_name] = row.pop(old_name)
        pass_writer.writerow(row)

    # Final progress and summary log
    printProgress(rec_count, total_count, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(pass_handle.name)),
                       ('RECORDS', rec_count),
                       ('END', 'ParseDb')])
    printLog(log)

    pass_handle.close()

    return pass_handle.name
650
651
def selectDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Selects records from a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to check for selection criteria
      values = a list of values defining selection targets
      logic = one of 'any' or 'all' defining whether one or all fields must have a match.
      regex = if False do exact full string matches; if True allow partial regex matches.
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name

    Raises:
      ValueError = if logic is not one of 'any' or 'all'
    """
    # Define string match function
    if regex:
        def _match_func(x, patterns): return any([re.search(p, x) for p in patterns])
    else:
        def _match_func(x, patterns): return x in patterns

    # Define logic function.
    # BUG FIX: an invalid logic value previously left _logic_func unbound
    # and caused a NameError at the first record; fail early instead.
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all
    else:
        raise ValueError("logic must be one of 'any' or 'all'")

    # Print console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    log['REGEX'] = regex
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-select', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Check for selection values in all fields.
        # NOTE(review): missing fields yield False here, which never matches
        # string values; with regex=True a missing field would raise a
        # TypeError inside re.search — confirm fields always exist upstream.
        select = _logic_func([_match_func(rec.get(f, False), values) for f in fields])

        # Write only records that match the selection criteria
        if select:
            pass_count += 1
            pass_writer.writerow(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['SELECTED'] = pass_count
    log['DISCARDED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
730
731
def sortDbFile(db_file, field, numeric=False, descend=False,
               out_args=default_out_args):
    """
    Sorts records by values in an annotation field

    Arguments:
      db_file = the database filename
      field = the field name to sort by
      numeric = if True sort field numerically;
                if False sort field alphabetically
      descend = if True sort in descending order;
                if False sort in ascending order
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Console log header
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'sort'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELD', field),
                       ('NUMERIC', numeric)])
    printLog(log)

    # Open reader, output handle and writer
    reader = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-sort', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)

    # Load every record into memory, keyed by its input position
    start_time = time()
    printMessage("Indexing: Running", start_time=start_time)
    records = {position:row for position, row in enumerate(reader)}
    total_count = len(records)

    # Extract the sort key for each record; empty values coerce to 0.0
    # in numeric mode
    sort_keys = {position:row[field] for position, row in records.items()}
    if numeric:
        sort_keys = {position:float(value or 0) for position, value in sort_keys.items()}
    ordered = sorted(sort_keys, key=sort_keys.get, reverse=descend)
    printMessage("Indexing: Done", start_time=start_time, end=True)

    # Write records back out in sorted order
    start_time = time()
    rec_count = 0
    for position in ordered:
        printProgress(rec_count, total_count, 0.05, start_time)
        rec_count += 1
        pass_writer.writerow(records[position])

    # Final progress and summary log
    printProgress(rec_count, total_count, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(pass_handle.name)),
                       ('RECORDS', rec_count),
                       ('END', 'ParseDb')])
    printLog(log)

    pass_handle.close()

    return pass_handle.name
800
801
def updateDbFile(db_file, field, values, updates, out_args=default_out_args):
    """
    Updates field and value pairs to a database file

    Arguments:
      db_file = the database file name
      field = the field to update
      values = a list of values to specifying which rows to update
      updates = a list of values to update each value with
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Console log header
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'update'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELD', field),
                       ('VALUES', ','.join(values)),
                       ('UPDATES', ','.join(updates))])
    printLog(log)

    # Open reader, output handle and writer
    reader = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-update', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Total record count for progress reporting
    total_count = countDbFile(db_file)

    # Rewrite matching field values record by record
    start_time = time()
    rec_count = pass_count = 0
    for row in reader:
        printProgress(rec_count, total_count, 0.05, start_time)
        rec_count += 1

        # NOTE: pairs are applied in order against the current value, so a
        # replacement made by an earlier pair can itself be rewritten by a
        # later pair; pass_count increments once per applied replacement
        for old_value, new_value in zip(values, updates):
            if row[field] == old_value:
                row[field] = new_value
                pass_count += 1

        pass_writer.writerow(row)

    # Final progress and summary log
    printProgress(rec_count, total_count, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(pass_handle.name)),
                       ('RECORDS', rec_count),
                       ('UPDATED', pass_count),
                       ('END', 'ParseDb')])
    printLog(log)

    pass_handle.close()

    return pass_handle.name
863
864
865 def getArgParser():
866 """
867 Defines the ArgumentParser
868
869 Arguments:
870 None
871
872 Returns:
873 an ArgumentParser object
874 """
875 # Define input and output field help message
876 fields = dedent(
877 '''
878 output files:
879 sequences
880 FASTA formatted sequences output from the subcommands fasta and clip.
881 <field>-<value>
882 database files partitioned by annotation <field> and <value>.
883 parse-<command>
884 output of the database modification functions where <command> is one of
885 the subcommands add, index, drop, delete, rename, select, sort or update.
886
887 required fields:
888 SEQUENCE_ID
889
890 optional fields:
891 JUNCTION, SEQUENCE_IMGT, SEQUENCE_VDJ, GERMLINE_IMGT, GERMLINE_VDJ,
892 GERMLINE_IMGT_D_MASK, GERMLINE_VDJ_D_MASK,
893 GERMLINE_IMGT_V_REGION, GERMLINE_VDJ_V_REGION
894
895 output fields:
896 None
897 ''')
898
899 # Define ArgumentParser
900 parser = ArgumentParser(description=__doc__, epilog=fields,
901 formatter_class=CommonHelpFormatter)
902 parser.add_argument('--version', action='version',
903 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
904 subparsers = parser.add_subparsers(title='subcommands', dest='command', metavar='',
905 help='Database operation')
906 # TODO: This is a temporary fix for Python issue 9253
907 subparsers.required = True
908
909 # Define parent parser
910 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True,
911 failed=False, log=False)
912
913 # Subparser to convert database entries to sequence file
914 parser_seq = subparsers.add_parser('fasta', parents=[parser_parent],
915 formatter_class=CommonHelpFormatter,
916 help='Creates a fasta file from database records')
917 parser_seq.add_argument('--if', action='store', dest='id_field',
918 default=default_id_field,
919 help='The name of the field containing identifiers')
920 parser_seq.add_argument('--sf', action='store', dest='seq_field',
921 default=default_seq_field,
922 help='The name of the field containing sequences')
923 parser_seq.add_argument('--mf', nargs='+', action='store', dest='meta_fields',
924 help='List of annotation fields to add to the sequence description')
925 parser_seq.set_defaults(func=convertDbFasta)
926
927 # Subparser to convert database entries to clip-fasta file
928 parser_clip = subparsers.add_parser('clip', parents=[parser_parent],
929 formatter_class=CommonHelpFormatter,
930 help='''Creates a clip-fasta file from database
931 records, wherein germline sequences precede
932 each clone and are denoted by ">>" headers.''')
933 parser_clip.add_argument('--if', action='store', dest='id_field',
934 default=default_id_field,
935 help='The name of the field containing identifiers')
936 parser_clip.add_argument('--sf', action='store', dest='seq_field',
937 default=default_seq_field,
938 help='The name of the field containing reads')
939 parser_clip.add_argument('--gf', action='store', dest='germ_field',
940 default=default_germ_field,
941 help='The name of the field containing germline sequences')
942 parser_clip.add_argument('--cf', action='store', dest='cluster_field', default=None,
943 help='The name of the field containing containing sorted clone IDs')
944 parser_clip.add_argument('--mf', nargs='+', action='store', dest='meta_fields',
945 help='List of annotation fields to add to the sequence description')
946 parser_clip.set_defaults(func=convertDbClip)
947
948 # Subparser to partition files by annotation values
949 parser_split = subparsers.add_parser('split', parents=[parser_parent],
950 formatter_class=CommonHelpFormatter,
951 help='Splits database files by field values')
952 parser_split.add_argument('-f', action='store', dest='field', type=str, required=True,
953 help='Annotation field by which to split database files.')
954 parser_split.add_argument('--num', action='store', dest='num_split', type=float, default=None,
955 help='''Specify to define the field as numeric and group
956 records by whether they are less than or at least
957 (greater than or equal to) the specified value.''')
958 parser_split.set_defaults(func=splitDbFile)
959
960 # Subparser to add records
961 parser_add = subparsers.add_parser('add', parents=[parser_parent],
962 formatter_class=CommonHelpFormatter,
963 help='Adds field and value pairs')
964 parser_add.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
965 help='The name of the fields to add.')
966 parser_add.add_argument('-u', nargs='+', action='store', dest='values', required=True,
967 help='The value to assign to all rows for each field.')
968 parser_add.set_defaults(func=addDbFile)
969
970 # Subparser to delete records
971 parser_delete = subparsers.add_parser('delete', parents=[parser_parent],
972 formatter_class=CommonHelpFormatter,
973 help='Deletes specific records')
974 parser_delete.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
975 help='The name of the fields to check for deletion criteria.')
976 parser_delete.add_argument('-u', nargs='+', action='store', dest='values', default=['', 'NA'],
977 help='''The values defining which records to delete. A value
978 may appear in any of the fields specified with -f.''')
979 parser_delete.add_argument('--logic', action='store', dest='logic',
980 choices=('any', 'all'), default='any',
981 help='''Defines whether a value may appear in any field (any)
982 or whether it must appear in all fields (all).''')
983 parser_delete.add_argument('--regex', action='store_true', dest='regex',
984 help='''If specified, treat values as regular expressions
985 and allow partial string matches.''')
986 parser_delete.set_defaults(func=deleteDbFile)
987
988 # Subparser to drop fields
989 parser_drop = subparsers.add_parser('drop', parents=[parser_parent],
990 formatter_class=CommonHelpFormatter,
991 help='Deletes entire fields')
992 parser_drop.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
993 help='The name of the fields to delete from the database.')
994 parser_drop.set_defaults(func=dropDbFile)
995
996 # Subparser to index fields
997 parser_index = subparsers.add_parser('index', parents=[parser_parent],
998 formatter_class=CommonHelpFormatter,
999 help='Adds a numeric index field')
1000 parser_index.add_argument('-f', action='store', dest='field',
1001 default=default_index_field,
1002 help='The name of the index field to add to the database.')
1003 parser_index.set_defaults(func=indexDbFile)
1004
1005 # Subparser to rename fields
1006 parser_rename = subparsers.add_parser('rename', parents=[parser_parent],
1007 formatter_class=CommonHelpFormatter,
1008 help='Renames fields')
1009 parser_rename.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
1010 help='List of fields to rename.')
1011 parser_rename.add_argument('-k', nargs='+', action='store', dest='names', required=True,
1012 help='List of new names for each field.')
1013 parser_rename.set_defaults(func=renameDbFile)
1014
1015 # Subparser to select records
1016 parser_select = subparsers.add_parser('select', parents=[parser_parent],
1017 formatter_class=CommonHelpFormatter,
1018 help='Selects specific records')
1019 parser_select.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
1020 help='The name of the fields to check for selection criteria.')
1021 parser_select.add_argument('-u', nargs='+', action='store', dest='values', required=True,
1022 help='''The values defining with records to select. A value
1023 may appear in any of the fields specified with -f.''')
1024 parser_select.add_argument('--logic', action='store', dest='logic',
1025 choices=('any', 'all'), default='any',
1026 help='''Defines whether a value may appear in any field (any)
1027 or whether it must appear in all fields (all).''')
1028 parser_select.add_argument('--regex', action='store_true', dest='regex',
1029 help='''If specified, treat values as regular expressions
1030 and allow partial string matches.''')
1031 parser_select.set_defaults(func=selectDbFile)
1032
1033 # Subparser to sort file by records
1034 parser_sort = subparsers.add_parser('sort', parents=[parser_parent],
1035 formatter_class=CommonHelpFormatter,
1036 help='Sorts records by field values')
1037 parser_sort.add_argument('-f', action='store', dest='field', type=str, required=True,
1038 help='The annotation field by which to sort records.')
1039 parser_sort.add_argument('--num', action='store_true', dest='numeric', default=False,
1040 help='''Specify to define the sort column as numeric rather
1041 than textual.''')
1042 parser_sort.add_argument('--descend', action='store_true', dest='descend',
1043 help='''If specified, sort records in descending, rather
1044 than ascending, order by values in the target field.''')
1045 parser_sort.set_defaults(func=sortDbFile)
1046
1047 # Subparser to update records
1048 parser_update = subparsers.add_parser('update', parents=[parser_parent],
1049 formatter_class=CommonHelpFormatter,
1050 help='Updates field and value pairs')
1051 parser_update.add_argument('-f', action='store', dest='field', required=True,
1052 help='The name of the field to update.')
1053 parser_update.add_argument('-u', nargs='+', action='store', dest='values', required=True,
1054 help='The values that will be replaced.')
1055 parser_update.add_argument('-t', nargs='+', action='store', dest='updates', required=True,
1056 help='''The new value to assign to each selected row.''')
1057 parser_update.set_defaults(func=updateDbFile)
1058
1059 return parser
1060
1061
if __name__ == '__main__':
    # Script entry point: parse command line arguments, normalize field-name
    # arguments, validate paired-list options, then dispatch each input
    # database file to the selected subcommand's worker function.
    parser = getArgParser()
    args = parser.parse_args()
    args_dict = parseCommonArgs(args)

    # Database column names are upper case, so normalize any field-name
    # arguments the user supplied in lower or mixed case.
    for key in ('id_field', 'seq_field', 'germ_field', 'field'):
        if key in args_dict:
            args_dict[key] = args_dict[key].upper()
    if args_dict.get('cluster_field') is not None:
        args_dict['cluster_field'] = args_dict['cluster_field'].upper()
    if args_dict.get('meta_fields') is not None:
        args_dict['meta_fields'] = [f.upper() for f in args_dict['meta_fields']]
    if 'fields' in args_dict:
        args_dict['fields'] = [f.upper() for f in args_dict['fields']]

    # The add/rename/update subcommands take parallel lists; their lengths
    # must match one-to-one.
    if args.command == 'add' and len(args_dict['fields']) != len(args_dict['values']):
        parser.error('You must specify exactly one value (-u) per field (-f)')
    elif args.command == 'rename' and len(args_dict['fields']) != len(args_dict['names']):
        parser.error('You must specify exactly one new name (-k) per field (-f)')
    elif args.command == 'update' and len(args_dict['values']) != len(args_dict['updates']):
        parser.error('You must specify exactly one value (-u) per replacement (-t)')

    # Strip dispatch bookkeeping from the keyword arguments, then invoke the
    # subcommand function once per input database file.
    for key in ('command', 'func', 'db_files'):
        del args_dict[key]
    for db_file in args.db_files:
        args_dict['db_file'] = db_file
        args.func(**args_dict)
1101