annotate ParseDb.py @ 6:3ddd933dd7a2 draft default tip

Uploaded
author davidvanzessen
date Thu, 15 Sep 2016 03:54:33 -0400
parents dda9b2e72e2b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1 #!/usr/bin/env python3
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
2 """
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
3 Parses tab delimited database files
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
4 """
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
5 # Info
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
6 __author__ = 'Jason Anthony Vander Heiden'
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
7 from changeo import __version__, __date__
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
8
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
9 # Imports
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
10 import csv
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
11 import os
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
12 import re
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
13 from argparse import ArgumentParser
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
14 from collections import OrderedDict
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
15
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
16 from textwrap import dedent
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
17 from time import time
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
18 from Bio import SeqIO
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
19 from Bio.Seq import Seq
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
20 from Bio.SeqRecord import SeqRecord
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
21 from Bio.Alphabet import IUPAC
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
22
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
23 # Presto and changeo imports
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
24 from presto.Defaults import default_delimiter, default_out_args
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
25 from presto.Annotation import flattenAnnotation
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
26 from presto.IO import getOutputHandle, printLog, printProgress, printMessage
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
27 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
28 from changeo.IO import getDbWriter, readDbFile, countDbFile
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
29
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
30 # Defaults
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
# Default column holding the record identifier (default for id_field below)
default_id_field = 'SEQUENCE_ID'
# Default column holding the sample sequence (default for seq_field below)
default_seq_field = 'SEQUENCE_IMGT'
# Default column holding the germline sequence (default for germ_field below)
default_germ_field = 'GERMLINE_IMGT_D_MASK'
# Default name of the index column
default_index_field = 'INDEX'
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
35
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
36 # TODO: convert SQL-ish operations to modify_func() as per ParseHeaders
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
37
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
def getDbSeqRecord(db_record, id_field, seq_field, meta_fields=None,
                   delimiter=default_delimiter):
    """
    Builds a SeqRecord from a single database row

    Arguments:
    db_record = a dictionary containing a database record
    id_field = the field containing identifiers
    seq_field = the field containing sequences
    meta_fields = a list of fields to add to sequence annotations;
                  fields absent from db_record are ignored
    delimiter = a tuple of delimiters for (fields, values, value lists)

    Returns:
    a SeqRecord, or None if the identifier or the sequence value is empty
    """
    # An empty identifier short-circuits before the sequence is looked up
    identifier = db_record[id_field]
    if not identifier:
        return None
    sequence = db_record[seq_field]
    if not sequence:
        return None

    # Collect the annotations, ID first, then any requested fields present in the row
    annotation = OrderedDict()
    annotation['ID'] = identifier
    if meta_fields is not None:
        for name in meta_fields:
            if name in db_record:
                annotation[name] = db_record[name]

    # The flattened annotation serves as both the id and the name of the record
    header = flattenAnnotation(annotation, delimiter=delimiter)

    return SeqRecord(Seq(sequence, IUPAC.ambiguous_dna),
                     id=header, name=header, description='')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
68
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
69
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
def splitDbFile(db_file, field, num_split=None, out_args=default_out_args):
    """
    Divides a tab-delimited database file into segments by the values of a field

    Arguments:
    db_file = filename of the tab-delimited database file to split
    field = the field name by which to split db_file
    num_split = the numerical threshold by which to group sequences;
                if None treat field as textual and write one file per
                distinct value
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    a list of output file names
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'split'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUM_SPLIT'] = num_split
    printLog(log)

    # Open reader iter object
    reader = readDbFile(db_file, ig=False)

    # Determine total numbers of records
    rec_count = countDbFile(db_file)

    start_time = time()
    count = 0
    # Filled one handle at a time so that the finally clause can close
    # whatever was opened, even if a later step fails
    handles_dict = OrderedDict()
    try:
        if num_split is None:
            # Textual split: one output file per distinct value of the field.
            # Sorted so that the order of the outputs is reproducible.
            tmp_iter = readDbFile(db_file, ig=False)
            tag_list = sorted(set(row[field] for row in tmp_iter))

            # Forbidden characters in filename and replacements.
            # Keys must be single characters; the earlier '\/', '\%' and '\|'
            # spellings were two-character strings (backslash + character)
            # and so never matched a bare '/', '%' or '|'.
            # NOTE(review): '<' -> 'gt' and '>' -> 'lt' look swapped but are
            # kept so that existing output names do not change.
            no_good = str.maketrans({'/': 'f', '\\': 'b', '?': 'q', '%': 'p',
                                     '*': 's', ':': 'c', '|': 'pi', '"': 'dq',
                                     '\'': 'sq', '<': 'gt', '>': 'lt', ' ': '_'})

            # Map each raw tag to the label used in its file name.
            # NOTE(review): distinct tags that sanitize to the same label
            # (e.g. 'a b' and 'a_b') still share one file name.
            labels = OrderedDict((tag, '%s-%s' % (field, tag.translate(no_good)))
                                 for tag in tag_list)
        else:
            # Numeric split: rows below the threshold versus all others
            num_split = float(num_split)
            labels = OrderedDict((tag, '%s-%.1f' % (tag, num_split))
                                 for tag in ('under', 'atleast'))

        # Create output handles
        for tag, label in labels.items():
            handles_dict[tag] = getOutputHandle(db_file,
                                                label,
                                                out_type=out_args['out_type'],
                                                out_name=out_args['out_name'],
                                                out_dir=out_args['out_dir'])

        # Create Db writer instances
        writers_dict = {tag: getDbWriter(handle, db_file)
                        for tag, handle in handles_dict.items()}

        # Iterate over records and write each row to the appropriate file
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            if num_split is None:
                tag = row[field]
            else:
                tag = 'under' if float(row[field]) < num_split else 'atleast'
            writers_dict[tag].writerow(row)

        # Write log
        printProgress(count, rec_count, 0.05, start_time)
        log = OrderedDict()
        for i, handle in enumerate(handles_dict.values()):
            log['OUTPUT%i' % (i + 1)] = os.path.basename(handle.name)
        log['RECORDS'] = rec_count
        log['PARTS'] = len(handles_dict)
        log['END'] = 'ParseDb'
        printLog(log)
    finally:
        # Close output file handles, including on failure
        for handle in handles_dict.values():
            handle.close()

    return [handle.name for handle in handles_dict.values()]
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
178
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
179
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
180 # TODO: SHOULD ALLOW FOR UNSORTED CLUSTER COLUMN
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
181 # TODO: SHOULD ALLOW FOR GROUPING FIELDS
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
def convertDbClip(db_file, id_field=default_id_field, seq_field=default_seq_field,
                  germ_field=default_germ_field, cluster_field=None,
                  meta_fields=None, out_args=default_out_args):
    """
    Builds a clip file of germline and sample sequences from database records

    Germline records are written in fasta format with an extra '>' prepended
    to their identifier, followed by the sample sequence of the row.

    Arguments:
    db_file = the database file name
    id_field = the field containing identifiers
    seq_field = the field containing sample sequences
    germ_field = the field containing germline sequences
    cluster_field = the field containing clonal groupings;
                    if None write the germline for each record,
                    otherwise write it only when the cluster value differs
                    from that of the previous row
    meta_fields = a list of fields to add to sequence annotations
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    # NOTE(review): logged as 'fasta' although the output type is 'clip';
    # left unchanged, confirm whether this is intended
    log['COMMAND'] = 'fasta'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    log['GERM_FIELD'] = germ_field
    log['CLUSTER_FIELD'] = cluster_field
    if meta_fields is not None:  log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='clip')
    try:
        # Count records
        result_count = countDbFile(db_file)

        # Iterate over records
        start_time = time()
        rec_count = germ_count = pass_count = fail_count = 0
        cluster_last = None
        for rec in db_iter:
            # Print progress for previous iteration
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += 1

            # Update cluster ID
            cluster = rec.get(cluster_field, None)

            # Get germline SeqRecord when needed
            if cluster_field is None:
                germ = getDbSeqRecord(rec, id_field, germ_field, meta_fields,
                                      delimiter=out_args['delimiter'])
            elif cluster != cluster_last:
                germ = getDbSeqRecord(rec, cluster_field, germ_field,
                                      delimiter=out_args['delimiter'])
            else:
                germ = None

            # getDbSeqRecord returns None for an empty identifier or germline,
            # so the germline marker may only be added to a real record
            if germ is not None:
                germ.id = '>' + germ.id

            # Get read SeqRecord
            seq = getDbSeqRecord(rec, id_field, seq_field, meta_fields,
                                 delimiter=out_args['delimiter'])

            # Write germline
            if germ is not None:
                germ_count += 1
                SeqIO.write(germ, pass_handle, 'fasta')

            # Write sequences
            if seq is not None:
                pass_count += 1
                SeqIO.write(seq, pass_handle, 'fasta')
            else:
                fail_count += 1

            # Set last cluster ID
            cluster_last = cluster

        # Print counts
        printProgress(rec_count, result_count, 0.05, start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['GERMLINES'] = germ_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        log['END'] = 'ParseDb'
        printLog(log)
    finally:
        # Close file handles, including on failure
        pass_handle.close()

    return pass_handle.name
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
277
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
278
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
def convertDbFasta(db_file, id_field=default_id_field, seq_field=default_seq_field,
                   meta_fields=None, out_args=default_out_args):
    """
    Writes the sequences of a database file to a fasta file

    Arguments:
    db_file = the database file name
    id_field = the field containing identifiers
    seq_field = the field containing sequences
    meta_fields = a list of fields to add to sequence annotations
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    # Report the run parameters
    start_log = OrderedDict([('START', 'ParseDb'),
                             ('COMMAND', 'fasta'),
                             ('FILE', os.path.basename(db_file)),
                             ('ID_FIELD', id_field),
                             ('SEQ_FIELD', seq_field)])
    if meta_fields is not None:
        start_log['META_FIELDS'] = ','.join(meta_fields)
    printLog(start_log)

    # Open the input iterator and the output file
    out_type = 'fasta'
    records = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='sequences',
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_type)

    # Total number of rows, used only for progress reporting
    total = countDbFile(db_file)

    began = time()
    seen = written = skipped = 0
    for seen, row in enumerate(records, start=1):
        # Progress reflects the rows completed before this one
        printProgress(seen - 1, total, 0.05, began)

        # Rows with an empty identifier or sequence yield None and are counted as failures
        record = getDbSeqRecord(row, id_field, seq_field, meta_fields, out_args['delimiter'])
        if record is None:
            skipped += 1
            continue

        written += 1
        SeqIO.write(record, out_handle, out_type)

    # Report the final counts
    printProgress(seen, total, 0.05, began)
    end_log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                           ('RECORDS', seen),
                           ('PASS', written),
                           ('FAIL', skipped),
                           ('END', 'ParseDb')])
    printLog(end_log)

    # Close file handles
    out_handle.close()

    return out_handle.name
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
343
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
344
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
def addDbFile(db_file, fields, values, out_args=default_out_args):
    """
    Adds field and value pairs to a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to add
      values = a list of values to assign to all rows of each field
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'add'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-add', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')

    # Guarantee the output handle is closed even if writing fails part way through
    try:
        pass_writer = getDbWriter(pass_handle, db_file, add_fields=fields)
        # Count records
        result_count = countDbFile(db_file)

        # Define fields and values to append; fields already present in the
        # input are skipped so that existing columns are never overwritten.
        # zip() pairs fields with values positionally and stops at the shorter list.
        add_dict = {k: v for k, v in zip(fields, values) if k not in db_iter.fieldnames}

        # Iterate over records
        start_time = time()
        rec_count = 0
        for rec in db_iter:
            # Print progress for previous iteration
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += 1
            # Write updated row
            rec.update(add_dict)
            pass_writer.writerow(rec)

        # Print counts
        printProgress(rec_count, result_count, 0.05, start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['END'] = 'ParseDb'
        printLog(log)
    finally:
        # Close file handles
        pass_handle.close()

    return pass_handle.name
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
400
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
401
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
def indexDbFile(db_file, field=default_index_field, out_args=default_out_args):
    """
    Adds an index column to a database file

    Arguments:
      db_file = the database file name
      field = the name of the index field to add
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'index'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-index', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')

    # Guarantee the output handle is closed even if writing fails part way through
    try:
        pass_writer = getDbWriter(pass_handle, db_file, add_fields=field)
        # Count records
        result_count = countDbFile(db_file)

        # Iterate over records
        start_time = time()
        rec_count = 0
        for rec in db_iter:
            # Print progress for previous iteration
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += 1

            # The index is the 1-based position of the record in the input file
            rec[field] = rec_count
            pass_writer.writerow(rec)

        # Print counts
        printProgress(rec_count, result_count, 0.05, start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['END'] = 'ParseDb'
        printLog(log)
    finally:
        # Close file handles
        pass_handle.close()

    return pass_handle.name
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
453
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
454
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
def dropDbFile(db_file, fields, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to drop
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    # Report the command actually being run (was mistakenly logged as 'add')
    log['COMMAND'] = 'drop'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-drop', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')

    # Guarantee the output handle is closed even if writing fails part way through
    try:
        # The writer is built without the dropped fields; records are passed
        # through unchanged and the writer is relied upon to omit those columns
        pass_writer = getDbWriter(pass_handle, db_file, exclude_fields=fields)
        # Count records
        result_count = countDbFile(db_file)

        # Iterate over records
        start_time = time()
        rec_count = 0
        for rec in db_iter:
            # Print progress for previous iteration
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += 1
            # Write row
            pass_writer.writerow(rec)

        # Print counts
        printProgress(rec_count, result_count, 0.05, start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['END'] = 'ParseDb'
        printLog(log)
    finally:
        # Close file handles
        pass_handle.close()

    return pass_handle.name
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
504
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
505
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
def deleteDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Deletes records from a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to check for deletion criteria
      values = a list of values defining deletion targets
      logic = one of 'any' or 'all' defining whether one or all fields must have a match.
      regex = if False do exact full string matches; if True allow partial regex matches.
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name

    Raises:
      ValueError if logic is not one of 'any' or 'all'
    """
    # Define logic function; reject unknown values before any file is created
    # instead of failing later on an unbound name
    logic_funcs = {'any': any, 'all': all}
    if logic not in logic_funcs:
        raise ValueError("logic must be one of 'any' or 'all', not %r" % (logic,))
    _logic_func = logic_funcs[logic]

    # Define string match function
    if regex:
        # Compile the patterns once rather than once per record
        compiled = [re.compile(p) for p in values]

        def _match_func(x):
            # A field that is absent from the record (non-string placeholder)
            # can never match a pattern
            return isinstance(x, str) and any(p.search(x) for p in compiled)
    else:
        def _match_func(x):
            return x in values

    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'delete'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-delete', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')

    # Guarantee the output handle is closed even if writing fails part way through
    try:
        pass_writer = getDbWriter(pass_handle, db_file)
        # Count records
        result_count = countDbFile(db_file)

        # Iterate over records
        start_time = time()
        rec_count = pass_count = fail_count = 0
        for rec in db_iter:
            # Print progress for previous iteration
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += 1

            # Check for deletion values in all fields
            delete = _logic_func([_match_func(rec.get(f, False)) for f in fields])

            # Write only the records that are not marked for deletion
            if not delete:
                pass_count += 1
                pass_writer.writerow(rec)
            else:
                fail_count += 1

        # Print counts
        printProgress(rec_count, result_count, 0.05, start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['KEPT'] = pass_count
        log['DELETED'] = fail_count
        log['END'] = 'ParseDb'
        printLog(log)
    finally:
        # Close file handles
        pass_handle.close()

    return pass_handle.name
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
582
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
583
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
def renameDbFile(db_file, fields, names, out_args=default_out_args):
    """
    Renames fields in a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to rename
      names = a list of new names for fields, in the same order as fields
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name

    Raises:
      ValueError if a field to rename is not present in the file header
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'rename'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['NAMES'] = ','.join(names)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-rename', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')

    # Guarantee the output handle is closed even if writing fails part way through
    try:
        # Get header and rename fields.
        # Work on a copy of the reader's header: this avoids opening a second
        # reader (whose file handle would never be closed) while leaving the
        # reader's own field list, and therefore the keys of the records it
        # yields, untouched.
        header = list(db_iter.fieldnames)
        for f, n in zip(fields, names):
            # list.index raises ValueError when the field is not in the header
            i = header.index(f)
            header[i] = n

        # Open writer and write new header
        # TODO: should modify getDbWriter to take a list of fields
        pass_writer = csv.DictWriter(pass_handle, fieldnames=header, dialect='excel-tab')
        pass_writer.writeheader()

        # Count records
        result_count = countDbFile(db_file)

        # Iterate over records
        start_time = time()
        rec_count = 0
        for rec in db_iter:
            # Print progress for previous iteration
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += 1
            # TODO: repeating renaming is unnecessary. should add a non-dict reader/writer to DbCore
            # Rename fields by moving each value to its new key
            for f, n in zip(fields, names):
                rec[n] = rec.pop(f)
            # Write
            pass_writer.writerow(rec)

        # Print counts
        printProgress(rec_count, result_count, 0.05, start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['END'] = 'ParseDb'
        printLog(log)
    finally:
        # Close file handles
        pass_handle.close()

    return pass_handle.name
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
650
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
651
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
652 def selectDbFile(db_file, fields, values, logic='any', regex=False,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
653 out_args=default_out_args):
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
654 """
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
655 Selects records from a database file
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
656
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
657 Arguments:
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
658 db_file = the database file name
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
659 fields = a list of fields to check for selection criteria
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
660 values = a list of values defining selection targets
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
661 logic = one of 'any' or 'all' defining whether one or all fields must have a match.
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
662 regex = if False do exact full string matches; if True allow partial regex matches.
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
663 out_args = common output argument dictionary from parseCommonArgs
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
664
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
665 Returns:
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
666 the output file name
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
667 """
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
668 # Define string match function
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
669 if regex:
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
670 def _match_func(x, patterns): return any([re.search(p, x) for p in patterns])
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
671 else:
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
672 def _match_func(x, patterns): return x in patterns
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
673
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
674 # Define logic function
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
675 if logic == 'any':
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
676 _logic_func = any
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
677 elif logic == 'all':
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
678 _logic_func = all
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
679
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
680 # Print console log
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
681 log = OrderedDict()
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
682 log['START'] = 'ParseDb'
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
683 log['COMMAND'] = 'select'
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
684 log['FILE'] = os.path.basename(db_file)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
685 log['FIELDS'] = ','.join(fields)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
686 log['VALUES'] = ','.join(values)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
687 log['REGEX'] =regex
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
688 printLog(log)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
689
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
690 # Open file handles
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
691 db_iter = readDbFile(db_file, ig=False)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
692 pass_handle = getOutputHandle(db_file, out_label='parse-select', out_dir=out_args['out_dir'],
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
693 out_name=out_args['out_name'], out_type='tab')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
694 pass_writer = getDbWriter(pass_handle, db_file)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
695 # Count records
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
696 result_count = countDbFile(db_file)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
697
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
698 # Iterate over records
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
699 start_time = time()
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
700 rec_count = pass_count = fail_count = 0
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
701 for rec in db_iter:
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
702 # Print progress for previous iteration
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
703 printProgress(rec_count, result_count, 0.05, start_time)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
704 rec_count += 1
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
705
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
706 # Check for selection values in all fields
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
707 select = _logic_func([_match_func(rec.get(f, False), values) for f in fields])
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
708
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
709 # Write sequences
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
710 if select:
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
711 pass_count += 1
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
712 pass_writer.writerow(rec)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
713 else:
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
714 fail_count += 1
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
715
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
716 # Print counts
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
717 printProgress(rec_count, result_count, 0.05, start_time)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
718 log = OrderedDict()
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
719 log['OUTPUT'] = os.path.basename(pass_handle.name)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
720 log['RECORDS'] = rec_count
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
721 log['SELECTED'] = pass_count
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
722 log['DISCARDED'] = fail_count
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
723 log['END'] = 'ParseDb'
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
724 printLog(log)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
725
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
726 # Close file handles
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
727 pass_handle.close()
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
728
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
729 return pass_handle.name
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
730
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
731
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
def sortDbFile(db_file, field, numeric=False, descend=False,
               out_args=default_out_args):
    """
    Sorts records by values in an annotation field

    All records are loaded into memory before sorting, so memory use grows
    with the size of the database file.

    Arguments:
      db_file = the database filename
      field = the field name to sort by
      numeric = if True sort field numerically (empty values are treated as 0);
                if False sort field alphabetically
      descend = if True sort in descending order;
                if False sort in ascending order
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'sort'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUMERIC'] = numeric
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-sort', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')

    # The output handle is closed in the finally clause so that a missing
    # field (KeyError) or a non-numeric value (ValueError) does not leak it.
    try:
        pass_writer = getDbWriter(pass_handle, db_file)

        # Store all records in a list, preserving input order
        start_time = time()
        printMessage("Indexing: Running", start_time=start_time)
        records = list(db_iter)
        result_count = len(records)

        # Extract the sort value of every record, then order record indices by it
        sort_values = [rec[field] for rec in records]
        if numeric:
            # Empty values sort as 0
            sort_values = [float(v or 0) for v in sort_values]
        sorted_index = sorted(range(result_count), key=sort_values.__getitem__,
                              reverse=descend)
        printMessage("Indexing: Done", start_time=start_time, end=True)

        # Iterate over records
        start_time = time()
        rec_count = 0
        for i in sorted_index:
            # Print progress for previous iteration
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += 1

            # Write records
            pass_writer.writerow(records[i])

        # Print counts
        printProgress(rec_count, result_count, 0.05, start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['END'] = 'ParseDb'
        printLog(log)
    finally:
        # Close file handles
        pass_handle.close()

    return pass_handle.name
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
800
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
801
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
def updateDbFile(db_file, field, values, updates, out_args=default_out_args):
    """
    Updates field and value pairs to a database file

    Each record is updated at most once: the first entry of values that
    equals the record's current field value determines the replacement.
    values and updates are paired positionally; surplus entries in the
    longer list are ignored.

    Arguments:
      db_file = the database file name
      field = the field to update
      values = a list of values specifying which rows to update
      updates = a list of values to update each value with
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'update'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['VALUES'] = ','.join(values)
    log['UPDATES'] = ','.join(updates)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-update', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')

    # The output handle is closed in the finally clause so that an error
    # while processing records (e.g. a missing field) does not leak it.
    try:
        pass_writer = getDbWriter(pass_handle, db_file)
        # Count records
        result_count = countDbFile(db_file)

        # Iterate over records
        start_time = time()
        rec_count = pass_count = 0
        for rec in db_iter:
            # Print progress for previous iteration
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += 1

            # Update value if found
            for x, y in zip(values, updates):
                if rec[field] == x:
                    rec[field] = y
                    pass_count += 1
                    # Stop at the first match; otherwise the new value could
                    # match a later entry of values, cascading the update and
                    # counting the same record more than once.
                    break

            # Write records
            pass_writer.writerow(rec)

        # Print counts
        printProgress(rec_count, result_count, 0.05, start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['UPDATED'] = pass_count
        log['END'] = 'ParseDb'
        printLog(log)
    finally:
        # Close file handles
        pass_handle.close()

    return pass_handle.name
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
863
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
864
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
865 def getArgParser():
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
866 """
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
867 Defines the ArgumentParser
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
868
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
869 Arguments:
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
870 None
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
871
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
872 Returns:
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
873 an ArgumentParser object
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
874 """
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
875 # Define input and output field help message
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
876 fields = dedent(
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
877 '''
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
878 output files:
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
879 sequences
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
880 FASTA formatted sequences output from the subcommands fasta and clip.
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
881 <field>-<value>
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
882 database files partitioned by annotation <field> and <value>.
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
883 parse-<command>
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
884 output of the database modification functions where <command> is one of
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
885 the subcommands add, index, drop, delete, rename, select, sort or update.
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
886
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
887 required fields:
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
888 SEQUENCE_ID
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
889
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
890 optional fields:
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
891 JUNCTION, SEQUENCE_IMGT, SEQUENCE_VDJ, GERMLINE_IMGT, GERMLINE_VDJ,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
892 GERMLINE_IMGT_D_MASK, GERMLINE_VDJ_D_MASK,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
893 GERMLINE_IMGT_V_REGION, GERMLINE_VDJ_V_REGION
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
894
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
895 output fields:
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
896 None
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
897 ''')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
898
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
899 # Define ArgumentParser
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
900 parser = ArgumentParser(description=__doc__, epilog=fields,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
901 formatter_class=CommonHelpFormatter)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
902 parser.add_argument('--version', action='version',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
903 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
904 subparsers = parser.add_subparsers(title='subcommands', dest='command', metavar='',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
905 help='Database operation')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
906 # TODO: This is a temporary fix for Python issue 9253
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
907 subparsers.required = True
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
908
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
909 # Define parent parser
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
910 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
911 failed=False, log=False)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
912
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
913 # Subparser to convert database entries to sequence file
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
914 parser_seq = subparsers.add_parser('fasta', parents=[parser_parent],
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
915 formatter_class=CommonHelpFormatter,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
916 help='Creates a fasta file from database records')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
917 parser_seq.add_argument('--if', action='store', dest='id_field',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
918 default=default_id_field,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
919 help='The name of the field containing identifiers')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
920 parser_seq.add_argument('--sf', action='store', dest='seq_field',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
921 default=default_seq_field,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
922 help='The name of the field containing sequences')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
923 parser_seq.add_argument('--mf', nargs='+', action='store', dest='meta_fields',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
924 help='List of annotation fields to add to the sequence description')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
925 parser_seq.set_defaults(func=convertDbFasta)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
926
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
927 # Subparser to convert database entries to clip-fasta file
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
928 parser_clip = subparsers.add_parser('clip', parents=[parser_parent],
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
929 formatter_class=CommonHelpFormatter,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
930 help='''Creates a clip-fasta file from database
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
931 records, wherein germline sequences precede
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
932 each clone and are denoted by ">>" headers.''')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
933 parser_clip.add_argument('--if', action='store', dest='id_field',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
934 default=default_id_field,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
935 help='The name of the field containing identifiers')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
936 parser_clip.add_argument('--sf', action='store', dest='seq_field',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
937 default=default_seq_field,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
938 help='The name of the field containing reads')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
939 parser_clip.add_argument('--gf', action='store', dest='germ_field',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
940 default=default_germ_field,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
941 help='The name of the field containing germline sequences')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
942 parser_clip.add_argument('--cf', action='store', dest='cluster_field', default=None,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
943 help='The name of the field containing containing sorted clone IDs')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
944 parser_clip.add_argument('--mf', nargs='+', action='store', dest='meta_fields',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
945 help='List of annotation fields to add to the sequence description')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
946 parser_clip.set_defaults(func=convertDbClip)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
947
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
948 # Subparser to partition files by annotation values
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
949 parser_split = subparsers.add_parser('split', parents=[parser_parent],
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
950 formatter_class=CommonHelpFormatter,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
951 help='Splits database files by field values')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
952 parser_split.add_argument('-f', action='store', dest='field', type=str, required=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
953 help='Annotation field by which to split database files.')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
954 parser_split.add_argument('--num', action='store', dest='num_split', type=float, default=None,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
955 help='''Specify to define the field as numeric and group
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
956 records by whether they are less than or at least
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
957 (greater than or equal to) the specified value.''')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
958 parser_split.set_defaults(func=splitDbFile)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
959
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
960 # Subparser to add records
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
961 parser_add = subparsers.add_parser('add', parents=[parser_parent],
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
962 formatter_class=CommonHelpFormatter,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
963 help='Adds field and value pairs')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
964 parser_add.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
965 help='The name of the fields to add.')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
966 parser_add.add_argument('-u', nargs='+', action='store', dest='values', required=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
967 help='The value to assign to all rows for each field.')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
968 parser_add.set_defaults(func=addDbFile)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
969
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
970 # Subparser to delete records
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
971 parser_delete = subparsers.add_parser('delete', parents=[parser_parent],
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
972 formatter_class=CommonHelpFormatter,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
973 help='Deletes specific records')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
974 parser_delete.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
975 help='The name of the fields to check for deletion criteria.')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
976 parser_delete.add_argument('-u', nargs='+', action='store', dest='values', default=['', 'NA'],
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
977 help='''The values defining which records to delete. A value
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
978 may appear in any of the fields specified with -f.''')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
979 parser_delete.add_argument('--logic', action='store', dest='logic',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
980 choices=('any', 'all'), default='any',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
981 help='''Defines whether a value may appear in any field (any)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
982 or whether it must appear in all fields (all).''')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
983 parser_delete.add_argument('--regex', action='store_true', dest='regex',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
984 help='''If specified, treat values as regular expressions
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
985 and allow partial string matches.''')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
986 parser_delete.set_defaults(func=deleteDbFile)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
987
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
988 # Subparser to drop fields
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
989 parser_drop = subparsers.add_parser('drop', parents=[parser_parent],
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
990 formatter_class=CommonHelpFormatter,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
991 help='Deletes entire fields')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
992 parser_drop.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
993 help='The name of the fields to delete from the database.')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
994 parser_drop.set_defaults(func=dropDbFile)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
995
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
996 # Subparser to index fields
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
997 parser_index = subparsers.add_parser('index', parents=[parser_parent],
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
998 formatter_class=CommonHelpFormatter,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
999 help='Adds a numeric index field')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1000 parser_index.add_argument('-f', action='store', dest='field',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1001 default=default_index_field,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1002 help='The name of the index field to add to the database.')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1003 parser_index.set_defaults(func=indexDbFile)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1004
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1005 # Subparser to rename fields
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1006 parser_rename = subparsers.add_parser('rename', parents=[parser_parent],
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1007 formatter_class=CommonHelpFormatter,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1008 help='Renames fields')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1009 parser_rename.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1010 help='List of fields to rename.')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1011 parser_rename.add_argument('-k', nargs='+', action='store', dest='names', required=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1012 help='List of new names for each field.')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1013 parser_rename.set_defaults(func=renameDbFile)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1014
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1015 # Subparser to select records
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1016 parser_select = subparsers.add_parser('select', parents=[parser_parent],
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1017 formatter_class=CommonHelpFormatter,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1018 help='Selects specific records')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1019 parser_select.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1020 help='The name of the fields to check for selection criteria.')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1021 parser_select.add_argument('-u', nargs='+', action='store', dest='values', required=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1022 help='''The values defining with records to select. A value
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1023 may appear in any of the fields specified with -f.''')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1024 parser_select.add_argument('--logic', action='store', dest='logic',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1025 choices=('any', 'all'), default='any',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1026 help='''Defines whether a value may appear in any field (any)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1027 or whether it must appear in all fields (all).''')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1028 parser_select.add_argument('--regex', action='store_true', dest='regex',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1029 help='''If specified, treat values as regular expressions
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1030 and allow partial string matches.''')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1031 parser_select.set_defaults(func=selectDbFile)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1032
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1033 # Subparser to sort file by records
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1034 parser_sort = subparsers.add_parser('sort', parents=[parser_parent],
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1035 formatter_class=CommonHelpFormatter,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1036 help='Sorts records by field values')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1037 parser_sort.add_argument('-f', action='store', dest='field', type=str, required=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1038 help='The annotation field by which to sort records.')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1039 parser_sort.add_argument('--num', action='store_true', dest='numeric', default=False,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1040 help='''Specify to define the sort column as numeric rather
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1041 than textual.''')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1042 parser_sort.add_argument('--descend', action='store_true', dest='descend',
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1043 help='''If specified, sort records in descending, rather
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1044 than ascending, order by values in the target field.''')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1045 parser_sort.set_defaults(func=sortDbFile)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1046
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1047 # Subparser to update records
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1048 parser_update = subparsers.add_parser('update', parents=[parser_parent],
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1049 formatter_class=CommonHelpFormatter,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1050 help='Updates field and value pairs')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1051 parser_update.add_argument('-f', action='store', dest='field', required=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1052 help='The name of the field to update.')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1053 parser_update.add_argument('-u', nargs='+', action='store', dest='values', required=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1054 help='The values that will be replaced.')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1055 parser_update.add_argument('-t', nargs='+', action='store', dest='updates', required=True,
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1056 help='''The new value to assign to each selected row.''')
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1057 parser_update.set_defaults(func=updateDbFile)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1058
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1059 return parser
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1060
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1061
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: parse the command line, upper-case every field
    # name argument, validate options that must come in pairs, and then run
    # the selected subcommand once per input database file.
    parser = getArgParser()
    args = parser.parse_args()
    args_dict = parseCommonArgs(args)

    # Single-valued field name options: upper-cased whenever the key is present.
    for option in ('id_field', 'seq_field', 'germ_field', 'field'):
        if option in args_dict:
            args_dict[option] = args_dict[option].upper()

    # Optional field name options: only converted when a value was supplied.
    if args_dict.get('cluster_field') is not None:
        args_dict['cluster_field'] = args_dict['cluster_field'].upper()
    if args_dict.get('meta_fields') is not None:
        args_dict['meta_fields'] = [name.upper() for name in args_dict['meta_fields']]

    # Multi-valued field name option.
    if 'fields' in args_dict:
        args_dict['fields'] = [name.upper() for name in args_dict['fields']]

    # Subcommands whose two list options must have the same length, mapped to
    # the two argument keys and the error reported on a mismatch.
    paired_options = {
        'add': ('fields', 'values',
                'You must specify exactly one value (-u) per field (-f)'),
        'rename': ('fields', 'names',
                   'You must specify exactly one new name (-k) per field (-f)'),
        'update': ('values', 'updates',
                   'You must specify exactly one value (-u) per replacement (-t)'),
    }
    if args.command in paired_options:
        first_key, second_key, message = paired_options[args.command]
        if len(args_dict[first_key]) != len(args_dict[second_key]):
            parser.error(message)

    # Remove the entries that are not keyword arguments of the subcommand
    # function, then invoke it once for each database file.
    for key in ('command', 'func', 'db_files'):
        del args_dict[key]
    for db_file in args.db_files:
        args_dict['db_file'] = db_file
        args.func(**args_dict)
dda9b2e72e2b Uploaded
davidvanzessen
parents:
diff changeset
1101