Mercurial repository davidvanzessen / mutation_analysis: comparison view of change_o/MakeDb.py at changeset 120:613278c1bde0 (draft)
Commit message: Uploaded
| author | davidvanzessen |
|---|---|
| date | Tue, 16 Aug 2016 09:10:50 -0400 |
| parents | |
| children | |
Revisions compared:

| 119:626a956f3811 | 120:613278c1bde0 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 """ | |
| 3 Create tab-delimited database file to store sequence alignment information | |
| 4 """ | |
| 5 # Info | |
| 6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden' | |
| 7 from changeo import __version__, __date__ | |
| 8 | |
| 9 # Imports | |
| 10 import csv | |
| 11 import os | |
| 12 import re | |
| 13 import sys | |
| 14 import pandas as pd | |
| 15 import tarfile | |
| 16 import zipfile | |
| 17 from argparse import ArgumentParser | |
| 18 from collections import OrderedDict | |
| 19 from itertools import groupby | |
| 20 from shutil import rmtree | |
| 21 from tempfile import mkdtemp | |
| 22 from textwrap import dedent | |
| 23 from time import time | |
| 24 from Bio import SeqIO | |
| 25 from Bio.Seq import Seq | |
| 26 from Bio.Alphabet import IUPAC | |
| 27 | |
| 28 # Presto and changeo imports | |
| 29 from presto.Defaults import default_out_args | |
| 30 from presto.Annotation import parseAnnotation | |
| 31 from presto.IO import countSeqFile, printLog, printProgress | |
| 32 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs | |
| 33 from changeo.IO import getDbWriter, countDbFile, getRepo | |
| 34 from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \ | |
| 35 j_allele_regex | |
| 36 | |
| 37 # Default parameters | |
| 38 default_delimiter = ('\t', ',', '-') | |
| 39 | |
| 40 | |
| 41 def gapV(ig_dict, repo_dict): | |
| 42 """ | |
| 43 Insert gaps into V region and update alignment information | |
| 44 | |
| 45 Arguments: | |
| 46 ig_dict : Dictionary of parsed IgBlast output | |
| 47 repo_dict : Dictionary of IMGT gapped germline sequences | |
| 48 | |
| 49 Returns: | |
| 50 dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields | |
| 51 """ | |
| 52 | |
| 53 seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ'] | |
| 54 | |
| 55 # Find gapped germline V segment | |
| 56 vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first') | |
| 57 vkey = (vgene, ) | |
| 58 #TODO: Figure out else case | |
| 59 if vkey in repo_dict: | |
| 60 vgap = repo_dict[vkey] | |
| 61 # Iterate over gaps in the germline segment | |
| 62 gaps = re.finditer(r'\.', vgap) | |
| 63 gapcount = int(ig_dict['V_GERM_START_VDJ'])-1 | |
| 64 for gap in gaps: | |
| 65 i = gap.start() | |
| 66 # Break if gap begins after V region | |
| 67 if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount: | |
| 68 break | |
| 69 # Insert gap into IMGT sequence | |
| 70 seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:] | |
| 71 # Update gap counter | |
| 72 gapcount += 1 | |
| 73 ig_dict['SEQUENCE_IMGT'] = seq_imgt | |
| 74 # Update IMGT positioning information for V | |
| 75 ig_dict['V_GERM_START_IMGT'] = 1 | |
| 76 ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount | |
| 77 | |
| 78 return ig_dict | |
| 79 | |
| 80 | |
| 81 def getIMGTJunc(ig_dict, repo_dict): | |
| 82 """ | |
| 83 Identify junction region by IMGT definition | |
| 84 | |
| 85 Arguments: | |
| 86 ig_dict : Dictionary of parsed IgBlast output | |
| 87 repo_dict : Dictionary of IMGT gapped germline sequences | |
| 88 | |
| 89 Returns: | |
| 90 dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields | |
| 91 """ | |
| 92 # Find germline J segment | |
| 93 jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first') | |
| 94 jkey = (jgene, ) | |
| 95 #TODO: Figure out else case | |
| 96 if jkey in repo_dict: | |
| 97 # Get germline J sequence | |
| 98 jgerm = repo_dict[jkey] | |
| 99 jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1] | |
| 100 # Look for (F|W)GXG aa motif in nt sequence | |
| 101 motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm) | |
| 102 aa_end = len(ig_dict['SEQUENCE_IMGT']) | |
| 103 #TODO: Figure out else case | |
| 104 if motif: | |
| 105 # print('\n', motif.group()) | |
| 106 aa_end = motif.start() - len(jgerm) + 3 | |
| 107 # Add fields to dict | |
| 108 ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end] | |
| 109 ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION']) | |
| 110 | |
| 111 return ig_dict | |
| 112 | |
| 113 | |
| 114 def getRegions(ig_dict): | |
| 115 """ | |
| 116 Identify FWR and CDR regions by IMGT definition | |
| 117 | |
| 118 Arguments: | |
| 119 ig_dict : Dictionary of parsed alignment output | |
| 120 | |
| 121 Returns: | |
| 122 dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, | |
| 123 CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields | |
| 124 """ | |
| 125 try: | |
| 126 seq_len = len(ig_dict['SEQUENCE_IMGT']) | |
| 127 ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)] | |
| 128 except (KeyError, IndexError): | |
| 129 return ig_dict | |
| 130 | |
| 131 try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)] | |
| 132 except (IndexError): return ig_dict | |
| 133 | |
| 134 try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)] | |
| 135 except (IndexError): return ig_dict | |
| 136 | |
| 137 try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)] | |
| 138 except (IndexError): return ig_dict | |
| 139 | |
| 140 try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)] | |
| 141 except (IndexError): return ig_dict | |
| 142 | |
| 143 try: | |
| 144 cdr3_end = 306 + ig_dict['JUNCTION_LENGTH'] | |
| 145 ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end] | |
| 146 ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:] | |
| 147 except (KeyError, IndexError): | |
| 148 return ig_dict | |
| 149 | |
| 150 return ig_dict | |
| 151 | |
| 152 | |
| 153 def getSeqforIgBlast(seq_file): | |
| 154 """ | |
| 155 Fetch input sequences for IgBlast queries | |
| 156 | |
| 157 Arguments: | |
| 158 seq_file = a fasta file of sequences input to IgBlast | |
| 159 | |
| 160 Returns: | |
| 161 a dictionary of {ID:Seq} | |
| 162 """ | |
| 163 | |
| 164 seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna) | |
| 165 | |
| 166 # Build a {description: sequence} dictionary of the IgBLAST query sequences | |
| 167 seqs = {} | |
| 168 for seq in seq_dict.values(): | |
| 169 seqs.update({seq.description:str(seq.seq)}) | |
| 170 | |
| 171 return seqs | |
| 172 | |
| 173 | |
| 174 def findLine(handle, query): | |
| 175 """ | |
| 176 Finds line with query string in file | |
| 177 | |
| 178 Arguments: | |
| 179 handle = file handle in which to search for line | |
| 180 query = query string for which to search in file | |
| 181 | |
| 182 Returns: | |
| 183 line from handle in which query string was found | |
| 184 """ | |
| 185 for line in handle: | |
| 186 if(re.match(query, line)): | |
| 187 return line | |
| 188 | |
| 189 | |
| 190 def extractIMGT(imgt_output): | |
| 191 """ | |
| 192 Extract necessary files from IMGT results, zipped or unzipped | |
| 193 | |
| 194 Arguments: | |
| 195 imgt_output = zipped file or unzipped folder output by IMGT | |
| 196 | |
| 197 Returns: | |
| 198 sorted list of filenames from which information will be read | |
| 199 """ | |
| 200 #file_ext = os.path.splitext(imgt_output)[1].lower() | |
| 201 imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction') | |
| 202 temp_dir = mkdtemp() | |
| 203 if zipfile.is_zipfile(imgt_output): | |
| 204 # Open zip file | |
| 205 imgt_zip = zipfile.ZipFile(imgt_output, 'r') | |
| 206 # Extract required files | |
| 207 imgt_files = sorted([n for n in imgt_zip.namelist() \ | |
| 208 if os.path.basename(n).startswith(imgt_flags)]) | |
| 209 imgt_zip.extractall(temp_dir, imgt_files) | |
| 210 # Define file list | |
| 211 imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] | |
| 212 elif os.path.isdir(imgt_output): | |
| 213 # Find required files in folder | |
| 214 folder_files = [] | |
| 215 for root, dirs, files in os.walk(imgt_output): | |
| 216 folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files]) | |
| 217 # Define file list | |
| 218 imgt_files = sorted([n for n in folder_files \ | |
| 219 if os.path.basename(n).startswith(imgt_flags)]) | |
| 220 elif tarfile.is_tarfile(imgt_output): | |
| 221 # Open tar file | |
| 222 imgt_tar = tarfile.open(imgt_output, 'r') | |
| 223 # Extract required files | |
| 224 imgt_files = sorted([n for n in imgt_tar.getnames() \ | |
| 225 if os.path.basename(n).startswith(imgt_flags)]) | |
| 226 imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files]) | |
| 227 # Define file list | |
| 228 imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] | |
| 229 else: | |
| 230 sys.exit('ERROR: Unsupported IMGT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.') | |
| 231 | |
| 232 if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files | |
| 233 sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output) | |
| 234 elif len(imgt_files) < len(imgt_flags): | |
| 235 sys.exit('ERROR: Missing necessary file in IMGT output %s.' % imgt_output) | |
| 236 | |
| 237 return temp_dir, imgt_files | |
| 238 | |
| 239 | |
| 240 # TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks | |
| 241 def readOneIgBlastResult(block): | |
| 242 """ | |
| 243 Parse a single IgBLAST query result | |
| 244 | |
| 245 Arguments: | |
| 246 block = itertools groupby object of single result | |
| 247 | |
| 248 Returns: | |
| 249 None if no results, otherwise list of DataFrames for each result block | |
| 250 """ | |
| 251 results = list() | |
| 252 i = 0 | |
| 253 for match, subblock in groupby(block, lambda l: l=='\n'): | |
| 254 if not match: | |
| 255 # Strip whitespace and comments | |
| 256 sub = [s.strip() for s in subblock if not s.startswith('#')] | |
| 257 | |
| 258 # Continue on empty block | |
| 259 if not sub: continue | |
| 260 else: i += 1 | |
| 261 | |
| 262 # Split by tabs | |
| 263 sub = [s.split('\t') for s in sub] | |
| 264 | |
| 265 # Append list for "V-(D)-J rearrangement summary" (i == 1) | |
| 266 # And "V-(D)-J junction details" (i == 2) | |
| 267 # Otherwise append DataFrame of subblock | |
| 268 if i == 1 or i == 2: | |
| 269 results.append(sub[0]) | |
| 270 else: | |
| 271 df = pd.DataFrame(sub) | |
| 272 if not df.empty: results.append(df) | |
| 273 | |
| 274 return results if results else None | |
| 275 | |
| 276 | |
| 277 # TODO: needs more speed. pandas is probably to blame. | |
| 278 def readIgBlast(igblast_output, seq_dict, repo_dict, | |
| 279 score_fields=False, region_fields=False): | |
| 280 """ | |
| 281 Reads IgBlast output | |
| 282 | |
| 283 Arguments: | |
| 284 igblast_output = IgBlast output file (format 7) | |
| 285 seq_dict = a dictionary of {ID:Seq} from input fasta file | |
| 286 repo_dict = dictionary of IMGT gapped germline sequences | |
| 287 score_fields = if True parse alignment scores | |
| 288 region_fields = if True add FWR and CDR region fields | |
| 289 | |
| 290 Returns: | |
| 291 a generator of dictionaries containing alignment data | |
| 292 """ | |
| 293 | |
| 294 # Open IgBlast output file | |
| 295 with open(igblast_output) as f: | |
| 296 # Iterate over individual results (separated by # IGBLASTN) | |
| 297 for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)): | |
| 298 block = list(block) | |
| 299 if not k1: | |
| 300 # TODO: move query name extraction into block parser readOneIgBlastResult(). | |
| 301 # Extract sequence ID | |
| 302 query_name = ' '.join(block[0].strip().split(' ')[2:]) | |
| 303 # Initialize db_gen to have ID and input sequence | |
| 304 db_gen = {'SEQUENCE_ID': query_name, | |
| 305 'SEQUENCE_INPUT': seq_dict[query_name]} | |
| 306 | |
| 307 # Parse further sub-blocks | |
| 308 block_list = readOneIgBlastResult(block) | |
| 309 | |
| 310 # TODO: this is indented pretty far. should be a separate function. or several functions. | |
| 311 # If results exist, parse further to obtain full db_gen | |
| 312 if block_list is not None: | |
| 313 # Parse quality information | |
| 314 db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F' | |
| 315 db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F' | |
| 316 db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F' | |
| 317 if block_list[0][-1] == '-': | |
| 318 db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'], | |
| 319 IUPAC.ambiguous_dna).reverse_complement()) | |
| 320 | |
| 321 # Parse V, D, and J calls | |
| 322 call_str = ' '.join(block_list[0]) | |
| 323 v_call = parseAllele(call_str, v_allele_regex, action='list') | |
| 324 d_call = parseAllele(call_str, d_allele_regex, action='list') | |
| 325 j_call = parseAllele(call_str, j_allele_regex, action='list') | |
| 326 db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None' | |
| 327 db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None' | |
| 328 db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None' | |
| 329 | |
| 330 # Parse junction sequence | |
| 331 # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1])) | |
| 332 # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ']) | |
| 333 | |
| 334 # TODO: IgBLAST does a stupid and doesn't output block #3 sometimes. why? | |
| 335 # TODO: maybe we should fail these. they look craptastic. | |
| 336 #pd.set_option('display.width', 500) | |
| 337 #print query_name, len(block_list), hit_idx | |
| 338 #for i, x in enumerate(block_list): | |
| 339 # print '[%i]' % i | |
| 340 # print x | |
| 341 | |
| 342 # Parse segment start and stop positions | |
| 343 hit_df = block_list[-1] | |
| 344 | |
| 345 # Alignment info block | |
| 346 # 0: segment | |
| 347 # 1: query id | |
| 348 # 2: subject id | |
| 349 # 3: % identity | |
| 350 # 4: alignment length | |
| 351 # 5: mismatches | |
| 352 # 6: gap opens | |
| 353 # 7: gaps | |
| 354 # 8: q. start | |
| 355 # 9: q. end | |
| 356 # 10: s. start | |
| 357 # 11: s. end | |
| 358 # 12: evalue | |
| 359 # 13: bit score | |
| 360 # 14: query seq | |
| 361 # 15: subject seq | |
| 362 # 16: btop | |
| 363 | |
| 364 # If V call exists, parse V alignment information | |
| 365 seq_vdj = '' | |
| 366 if v_call is not None: | |
| 367 v_align = hit_df[hit_df[0] == 'V'].iloc[0] | |
| 368 # Germline positions | |
| 369 db_gen['V_GERM_START_VDJ'] = int(v_align[10]) | |
| 370 db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1 | |
| 371 # Query sequence positions | |
| 372 db_gen['V_SEQ_START'] = int(v_align[8]) | |
| 373 db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1 | |
| 374 | |
| 375 if int(v_align[6]) == 0: | |
| 376 db_gen['INDELS'] = 'F' | |
| 377 else: | |
| 378 db_gen['INDELS'] = 'T' | |
| 379 # Set functional to none so record gets tossed (junction will be wrong) | |
| 380 # db_gen['FUNCTIONAL'] = None | |
| 381 | |
| 382 # V alignment scores | |
| 383 if score_fields: | |
| 384 try: db_gen['V_SCORE'] = float(v_align[13]) | |
| 385 except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' | |
| 386 | |
| 387 try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0 | |
| 388 except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' | |
| 389 | |
| 390 try: db_gen['V_EVALUE'] = float(v_align[12]) | |
| 391 except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None' | |
| 392 | |
| 393 try: db_gen['V_BTOP'] = v_align[16] | |
| 394 except (TypeError, ValueError): db_gen['V_BTOP'] = 'None' | |
| 395 | |
| 396 # Update VDJ sequence, removing insertions | |
| 397 start = 0 | |
| 398 for m in re.finditer(r'-', v_align[15]): | |
| 399 ins = m.start() | |
| 400 seq_vdj += v_align[14][start:ins] | |
| 401 start = ins + 1 | |
| 402 seq_vdj += v_align[14][start:] | |
| 403 | |
| 404 # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. | |
| 405 # If D call exists, parse D alignment information | |
| 406 if d_call is not None: | |
| 407 d_align = hit_df[hit_df[0] == 'D'].iloc[0] | |
| 408 | |
| 409 # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. | |
| 410 # Determine N-region length and amount of D overlap with the V alignment | |
| 411 overlap = 0 | |
| 412 if v_call is not None: | |
| 413 n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) | |
| 414 if n1_len < 0: | |
| 415 db_gen['N1_LENGTH'] = 0 | |
| 416 overlap = abs(n1_len) | |
| 417 else: | |
| 418 db_gen['N1_LENGTH'] = n1_len | |
| 419 n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1) | |
| 420 n1_end = int(d_align[8])-1 | |
| 421 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] | |
| 422 | |
| 423 # Query sequence positions | |
| 424 db_gen['D_SEQ_START'] = int(d_align[8]) + overlap | |
| 425 db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0) | |
| 426 | |
| 427 # Germline positions | |
| 428 db_gen['D_GERM_START'] = int(d_align[10]) + overlap | |
| 429 db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0) | |
| 430 | |
| 431 # Update VDJ sequence, removing insertions | |
| 432 start = overlap | |
| 433 for m in re.finditer(r'-', d_align[15]): | |
| 434 ins = m.start() | |
| 435 seq_vdj += d_align[14][start:ins] | |
| 436 start = ins + 1 | |
| 437 seq_vdj += d_align[14][start:] | |
| 438 | |
| 439 # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. | |
| 440 # If J call exists, parse J alignment information | |
| 441 if j_call is not None: | |
| 442 j_align = hit_df[hit_df[0] == 'J'].iloc[0] | |
| 443 | |
| 444 # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. | |
| 445 # Determine N-region length and amount of J overlap with V or D alignment | |
| 446 overlap = 0 | |
| 447 if d_call is not None: | |
| 448 n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH']) | |
| 449 if n2_len < 0: | |
| 450 db_gen['N2_LENGTH'] = 0 | |
| 451 overlap = abs(n2_len) | |
| 452 else: | |
| 453 db_gen['N2_LENGTH'] = n2_len | |
| 454 n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1) | |
| 455 n2_end = int(j_align[8])-1 | |
| 456 seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end] | |
| 457 elif v_call is not None: | |
| 458 n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) | |
| 459 if n1_len < 0: | |
| 460 db_gen['N1_LENGTH'] = 0 | |
| 461 overlap = abs(n1_len) | |
| 462 else: | |
| 463 db_gen['N1_LENGTH'] = n1_len | |
| 464 n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1) | |
| 465 n1_end = int(j_align[8])-1 | |
| 466 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] | |
| 467 else: | |
| 468 db_gen['N1_LENGTH'] = 0 | |
| 469 | |
| 470 # Query positions | |
| 471 db_gen['J_SEQ_START'] = int(j_align[8]) + overlap | |
| 472 db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0) | |
| 473 | |
| 474 # Germline positions | |
| 475 db_gen['J_GERM_START'] = int(j_align[10]) + overlap | |
| 476 db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0) | |
| 477 | |
| 478 # J alignment scores | |
| 479 if score_fields: | |
| 480 try: db_gen['J_SCORE'] = float(j_align[13]) | |
| 481 except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' | |
| 482 | |
| 483 try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0 | |
| 484 except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' | |
| 485 | |
| 486 try: db_gen['J_EVALUE'] = float(j_align[12]) | |
| 487 except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None' | |
| 488 | |
| 489 try: db_gen['J_BTOP'] = j_align[16] | |
| 490 except (TypeError, ValueError): db_gen['J_BTOP'] = 'None' | |
| 491 | |
| 492 # Update VDJ sequence, removing insertions | |
| 493 start = overlap | |
| 494 for m in re.finditer(r'-', j_align[15]): | |
| 495 ins = m.start() | |
| 496 seq_vdj += j_align[14][start:ins] | |
| 497 start = ins + 1 | |
| 498 seq_vdj += j_align[14][start:] | |
| 499 | |
| 500 db_gen['SEQUENCE_VDJ'] = seq_vdj | |
| 501 | |
| 502 # Create IMGT-gapped sequence and infer IMGT junction | |
| 503 if v_call is not None: | |
| 504 db_gen = gapV(db_gen, repo_dict) | |
| 505 if j_call is not None: | |
| 506 db_gen = getIMGTJunc(db_gen, repo_dict) | |
| 507 | |
| 508 # FWR and CDR regions | |
| 509 if region_fields: getRegions(db_gen) | |
| 510 | |
| 511 yield IgRecord(db_gen) | |
| 512 | |
| 513 | |
| 514 # TODO: should be more readable | |
| 515 def readIMGT(imgt_files, score_fields=False, region_fields=False): | |
| 516 """ | |
| 517 Reads IMGT/HighV-Quest output | |
| 518 | |
| 519 Arguments: | |
| 520 imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6 | |
| 521 score_fields = if True parse alignment scores | |
| 522 region_fields = if True add FWR and CDR region fields | |
| 523 | |
| 524 Returns: | |
| 525 a generator of dictionaries containing alignment data | |
| 526 """ | |
| 527 imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files] | |
| 528 # Create a dictionary for each sequence alignment and yield its generator | |
| 529 for sm, gp, nt, jn in zip(*imgt_iters): | |
| 530 if len(set([sm['Sequence ID'], | |
| 531 gp['Sequence ID'], | |
| 532 nt['Sequence ID'], | |
| 533 jn['Sequence ID']])) != 1: | |
| 534 sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \ | |
| 535 % sm['Sequence ID']) | |
| 536 | |
| 537 db_gen = {'SEQUENCE_ID': sm['Sequence ID'], | |
| 538 'SEQUENCE_INPUT': sm['Sequence']} | |
| 539 | |
| 540 if 'No results' not in sm['Functionality']: | |
| 541 db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) + | |
| 542 ('unprod' in sm['Functionality'])] | |
| 543 db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) + | |
| 544 ('out-of-frame' in sm['JUNCTION frame'])] | |
| 545 db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) + | |
| 546 ('unprod' in sm['Functionality'])] | |
| 547 db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'], | |
| 548 'missing' in sm['V-REGION potential ins/del']))) + | |
| 549 ('unprod' in sm['Functionality'])] | |
| 550 db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'], | |
| 551 sm['V-REGION insertions'], | |
| 552 sm['V-REGION deletions']))] | |
| 553 | |
| 554 db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION'] | |
| 555 db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION'] | |
| 556 | |
| 557 db_gen['V_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['V-GENE and allele'])) | |
| 558 db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele'])) | |
| 559 db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele'])) | |
| 560 | |
| 561 v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0 | |
| 562 db_gen['V_SEQ_START'] = nt['V-REGION start'] | |
| 563 db_gen['V_SEQ_LENGTH'] = v_seq_length | |
| 564 db_gen['V_GERM_START_IMGT'] = 1 | |
| 565 db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0 | |
| 566 | |
| 567 db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"], | |
| 568 jn['N-REGION-nt nb'], | |
| 569 jn['N1-REGION-nt nb'], | |
| 570 jn["P5'D-nt nb"]] if i) | |
| 571 db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length, | |
| 572 jn["P3'V-nt nb"], | |
| 573 jn['N-REGION-nt nb'], | |
| 574 jn['N1-REGION-nt nb'], | |
| 575 jn["P5'D-nt nb"]] if i) | |
| 576 db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) | |
| 577 db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1 | |
| 578 db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) | |
| 579 db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"], | |
| 580 jn['N2-REGION-nt nb'], | |
| 581 jn["P5'J-nt nb"]] if i) | |
| 582 | |
| 583 db_gen['J_SEQ_START'] = sum(int(i) for i in [1, v_seq_length, | |
| 584 jn["P3'V-nt nb"], | |
| 585 jn['N-REGION-nt nb'], | |
| 586 jn['N1-REGION-nt nb'], | |
| 587 jn["P5'D-nt nb"], | |
| 588 jn["D-REGION-nt nb"], | |
| 589 jn["P3'D-nt nb"], | |
| 590 jn['N2-REGION-nt nb'], | |
| 591 jn["P5'J-nt nb"]] if i) | |
| 592 db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0 | |
| 593 db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1 | |
| 594 db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0 | |
| 595 | |
| 596 db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0 | |
| 597 db_gen['JUNCTION'] = jn['JUNCTION'] | |
| 598 | |
| 599 # Alignment scores | |
| 600 if score_fields: | |
| 601 try: db_gen['V_SCORE'] = float(sm['V-REGION score']) | |
| 602 except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' | |
| 603 | |
| 604 try: db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0 | |
| 605 except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' | |
| 606 | |
| 607 try: db_gen['J_SCORE'] = float(sm['J-REGION score']) | |
| 608 except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' | |
| 609 | |
| 610 try: db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0 | |
| 611 except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' | |
| 612 | |
| 613 # FWR and CDR regions | |
| 614 if region_fields: getRegions(db_gen) | |
| 615 else: | |
| 616 db_gen['V_CALL'] = 'None' | |
| 617 db_gen['D_CALL'] = 'None' | |
| 618 db_gen['J_CALL'] = 'None' | |
| 619 | |
| 620 yield IgRecord(db_gen) | |
| 621 | |
| 622 | |
| 623 def getIDforIMGT(seq_file): | |
| 624 """ | |
| 625 Create a sequence ID translation using IMGT truncation | |
| 626 | |
| 627 Arguments: | |
| 628 seq_file = a fasta file of sequences input to IMGT | |
| 629 | |
| 630 Returns: | |
| 631 a dictionary of {truncated ID: full seq description} | |
| 632 """ | |
| 633 | |
| 634 # Map IMGT-truncated IDs (50 characters, special characters replaced) to full descriptions | |
| 635 ids = {} | |
| 636 for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)): | |
| 637 if len(rec.description) <= 50: | |
| 638 id_key = rec.description | |
| 639 else: | |
| 640 id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50]) | |
| 641 ids.update({id_key:rec.description}) | |
| 642 | |
| 643 return ids | |
| 644 | |
| 645 | |
| 646 def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True, | |
| 647 score_fields=False, region_fields=False, out_args=default_out_args): | |
| 648 """ | |
| 649 Writes tab-delimited database file in output directory | |
| 650 | |
| 651 Arguments: | |
| 652 db_gen = a generator of IgRecord objects containing alignment data | |
| 653 file_prefix = directory and prefix for CLIP tab-delim file | |
| 654 total_count = number of records (for progress bar) | |
| 655 id_dict = a dictionary of {IMGT ID: full seq description} | |
| 656 no_parse = if ID is to be parsed for pRESTO output with default delimiters | |
| 657 score_fields = if True add alignment score fields to output file | |
| 658 region_fields = if True add FWR and CDR region fields to output file | |
| 659 out_args = common output argument dictionary from parseCommonArgs | |
| 660 | |
| 661 Returns: | |
| 662 None | |
| 663 """ | |
| 664 pass_file = "%s_db-pass.tab" % file_prefix | |
| 665 fail_file = "%s_db-fail.tab" % file_prefix | |
| 666 ordered_fields = ['SEQUENCE_ID', | |
| 667 'SEQUENCE_INPUT', | |
| 668 'FUNCTIONAL', | |
| 669 'IN_FRAME', | |
| 670 'STOP', | |
| 671 'MUTATED_INVARIANT', | |
| 672 'INDELS', | |
| 673 'V_CALL', | |
| 674 'D_CALL', | |
| 675 'J_CALL', | |
| 676 'SEQUENCE_VDJ', | |
| 677 'SEQUENCE_IMGT', | |
| 678 'V_SEQ_START', | |
| 679 'V_SEQ_LENGTH', | |
| 680 'V_GERM_START_VDJ', | |
| 681 'V_GERM_LENGTH_VDJ', | |
| 682 'V_GERM_START_IMGT', | |
| 683 'V_GERM_LENGTH_IMGT', | |
| 684 'N1_LENGTH', | |
| 685 'D_SEQ_START', | |
| 686 'D_SEQ_LENGTH', | |
| 687 'D_GERM_START', | |
| 688 'D_GERM_LENGTH', | |
| 689 'N2_LENGTH', | |
| 690 'J_SEQ_START', | |
| 691 'J_SEQ_LENGTH', | |
| 692 'J_GERM_START', | |
| 693 'J_GERM_LENGTH', | |
| 694 'JUNCTION_LENGTH', | |
| 695 'JUNCTION'] | |
| 696 | |
| 697 if score_fields: | |
| 698 ordered_fields.extend(['V_SCORE', | |
| 699 'V_IDENTITY', | |
| 700 'V_EVALUE', | |
| 701 'V_BTOP', | |
| 702 'J_SCORE', | |
| 703 'J_IDENTITY', | |
| 704 'J_EVALUE', | |
| 705 'J_BTOP']) | |
| 706 | |
| 707 if region_fields: | |
| 708 ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT', | |
| 709 'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT']) | |
| 710 | |
| 711 | |
| 712 # TODO: This is not the best approach. should pass in output fields. | |
| 713 # Initiate passed handle | |
| 714 pass_handle = None | |
| 715 | |
| 716 # Open failed file | |
| 717 if out_args['failed']: | |
| 718 fail_handle = open(fail_file, 'wt') | |
| 719 fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT']) | |
| 720 else: | |
| 721 fail_handle = None | |
| 722 fail_writer = None | |
| 723 | |
| 724 # Initialize counters and file | |
| 725 pass_writer = None | |
| 726 start_time = time() | |
| 727 rec_count = pass_count = fail_count = 0 | |
| 728 for record in db_gen: | |
| 729 #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) | |
| 730 printProgress(rec_count, total_count, 0.05, start_time) | |
| 731 rec_count += 1 | |
| 732 | |
| 733 # Count pass or fail | |
| 734 if (record.v_call == 'None' and record.j_call == 'None') or \ | |
| 735 record.functional is None or \ | |
| 736 not record.seq_vdj or \ | |
| 737 not record.junction: | |
| 738 # print(record.v_call, record.j_call, record.functional, record.junction) | |
| 739 fail_count += 1 | |
| 740 if fail_writer is not None: fail_writer.writerow(record.toDict()) | |
| 741 continue | |
| 742 else: | |
| 743 pass_count += 1 | |
| 744 | |
| 745 # Build sample sequence description | |
| 746 if record.id in id_dict: | |
| 747 record.id = id_dict[record.id] | |
| 748 | |
| 749 # Parse sequence description into new columns | |
| 750 if not no_parse: | |
| 751 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) | |
| 752 record.id = record.annotations['ID'] | |
| 753 del record.annotations['ID'] | |
| 754 | |
| 755 # TODO: This is not the best approach. should pass in output fields. | |
| 756 # If first sequence, use parsed description to create new columns and initialize writer | |
| 757 if pass_writer is None: | |
| 758 if not no_parse: ordered_fields.extend(list(record.annotations.keys())) | |
| 759 pass_handle = open(pass_file, 'wt') | |
| 760 pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields) | |
| 761 | |
| 762 # Write row to tab-delim CLIP file | |
| 763 pass_writer.writerow(record.toDict()) | |
| 764 | |
| 765 # Print log | |
| 766 #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) | |
| 767 printProgress(rec_count, total_count, 0.05, start_time) | |
| 768 | |
| 769 log = OrderedDict() | |
| 770 log['OUTPUT'] = pass_file | |
| 771 log['PASS'] = pass_count | |
| 772 log['FAIL'] = fail_count | |
| 773 log['END'] = 'MakeDb' | |
| 774 printLog(log) | |
| 775 | |
| 776 if pass_handle is not None: pass_handle.close() | |
| 777 if fail_handle is not None: fail_handle.close() | |
| 778 | |
| 779 | |
| 780 # TODO: may be able to merge with parseIMGT | |
| 781 def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False, | |
| 782 region_fields=False, out_args=default_out_args): | |
| 783 """ | |
| 784 Main for IgBlast aligned sample sequences | |
| 785 | |
| 786 Arguments: | |
| 787 igblast_output = IgBlast output file to process | |
| 788 seq_file = fasta file input to IgBlast (from which to get sequence) | |
| 789 repo = folder with germline repertoire files | |
| 790 no_parse = if ID is to be parsed for pRESTO output with default delimiters | |
| 791 score_fields = if True add alignment score fields to output file | |
| 792 region_fields = if True add FWR and CDR region fields to output file | |
| 793 out_args = common output argument dictionary from parseCommonArgs | |
| 794 | |
| 795 Returns: | |
| 796 None | |
| 797 """ | |
| 798 # Print parameter info | |
| 799 log = OrderedDict() | |
| 800 log['START'] = 'MakeDb' | |
| 801 log['ALIGNER'] = 'IgBlast' | |
| 802 log['ALIGN_RESULTS'] = os.path.basename(igblast_output) | |
| 803 log['SEQ_FILE'] = os.path.basename(seq_file) | |
| 804 log['NO_PARSE'] = no_parse | |
| 805 log['SCORE_FIELDS'] = score_fields | |
| 806 log['REGION_FIELDS'] = region_fields | |
| 807 printLog(log) | |
| 808 | |
| 809 # Get input sequence dictionary | |
| 810 seq_dict = getSeqforIgBlast(seq_file) | |
| 811 | |
| 812 # Formalize out_dir and file-prefix | |
| 813 if not out_args['out_dir']: | |
| 814 out_dir = os.path.split(igblast_output)[0] | |
| 815 else: | |
| 816 out_dir = os.path.abspath(out_args['out_dir']) | |
| 817 if not os.path.exists(out_dir): os.mkdir(out_dir) | |
| 818 if out_args['out_name']: | |
| 819 file_prefix = out_args['out_name'] | |
| 820 else: | |
| 821 file_prefix = os.path.basename(os.path.splitext(igblast_output)[0]) | |
| 822 file_prefix = os.path.join(out_dir, file_prefix) | |
| 823 | |
| 824 total_count = countSeqFile(seq_file) | |
| 825 | |
| 826 # Build the germline repertoire dictionary, parse IgBLAST output, and write the database | |
| 827 repo_dict = getRepo(repo) | |
| 828 igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict, | |
| 829 score_fields=score_fields, region_fields=region_fields) | |
| 830 writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse, | |
| 831 score_fields=score_fields, region_fields=region_fields, out_args=out_args) | |
| 832 | |
| 833 | |
| 834 # TODO: may be able to merge with parseIgBlast | |
| 835 def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False, | |
| 836 region_fields=False, out_args=default_out_args): | |
| 837 """ | |
| 838 Main for IMGT aligned sample sequences | |
| 839 | |
| 840 Arguments: | |
| 841 imgt_output = zipped file or unzipped folder output by IMGT | |
| 842 seq_file = FASTA file input to IMGT (from which to get seqID) | |
| 843 no_parse = if ID is to be parsed for pRESTO output with default delimiters | |
| 844 score_fields = if True add alignment score fields to output file | |
| 845 region_fields = if True add FWR and CDR region fields to output file | |
| 846 out_args = common output argument dictionary from parseCommonArgs | |
| 847 | |
| 848 Returns: | |
| 849 None | |
| 850 """ | |
| 851 # Print parameter info | |
| 852 log = OrderedDict() | |
| 853 log['START'] = 'MakeDb' | |
| 854 log['ALIGNER'] = 'IMGT' | |
| 855 log['ALIGN_RESULTS'] = imgt_output | |
| 856 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' | |
| 857 log['NO_PARSE'] = no_parse | |
| 858 log['SCORE_FIELDS'] = score_fields | |
| 859 log['REGION_FIELDS'] = region_fields | |
| 860 printLog(log) | |
| 861 | |
| 862 # Get individual IMGT result files | |
| 863 temp_dir, imgt_files = extractIMGT(imgt_output) | |
| 864 | |
| 865 # Formalize out_dir and file-prefix | |
| 866 if not out_args['out_dir']: | |
| 867 out_dir = os.path.dirname(os.path.abspath(imgt_output)) | |
| 868 else: | |
| 869 out_dir = os.path.abspath(out_args['out_dir']) | |
| 870 if not os.path.exists(out_dir): os.mkdir(out_dir) | |
| 871 if out_args['out_name']: | |
| 872 file_prefix = out_args['out_name'] | |
| 873 else: | |
| 874 file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0] | |
| 875 file_prefix = os.path.join(out_dir, file_prefix) | |
| 876 | |
| 877 total_count = countDbFile(imgt_files[0]) | |
| 878 | |
| 879 # Get (parsed) IDs from fasta file submitted to IMGT | |
| 880 id_dict = getIDforIMGT(seq_file) if seq_file else {} | |
| 881 | |
| 882 # Parse IMGT output and write the database | |
| 883 imgt_dict = readIMGT(imgt_files, score_fields=score_fields, | |
| 884 region_fields=region_fields) | |
| 885 writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse, | |
| 886 score_fields=score_fields, region_fields=region_fields, out_args=out_args) | |
| 887 | |
| 888 # Delete temp directory | |
| 889 rmtree(temp_dir) | |
| 890 | |
| 891 | |
| 892 def getArgParser(): | |
| 893 """ | |
| 894 Defines the ArgumentParser | |
| 895 | |
| 896 Arguments: | |
| 897 None | |
| 898 | |
| 899 Returns: | |
| 900 an ArgumentParser object | |
| 901 """ | |
| 902 fields = dedent( | |
| 903 ''' | |
| 904 output files: | |
| 905 db-pass | |
| 906 database of parsed alignment records. | |
| 907 db-fail | |
| 908 database with records failing alignment. | |
| 909 | |
| 910 output fields: | |
| 911 SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, | |
| 912 INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT, | |
| 913 V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT, | |
| 914 V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH, | |
| 915 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH, | |
| 916 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH, | |
| 917 JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP, | |
| 918 J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, | |
| 919 FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT | |
| 920 ''') | |
| 921 | |
| 922 # Define ArgumentParser | |
| 923 parser = ArgumentParser(description=__doc__, epilog=fields, | |
| 924 formatter_class=CommonHelpFormatter) | |
| 925 parser.add_argument('--version', action='version', | |
| 926 version='%(prog)s:' + ' %s-%s' %(__version__, __date__)) | |
| 927 subparsers = parser.add_subparsers(title='subcommands', dest='command', | |
| 928 help='Aligner used', metavar='') | |
| 929 # TODO: This is a temporary fix for Python issue 9253 | |
| 930 subparsers.required = True | |
| 931 | |
| 932 # Parent parser | |
| 933 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False) | |
| 934 | |
| 935 # IgBlast Aligner | |
| 936 parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output', | |
| 937 parents=[parser_parent], | |
| 938 formatter_class=CommonHelpFormatter) | |
| 939 parser_igblast.set_defaults(func=parseIgBlast) | |
| 940 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files', | |
| 941 required=True, | |
| 942 help='''IgBLAST output files in format 7 with query sequence | |
| 943 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''') | |
| 944 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | |
| 945 help='''List of folders and/or fasta files containing | |
| 946 IMGT-gapped germline sequences corresponding to the | |
| 947 set of germlines used in the IgBLAST alignment.''') | |
| 948 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', | |
| 949 required=True, | |
| 950 help='List of input FASTA files containing sequences') | |
| 951 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse', | |
| 952 help='''Specify if input IDs should not be parsed to add | |
| 953 new columns to database.''') | |
| 954 parser_igblast.add_argument('--scores', action='store_true', dest='score_fields', | |
| 955 help='''Specify if alignment score metrics should be | |
| 956 included in the output. Adds the V_SCORE, V_IDENTITY, | |
| 957 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY, | |
| 958 J_BTOP, and J_EVALUE columns.''') | |
| 959 parser_igblast.add_argument('--regions', action='store_true', dest='region_fields', | |
| 960 help='''Specify if IMGT framework and CDR regions should be | |
| 961 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | |
| 962 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | |
| 963 CDR3_IMGT columns.''') | |
| 964 | |
| 965 # IMGT aligner | |
| 966 parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output', | |
| 967 parents=[parser_parent], | |
| 968 formatter_class=CommonHelpFormatter) | |
| 969 imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True) | |
| 970 imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files', | |
| 971 help='''Either zipped IMGT output files (.zip) or a folder | |
| 972 containing unzipped IMGT output files (which must | |
| 973 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, | |
| 974 and 6_Junction).''') | |
| 975 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', | |
| 976 required=False, | |
| 977 help='List of input FASTA files containing sequences') | |
| 978 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', | |
| 979 help='''Specify if input IDs should not be parsed to add new | |
| 980 columns to database.''') | |
| 981 parser_imgt.add_argument('--scores', action='store_true', dest='score_fields', | |
| 982 help='''Specify if alignment score metrics should be | |
| 983 included in the output. Adds the V_SCORE, V_IDENTITY, | |
| 984 included in the output. Adds the V_SCORE, V_IDENTITY, | |
| 985 J_SCORE, and J_IDENTITY columns. Note, this will also add | |
| 986 but they will be empty for IMGT output.''') | |
| 987 parser_imgt.add_argument('--regions', action='store_true', dest='region_fields', | |
| 988 help='''Specify if IMGT framework and CDR regions should be | |
| 989 included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | |
| 990 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | |
| 991 CDR3_IMGT columns.''') | |
| 992 parser_imgt.set_defaults(func=parseIMGT) | |
| 993 | |
| 994 return parser | |
| 995 | |
| 996 | |
| 997 if __name__ == "__main__": | |
| 998 """ | |
| 999 Parses command line arguments and calls main | |
| 1000 """ | |
| 1001 parser = getArgParser() | |
| 1002 args = parser.parse_args() | |
| 1003 args_dict = parseCommonArgs(args, in_arg='aligner_files') | |
| 1004 | |
| 1005 # Set no ID parsing if sequence files are not provided | |
| 1006 if 'seq_files' in args_dict and not args_dict['seq_files']: | |
| 1007 args_dict['no_parse'] = True | |
| 1008 | |
| 1009 # Remove arguments that are handled individually below rather than passed through args_dict | |
| 1010 if 'seq_files' in args_dict: del args_dict['seq_files'] | |
| 1011 if 'aligner_files' in args_dict: del args_dict['aligner_files'] | |
| 1012 if 'command' in args_dict: del args_dict['command'] | |
| 1013 if 'func' in args_dict: del args_dict['func'] | |
| 1014 | |
| 1015 if args.command == 'imgt': | |
| 1016 for i in range(len(args.__dict__['aligner_files'])): | |
| 1017 args_dict['imgt_output'] = args.__dict__['aligner_files'][i] | |
| 1018 args_dict['seq_file'] = args.__dict__['seq_files'][i] \ | |
| 1019 if args.__dict__['seq_files'] else None | |
| 1020 args.func(**args_dict) | |
| 1021 elif args.command == 'igblast': | |
| 1022 for i in range(len(args.__dict__['aligner_files'])): | |
| 1023 args_dict['igblast_output'] = args.__dict__['aligner_files'][i] | |
| 1024 args_dict['seq_file'] = args.__dict__['seq_files'][i] | |
| 1025 args.func(**args_dict) |
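
A few standalone sketches follow to illustrate mechanisms used in MakeDb.py above. They are minimal illustrations with hypothetical inputs, not part of the uploaded file. The first mirrors the core of gapV(): gap positions ('.') found in the IMGT-gapped germline are inserted into the ungapped query sequence at the same offsets.

```python
import re

# Hypothetical IMGT-gapped germline V fragment and an ungapped query that
# aligns to it from germline position 1.
germ_gapped = 'CAGGTG...CAGCTG'
query = 'CAGGTGCAGCTG'

seq_imgt = query
for gap in re.finditer(r'\.', germ_gapped):
    i = gap.start()
    if i >= len(seq_imgt):   # simplified bound; gapV() limits this by V_GERM_LENGTH_VDJ
        break
    # Copy the germline gap into the query at the same position
    seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]

print(seq_imgt)              # CAGGTG...CAGCTG
```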
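
getRegions() slices the IMGT-gapped sequence at fixed nucleotide positions. A sketch of those boundaries, with the codon ranges they correspond to in the IMGT unique numbering:

```python
# Fixed IMGT boundaries used by getRegions(), as (start, end) slices on an
# IMGT-gapped nucleotide sequence.
IMGT_REGIONS = {
    'FWR1_IMGT': (0, 78),     # codons 1-26
    'CDR1_IMGT': (78, 114),   # codons 27-38
    'FWR2_IMGT': (114, 165),  # codons 39-55
    'CDR2_IMGT': (165, 195),  # codons 56-65
    'FWR3_IMGT': (195, 312),  # codons 66-104
}

def slice_regions(seq_imgt):
    """Return the fixed-boundary regions of an IMGT-gapped sequence."""
    n = len(seq_imgt)
    return {name: seq_imgt[start:min(end, n)]
            for name, (start, end) in IMGT_REGIONS.items()}

# With a 120 nt input, FWR1 and CDR1 are complete, FWR2 is cut short,
# and CDR2 and FWR3 come back empty, as getRegions() would also produce
# for a short sequence.
print(slice_regions('N' * 120))
```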
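
getIMGTJunc() locates the end of the junction by searching the germline J nucleotide sequence for the conserved (F/W)-G-X-G motif. A minimal sketch with a hypothetical IGHJ-like fragment:

```python
import re

jgerm = 'ACTACTTTGACTACTGGGGCCAGGGAACC'   # hypothetical IGHJ-like germline fragment

# Same pattern as getIMGTJunc(): TTT/TTC (Phe) or TGG (Trp), then Gly, any codon, Gly
motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm)
if motif:
    print(motif.start(), motif.group())   # 14 TGGGGCCAGGGA  (Trp-Gly-Gln-Gly)
```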
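
readOneIgBlastResult() relies on itertools.groupby to split one query's IgBLAST format-7 block into sub-blocks at blank lines, discarding '#' comment lines. A toy version of that pattern:

```python
from itertools import groupby

# Abbreviated, hypothetical fragment of one query's IgBLAST format-7 output
block = ['# V-(D)-J rearrangement summary for query sequence\n',
         'IGHV1-2*02\tIGHD3-10*01\tIGHJ4*02\tVH\tNo\tIn-frame\tYes\t+\n',
         '\n',
         '# Hit table\n',
         'V\tquery1\tIGHV1-2*02\t98.6\n',
         'J\tquery1\tIGHJ4*02\t100.0\n']

for is_blank, subblock in groupby(block, lambda l: l == '\n'):
    if not is_blank:
        # Strip whitespace and comment lines, then split fields on tabs
        sub = [s.strip() for s in subblock if not s.startswith('#')]
        print([row.split('\t') for row in sub])
```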
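
readIMGT() opens the four IMGT/HighV-QUEST tables with csv.DictReader and walks them in lockstep with zip(), checking that the 'Sequence ID' column stays in sync. A self-contained sketch with two toy tables standing in for the real files:

```python
import csv
import io

summary = io.StringIO('Sequence ID\tFunctionality\n'
                      'seq1\tproductive\n'
                      'seq2\tunproductive (see comment)\n')
junction = io.StringIO('Sequence ID\tJUNCTION\n'
                       'seq1\tTGTGCGAGATGG\n'
                       'seq2\t\n')

for sm, jn in zip(csv.DictReader(summary, delimiter='\t'),
                  csv.DictReader(junction, delimiter='\t')):
    # readIMGT() calls sys.exit() instead if the IDs ever disagree
    assert sm['Sequence ID'] == jn['Sequence ID']
    print(sm['Sequence ID'], sm['Functionality'], len(jn['JUNCTION'] or ''))
```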
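
getIDforIMGT() anticipates how IMGT/HighV-QUEST shortens long FASTA descriptions: descriptions of 50 characters or fewer are kept as-is, longer ones are truncated to 50 characters with characters such as '|' and spaces replaced by underscores, so the shortened IDs in the IMGT output can be mapped back to the originals. A small sketch of that rule (the description below is made up):

```python
import re

def imgt_id(description):
    """Reproduce the ID truncation applied in getIDforIMGT()."""
    if len(description) <= 50:
        return description
    return re.sub(r'\||\s|!|&|\*|<|>|\?', '_', description[:50])

desc = 'seq42|SAMPLE=PBMC donor 7|PRCONS=IGHM|DUPCOUNT=13 extra trailing text'
print(imgt_id(desc))   # seq42_SAMPLE=PBMC_donor_7_PRCONS=IGHM_DUPCOUNT=13_
```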
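
Finally, based on the subcommands and options defined in getArgParser(), invocations of this script would look roughly like `MakeDb.py igblast -i results.fmt7 -r germlines/ -s input.fasta --scores --regions` or `MakeDb.py imgt -i results.zip -s input.fasta --regions`, where the file and folder names are placeholders.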
