| 120 | 1 #!/usr/bin/env python3 | 
|  | 2 """ | 
|  | 3 Create tab-delimited database file to store sequence alignment information | 
|  | 4 """ | 
|  | 5 # Info | 
|  | 6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden' | 
|  | 7 from changeo import __version__, __date__ | 
|  | 8 | 
|  | 9 # Imports | 
|  | 10 import csv | 
|  | 11 import os | 
|  | 12 import re | 
|  | 13 import sys | 
|  | 14 import pandas as pd | 
|  | 15 import tarfile | 
|  | 16 import zipfile | 
|  | 17 from argparse import ArgumentParser | 
|  | 18 from collections import OrderedDict | 
|  | 19 from itertools import groupby | 
|  | 20 from shutil import rmtree | 
|  | 21 from tempfile import mkdtemp | 
|  | 22 from textwrap import dedent | 
|  | 23 from time import time | 
|  | 24 from Bio import SeqIO | 
|  | 25 from Bio.Seq import Seq | 
|  | 26 from Bio.Alphabet import IUPAC | 
|  | 27 | 
|  | 28 # Presto and changeo imports | 
|  | 29 from presto.Defaults import default_out_args | 
|  | 30 from presto.Annotation import parseAnnotation | 
|  | 31 from presto.IO import countSeqFile, printLog, printProgress | 
|  | 32 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs | 
|  | 33 from changeo.IO import getDbWriter, countDbFile, getRepo | 
|  | 34 from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \ | 
|  | 35                              j_allele_regex | 
|  | 36 | 
|  | 37 # Default parameters | 
|  | 38 default_delimiter = ('\t', ',', '-') | 
|  | 39 | 
|  | 40 | 
|  | 41 def gapV(ig_dict, repo_dict): | 
|  | 42     """ | 
|  | 43     Insert gaps into V region and update alignment information | 
|  | 44 | 
|  | 45     Arguments: | 
|  | 46       ig_dict : Dictionary of parsed IgBlast output | 
|  | 47       repo_dict : Dictionary of IMGT gapped germline sequences | 
|  | 48 | 
|  | 49     Returns: | 
|  | 50       dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields | 
|  | 51     """ | 
|  | 52 | 
|  | 53     seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ'] | 
|  | 54 | 
|  | 55     # Find gapped germline V segment | 
|  | 56     vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first') | 
|  | 57     vkey = (vgene, ) | 
|  | 58     #TODO: Figure out else case | 
|  | 59     if vkey in repo_dict: | 
|  | 60         vgap = repo_dict[vkey] | 
|  | 61         # Iterate over gaps in the germline segment | 
|  | 62         gaps = re.finditer(r'\.', vgap) | 
|  | 63         gapcount = int(ig_dict['V_GERM_START_VDJ'])-1 | 
|  | 64         for gap in gaps: | 
|  | 65             i = gap.start() | 
|  | 66             # Break if gap begins after V region | 
|  | 67             if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount: | 
|  | 68                 break | 
|  | 69             # Insert gap into IMGT sequence | 
|  | 70             seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:] | 
|  | 71             # Update gap counter | 
|  | 72             gapcount += 1 | 
|  | 73         ig_dict['SEQUENCE_IMGT'] = seq_imgt | 
|  | 74         # Update IMGT positioning information for V | 
|  | 75         ig_dict['V_GERM_START_IMGT'] = 1 | 
|  | 76         ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount | 
|  | 77 | 
|  | 78     return ig_dict | 
|  | 79 | 
|  | 80 | 
|  | 81 def getIMGTJunc(ig_dict, repo_dict): | 
|  | 82     """ | 
|  | 83     Identify junction region by IMGT definition | 
|  | 84 | 
|  | 85     Arguments: | 
|  | 86       ig_dict : Dictionary of parsed IgBlast output | 
|  | 87       repo_dict : Dictionary of IMGT gapped germline sequences | 
|  | 88 | 
|  | 89     Returns: | 
|  | 90       dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields | 
|  | 91     """ | 
|  | 92     # Find germline J segment | 
|  | 93     jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first') | 
|  | 94     jkey = (jgene, ) | 
|  | 95     #TODO: Figure out else case | 
|  | 96     if jkey in repo_dict: | 
|  | 97         # Get germline J sequence | 
|  | 98         jgerm = repo_dict[jkey] | 
|  | 99         jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1] | 
|  | 100         # Look for (F|W)GXG aa motif in nt sequence | 
|  | 101         motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm) | 
|  | 102         aa_end = len(ig_dict['SEQUENCE_IMGT']) | 
|  | 103         #TODO: Figure out else case | 
|  | 104         if motif: | 
|  | 105             # print('\n', motif.group()) | 
|  | 106             aa_end = motif.start() - len(jgerm) + 3 | 
|  | 107         # Add fields to dict | 
|  | 108         ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end] | 
|  | 109         ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION']) | 
|  | 110 | 
|  | 111     return ig_dict | 
|  | 112 | 
|  | 113 | 
|  | 114 def getRegions(ig_dict): | 
|  | 115     """ | 
|  | 116     Identify FWR and CDR regions by IMGT definition | 
|  | 117 | 
|  | 118     Arguments: | 
|  | 119       ig_dict : Dictionary of parsed alignment output | 
|  | 120 | 
|  | 121     Returns: | 
|  | 122       dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, | 
|  | 123              CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields | 
|  | 124     """ | 
|  | 125     try: | 
|  | 126         seq_len = len(ig_dict['SEQUENCE_IMGT']) | 
|  | 127         ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)] | 
|  | 128     except (KeyError, IndexError): | 
|  | 129         return ig_dict | 
|  | 130 | 
|  | 131     try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)] | 
|  | 132     except (IndexError): return ig_dict | 
|  | 133 | 
|  | 134     try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)] | 
|  | 135     except (IndexError): return ig_dict | 
|  | 136 | 
|  | 137     try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)] | 
|  | 138     except (IndexError): return ig_dict | 
|  | 139 | 
|  | 140     try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)] | 
|  | 141     except (IndexError): return ig_dict | 
|  | 142 | 
|  | 143     try: | 
|  | 144         cdr3_end = 306 + ig_dict['JUNCTION_LENGTH'] | 
|  | 145         ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end] | 
|  | 146         ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:] | 
|  | 147     except (KeyError, IndexError): | 
|  | 148         return ig_dict | 
|  | 149 | 
|  | 150     return ig_dict | 
|  | 151 | 
|  | 152 | 
|  | 153 def getSeqforIgBlast(seq_file): | 
|  | 154     """ | 
|  | 155     Fetch input sequences for IgBlast queries | 
|  | 156 | 
|  | 157     Arguments: | 
|  | 158     seq_file = a fasta file of sequences input to IgBlast | 
|  | 159 | 
|  | 160     Returns: | 
|  | 161     a dictionary of {ID:Seq} | 
|  | 162     """ | 
|  | 163 | 
|  | 164     seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna) | 
|  | 165 | 
|  | 166     # Create a seq_dict ID translation using IDs truncate up to space or 50 chars | 
|  | 167     seqs = {} | 
|  | 168     for seq in seq_dict.values(): | 
|  | 169         seqs.update({seq.description:str(seq.seq)}) | 
|  | 170 | 
|  | 171     return seqs | 
|  | 172 | 
|  | 173 | 
|  | 174 def findLine(handle, query): | 
|  | 175     """ | 
|  | 176     Finds line with query string in file | 
|  | 177 | 
|  | 178     Arguments: | 
|  | 179     handle = file handle in which to search for line | 
|  | 180     query = query string for which to search in file | 
|  | 181 | 
|  | 182     Returns: | 
|  | 183     line from handle in which query string was found | 
|  | 184     """ | 
|  | 185     for line in handle: | 
|  | 186         if(re.match(query, line)): | 
|  | 187             return line | 
|  | 188 | 
|  | 189 | 
|  | 190 def extractIMGT(imgt_output): | 
|  | 191     """ | 
|  | 192     Extract necessary files from IMGT results, zipped or unzipped | 
|  | 193 | 
|  | 194     Arguments: | 
|  | 195     imgt_output = zipped file or unzipped folder output by IMGT | 
|  | 196 | 
|  | 197     Returns: | 
|  | 198     sorted list of filenames from which information will be read | 
|  | 199     """ | 
|  | 200     #file_ext = os.path.splitext(imgt_output)[1].lower() | 
|  | 201     imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction') | 
|  | 202     temp_dir = mkdtemp() | 
|  | 203     if zipfile.is_zipfile(imgt_output): | 
|  | 204         # Open zip file | 
|  | 205         imgt_zip = zipfile.ZipFile(imgt_output, 'r') | 
|  | 206         # Extract required files | 
|  | 207         imgt_files = sorted([n for n in imgt_zip.namelist() \ | 
|  | 208                              if os.path.basename(n).startswith(imgt_flags)]) | 
|  | 209         imgt_zip.extractall(temp_dir, imgt_files) | 
|  | 210         # Define file list | 
|  | 211         imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] | 
|  | 212     elif os.path.isdir(imgt_output): | 
|  | 213         # Find required files in folder | 
|  | 214         folder_files = [] | 
|  | 215         for root, dirs, files in os.walk(imgt_output): | 
|  | 216             folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files]) | 
|  | 217         # Define file list | 
|  | 218         imgt_files = sorted([n for n in folder_files \ | 
|  | 219                              if os.path.basename(n).startswith(imgt_flags)]) | 
|  | 220     elif tarfile.is_tarfile(imgt_output): | 
|  | 221         # Open zip file | 
|  | 222         imgt_tar = tarfile.open(imgt_output, 'r') | 
|  | 223         # Extract required files | 
|  | 224         imgt_files = sorted([n for n in imgt_tar.getnames() \ | 
|  | 225                              if os.path.basename(n).startswith(imgt_flags)]) | 
|  | 226         imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files]) | 
|  | 227         # Define file list | 
|  | 228         imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] | 
|  | 229     else: | 
|  | 230         sys.exit('ERROR: Unsupported IGMT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.') | 
|  | 231 | 
|  | 232     if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files | 
|  | 233         sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output) | 
|  | 234     elif len(imgt_files) < len(imgt_flags): | 
|  | 235         sys.exit('ERROR: Missing necessary file IMGT output %s.' % imgt_output) | 
|  | 236 | 
|  | 237     return temp_dir, imgt_files | 
|  | 238 | 
|  | 239 | 
|  | 240 # TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks | 
|  | 241 def readOneIgBlastResult(block): | 
|  | 242     """ | 
|  | 243     Parse a single IgBLAST query result | 
|  | 244 | 
|  | 245     Arguments: | 
|  | 246     block =  itertools groupby object of single result | 
|  | 247 | 
|  | 248     Returns: | 
|  | 249     None if no results, otherwise list of DataFrames for each result block | 
|  | 250     """ | 
|  | 251     results = list() | 
|  | 252     i = 0 | 
|  | 253     for match, subblock in groupby(block, lambda l: l=='\n'): | 
|  | 254         if not match: | 
|  | 255             # Strip whitespace and comments | 
|  | 256             sub = [s.strip() for s in subblock if not s.startswith('#')] | 
|  | 257 | 
|  | 258             # Continue on empty block | 
|  | 259             if not sub:  continue | 
|  | 260             else:  i += 1 | 
|  | 261 | 
|  | 262             # Split by tabs | 
|  | 263             sub = [s.split('\t') for s in sub] | 
|  | 264 | 
|  | 265             # Append list for "V-(D)-J rearrangement summary" (i == 1) | 
|  | 266             # And "V-(D)-J junction details" (i == 2) | 
|  | 267             # Otherwise append DataFrame of subblock | 
|  | 268             if i == 1 or i == 2: | 
|  | 269                 results.append(sub[0]) | 
|  | 270             else: | 
|  | 271                 df = pd.DataFrame(sub) | 
|  | 272                 if not df.empty: results.append(df) | 
|  | 273 | 
|  | 274     return results if results else None | 
|  | 275 | 
|  | 276 | 
|  | 277 # TODO:  needs more speeds. pandas is probably to blame. | 
|  | 278 def readIgBlast(igblast_output, seq_dict, repo_dict, | 
|  | 279                 score_fields=False, region_fields=False): | 
|  | 280     """ | 
|  | 281     Reads IgBlast output | 
|  | 282 | 
|  | 283     Arguments: | 
|  | 284     igblast_output = IgBlast output file (format 7) | 
|  | 285     seq_dict = a dictionary of {ID:Seq} from input fasta file | 
|  | 286     repo_dict = dictionary of IMGT gapped germline sequences | 
|  | 287     score_fields = if True parse alignment scores | 
|  | 288     region_fields = if True add FWR and CDR region fields | 
|  | 289 | 
|  | 290     Returns: | 
|  | 291     a generator of dictionaries containing alignment data | 
|  | 292     """ | 
|  | 293 | 
|  | 294     # Open IgBlast output file | 
|  | 295     with open(igblast_output) as f: | 
|  | 296         # Iterate over individual results (separated by # IGBLASTN) | 
|  | 297         for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)): | 
|  | 298             block = list(block) | 
|  | 299             if not k1: | 
|  | 300                 # TODO: move query name extraction into block parser readOneIgBlastResult(). | 
|  | 301                 # Extract sequence ID | 
|  | 302                 query_name = ' '.join(block[0].strip().split(' ')[2:]) | 
|  | 303                 # Initialize db_gen to have ID and input sequence | 
|  | 304                 db_gen = {'SEQUENCE_ID':     query_name, | 
|  | 305                           'SEQUENCE_INPUT':  seq_dict[query_name]} | 
|  | 306 | 
|  | 307                 # Parse further sub-blocks | 
|  | 308                 block_list = readOneIgBlastResult(block) | 
|  | 309 | 
|  | 310                 # TODO: this is indented pretty far.  should be a separate function. or several functions. | 
|  | 311                 # If results exist, parse further to obtain full db_gen | 
|  | 312                 if block_list is not None: | 
|  | 313                     # Parse quality information | 
|  | 314                     db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F' | 
|  | 315                     db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F' | 
|  | 316                     db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F' | 
|  | 317                     if block_list[0][-1] == '-': | 
|  | 318                         db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'], | 
|  | 319                                                            IUPAC.ambiguous_dna).reverse_complement()) | 
|  | 320 | 
|  | 321                     # Parse V, D, and J calls | 
|  | 322                     call_str = ' '.join(block_list[0]) | 
|  | 323                     v_call = parseAllele(call_str, v_allele_regex, action='list') | 
|  | 324                     d_call = parseAllele(call_str, d_allele_regex, action='list') | 
|  | 325                     j_call = parseAllele(call_str, j_allele_regex, action='list') | 
|  | 326                     db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None' | 
|  | 327                     db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None' | 
|  | 328                     db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None' | 
|  | 329 | 
|  | 330                     # Parse junction sequence | 
|  | 331                     # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1])) | 
|  | 332                     # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ']) | 
|  | 333 | 
|  | 334                     # TODO:  IgBLAST does a stupid and doesn't output block #3 sometimes. why? | 
|  | 335                     # TODO:  maybe we should fail these. they look craptastic. | 
|  | 336                     #pd.set_option('display.width', 500) | 
|  | 337                     #print query_name, len(block_list), hit_idx | 
|  | 338                     #for i, x in enumerate(block_list): | 
|  | 339                     #    print '[%i]' % i | 
|  | 340                     #    print x | 
|  | 341 | 
|  | 342                     # Parse segment start and stop positions | 
|  | 343                     hit_df = block_list[-1] | 
|  | 344 | 
|  | 345                     # Alignment info block | 
|  | 346                     #  0:  segment | 
|  | 347                     #  1:  query id | 
|  | 348                     #  2:  subject id | 
|  | 349                     #  3:  % identity | 
|  | 350                     #  4:  alignment length | 
|  | 351                     #  5:  mismatches | 
|  | 352                     #  6:  gap opens | 
|  | 353                     #  7:  gaps | 
|  | 354                     #  8:  q. start | 
|  | 355                     #  9:  q. end | 
|  | 356                     # 10:  s. start | 
|  | 357                     # 11:  s. end | 
|  | 358                     # 12:  evalue | 
|  | 359                     # 13:  bit score | 
|  | 360                     # 14:  query seq | 
|  | 361                     # 15:  subject seq | 
|  | 362                     # 16:  btop | 
|  | 363 | 
|  | 364                     # If V call exists, parse V alignment information | 
|  | 365                     seq_vdj = '' | 
|  | 366                     if v_call is not None: | 
|  | 367                         v_align = hit_df[hit_df[0] == 'V'].iloc[0] | 
|  | 368                         # Germline positions | 
|  | 369                         db_gen['V_GERM_START_VDJ'] = int(v_align[10]) | 
|  | 370                         db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1 | 
|  | 371                         # Query sequence positions | 
|  | 372                         db_gen['V_SEQ_START'] = int(v_align[8]) | 
|  | 373                         db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1 | 
|  | 374 | 
|  | 375                         if int(v_align[6]) == 0: | 
|  | 376                             db_gen['INDELS'] = 'F' | 
|  | 377                         else: | 
|  | 378                             db_gen['INDELS'] = 'T' | 
|  | 379                             # Set functional to none so record gets tossed (junction will be wrong) | 
|  | 380                             # db_gen['FUNCTIONAL'] = None | 
|  | 381 | 
|  | 382                         # V alignment scores | 
|  | 383                         if score_fields: | 
|  | 384                             try: db_gen['V_SCORE'] = float(v_align[13]) | 
|  | 385                             except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' | 
|  | 386 | 
|  | 387                             try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0 | 
|  | 388                             except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' | 
|  | 389 | 
|  | 390                             try: db_gen['V_EVALUE'] = float(v_align[12]) | 
|  | 391                             except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None' | 
|  | 392 | 
|  | 393                             try: db_gen['V_BTOP'] = v_align[16] | 
|  | 394                             except (TypeError, ValueError): db_gen['V_BTOP'] = 'None' | 
|  | 395 | 
|  | 396                         # Update VDJ sequence, removing insertions | 
|  | 397                         start = 0 | 
|  | 398                         for m in re.finditer(r'-', v_align[15]): | 
|  | 399                             ins = m.start() | 
|  | 400                             seq_vdj += v_align[14][start:ins] | 
|  | 401                             start = ins + 1 | 
|  | 402                         seq_vdj += v_align[14][start:] | 
|  | 403 | 
|  | 404                     # TODO:  needs to check that the V results are present before trying to determine N1_LENGTH from them. | 
|  | 405                     # If D call exists, parse D alignment information | 
|  | 406                     if d_call is not None: | 
|  | 407                         d_align = hit_df[hit_df[0] == 'D'].iloc[0] | 
|  | 408 | 
|  | 409                         # TODO:  this is kinda gross.  not sure how else to fix the alignment overlap problem though. | 
|  | 410                         # Determine N-region length and amount of J overlap with V or D alignment | 
|  | 411                         overlap = 0 | 
|  | 412                         if v_call is not None: | 
|  | 413                             n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) | 
|  | 414                             if n1_len < 0: | 
|  | 415                                 db_gen['N1_LENGTH'] = 0 | 
|  | 416                                 overlap = abs(n1_len) | 
|  | 417                             else: | 
|  | 418                                 db_gen['N1_LENGTH'] = n1_len | 
|  | 419                                 n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1) | 
|  | 420                                 n1_end = int(d_align[8])-1 | 
|  | 421                                 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] | 
|  | 422 | 
|  | 423                         # Query sequence positions | 
|  | 424                         db_gen['D_SEQ_START'] = int(d_align[8]) + overlap | 
|  | 425                         db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0) | 
|  | 426 | 
|  | 427                         # Germline positions | 
|  | 428                         db_gen['D_GERM_START'] = int(d_align[10]) + overlap | 
|  | 429                         db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0) | 
|  | 430 | 
|  | 431                         # Update VDJ sequence, removing insertions | 
|  | 432                         start = overlap | 
|  | 433                         for m in re.finditer(r'-', d_align[15]): | 
|  | 434                             ins = m.start() | 
|  | 435                             seq_vdj += d_align[14][start:ins] | 
|  | 436                             start = ins + 1 | 
|  | 437                         seq_vdj += d_align[14][start:] | 
|  | 438 | 
|  | 439                     # TODO:  needs to check that the V results are present before trying to determine N1_LENGTH from them. | 
|  | 440                     # If J call exists, parse J alignment information | 
|  | 441                     if j_call is not None: | 
|  | 442                         j_align = hit_df[hit_df[0] == 'J'].iloc[0] | 
|  | 443 | 
|  | 444                         # TODO:  this is kinda gross.  not sure how else to fix the alignment overlap problem though. | 
|  | 445                         # Determine N-region length and amount of J overlap with V or D alignment | 
|  | 446                         overlap = 0 | 
|  | 447                         if d_call is not None: | 
|  | 448                             n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH']) | 
|  | 449                             if n2_len < 0: | 
|  | 450                                 db_gen['N2_LENGTH'] = 0 | 
|  | 451                                 overlap = abs(n2_len) | 
|  | 452                             else: | 
|  | 453                                 db_gen['N2_LENGTH'] = n2_len | 
|  | 454                                 n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1) | 
|  | 455                                 n2_end = int(j_align[8])-1 | 
|  | 456                                 seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end] | 
|  | 457                         elif v_call is not None: | 
|  | 458                             n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) | 
|  | 459                             if n1_len < 0: | 
|  | 460                                 db_gen['N1_LENGTH'] = 0 | 
|  | 461                                 overlap = abs(n1_len) | 
|  | 462                             else: | 
|  | 463                                 db_gen['N1_LENGTH'] = n1_len | 
|  | 464                                 n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1) | 
|  | 465                                 n1_end = int(j_align[8])-1 | 
|  | 466                                 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] | 
|  | 467                         else: | 
|  | 468                             db_gen['N1_LENGTH'] = 0 | 
|  | 469 | 
|  | 470                         # Query positions | 
|  | 471                         db_gen['J_SEQ_START'] = int(j_align[8]) + overlap | 
|  | 472                         db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0) | 
|  | 473 | 
|  | 474                         # Germline positions | 
|  | 475                         db_gen['J_GERM_START'] = int(j_align[10]) + overlap | 
|  | 476                         db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0) | 
|  | 477 | 
|  | 478                         # J alignment scores | 
|  | 479                         if score_fields: | 
|  | 480                             try: db_gen['J_SCORE'] = float(j_align[13]) | 
|  | 481                             except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' | 
|  | 482 | 
|  | 483                             try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0 | 
|  | 484                             except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' | 
|  | 485 | 
|  | 486                             try: db_gen['J_EVALUE'] = float(j_align[12]) | 
|  | 487                             except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None' | 
|  | 488 | 
|  | 489                             try: db_gen['J_BTOP'] = j_align[16] | 
|  | 490                             except (TypeError, ValueError): db_gen['J_BTOP'] = 'None' | 
|  | 491 | 
|  | 492                         # Update VDJ sequence, removing insertions | 
|  | 493                         start = overlap | 
|  | 494                         for m in re.finditer(r'-', j_align[15]): | 
|  | 495                             ins = m.start() | 
|  | 496                             seq_vdj += j_align[14][start:ins] | 
|  | 497                             start = ins + 1 | 
|  | 498                         seq_vdj += j_align[14][start:] | 
|  | 499 | 
|  | 500                     db_gen['SEQUENCE_VDJ'] = seq_vdj | 
|  | 501 | 
|  | 502                     # Create IMGT-gapped sequence and infer IMGT junction | 
|  | 503                     if v_call is not None: | 
|  | 504                         db_gen = gapV(db_gen, repo_dict) | 
|  | 505                         if j_call is not None: | 
|  | 506                             db_gen = getIMGTJunc(db_gen, repo_dict) | 
|  | 507 | 
|  | 508                     # FWR and CDR regions | 
|  | 509                     if region_fields: getRegions(db_gen) | 
|  | 510 | 
|  | 511                 yield IgRecord(db_gen) | 
|  | 512 | 
|  | 513 | 
|  | 514 # TODO:  should be more readable | 
|  | 515 def readIMGT(imgt_files, score_fields=False, region_fields=False): | 
|  | 516     """ | 
|  | 517     Reads IMGT/HighV-Quest output | 
|  | 518 | 
|  | 519     Arguments: | 
|  | 520     imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6 | 
|  | 521     score_fields = if True parse alignment scores | 
|  | 522     region_fields = if True add FWR and CDR region fields | 
|  | 523 | 
|  | 524     Returns: | 
|  | 525     a generator of dictionaries containing alignment data | 
|  | 526     """ | 
|  | 527     imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files] | 
|  | 528     # Create a dictionary for each sequence alignment and yield its generator | 
|  | 529     for sm, gp, nt, jn in zip(*imgt_iters): | 
|  | 530         if len(set([sm['Sequence ID'], | 
|  | 531                     gp['Sequence ID'], | 
|  | 532                     nt['Sequence ID'], | 
|  | 533                     jn['Sequence ID']])) != 1: | 
|  | 534             sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \ | 
|  | 535                      % sm['Sequence ID']) | 
|  | 536 | 
|  | 537         db_gen = {'SEQUENCE_ID': sm['Sequence ID'], | 
|  | 538                   'SEQUENCE_INPUT': sm['Sequence']} | 
|  | 539 | 
|  | 540         if 'No results' not in sm['Functionality']: | 
|  | 541             db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) + | 
|  | 542                                                  ('unprod' in sm['Functionality'])] | 
|  | 543             db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) + | 
|  | 544                                                ('out-of-frame' in sm['JUNCTION frame'])], | 
|  | 545             db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) + | 
|  | 546                                            ('unprod' in sm['Functionality'])] | 
|  | 547             db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'], | 
|  | 548                                                          'missing' in sm['V-REGION potential ins/del']))) + | 
|  | 549                                                          ('unprod' in sm['Functionality'])] | 
|  | 550             db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'], | 
|  | 551                                               sm['V-REGION insertions'], | 
|  | 552                                               sm['V-REGION deletions']))] | 
|  | 553 | 
|  | 554             db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION'] | 
|  | 555             db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION'] | 
|  | 556 | 
|  | 557             db_gen['V_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['V-GENE and allele'])) | 
|  | 558             db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele'])) | 
|  | 559             db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele'])) | 
|  | 560 | 
|  | 561             v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0 | 
|  | 562             db_gen['V_SEQ_START'] = nt['V-REGION start'] | 
|  | 563             db_gen['V_SEQ_LENGTH'] = v_seq_length | 
|  | 564             db_gen['V_GERM_START_IMGT'] = 1 | 
|  | 565             db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0 | 
|  | 566 | 
|  | 567             db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"], | 
|  | 568                                                        jn['N-REGION-nt nb'], | 
|  | 569                                                        jn['N1-REGION-nt nb'], | 
|  | 570                                                        jn["P5'D-nt nb"]] if i) | 
|  | 571             db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length, | 
|  | 572                                                          jn["P3'V-nt nb"], | 
|  | 573                                                          jn['N-REGION-nt nb'], | 
|  | 574                                                          jn['N1-REGION-nt nb'], | 
|  | 575                                                          jn["P5'D-nt nb"]] if i) | 
|  | 576             db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) | 
|  | 577             db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1 | 
|  | 578             db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) | 
|  | 579             db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"], | 
|  | 580                                                        jn['N2-REGION-nt nb'], | 
|  | 581                                                        jn["P5'J-nt nb"]] if i) | 
|  | 582 | 
|  | 583             db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length, | 
|  | 584                                                          jn["P3'V-nt nb"], | 
|  | 585                                                          jn['N-REGION-nt nb'], | 
|  | 586                                                          jn['N1-REGION-nt nb'], | 
|  | 587                                                          jn["P5'D-nt nb"], | 
|  | 588                                                          jn["D-REGION-nt nb"], | 
|  | 589                                                          jn["P3'D-nt nb"], | 
|  | 590                                                          jn['N2-REGION-nt nb'], | 
|  | 591                                                          jn["P5'J-nt nb"]] if i) | 
|  | 592             db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0 | 
|  | 593             db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1 | 
|  | 594             db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0 | 
|  | 595 | 
|  | 596             db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0 | 
|  | 597             db_gen['JUNCTION'] = jn['JUNCTION'] | 
|  | 598 | 
|  | 599             # Alignment scores | 
|  | 600             if score_fields: | 
|  | 601                 try:  db_gen['V_SCORE'] = float(sm['V-REGION score']) | 
|  | 602                 except (TypeError, ValueError):  db_gen['V_SCORE'] = 'None' | 
|  | 603 | 
|  | 604                 try:  db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0 | 
|  | 605                 except (TypeError, ValueError):  db_gen['V_IDENTITY'] = 'None' | 
|  | 606 | 
|  | 607                 try:  db_gen['J_SCORE'] = float(sm['J-REGION score']) | 
|  | 608                 except (TypeError, ValueError):  db_gen['J_SCORE'] = 'None' | 
|  | 609 | 
|  | 610                 try:  db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0 | 
|  | 611                 except (TypeError, ValueError):  db_gen['J_IDENTITY'] = 'None' | 
|  | 612 | 
|  | 613             # FWR and CDR regions | 
|  | 614             if region_fields: getRegions(db_gen) | 
|  | 615         else: | 
|  | 616             db_gen['V_CALL'] = 'None' | 
|  | 617             db_gen['D_CALL'] = 'None' | 
|  | 618             db_gen['J_CALL'] = 'None' | 
|  | 619 | 
|  | 620         yield IgRecord(db_gen) | 
|  | 621 | 
|  | 622 | 
|  | 623 def getIDforIMGT(seq_file): | 
|  | 624     """ | 
|  | 625     Create a sequence ID translation using IMGT truncation | 
|  | 626 | 
|  | 627     Arguments: | 
|  | 628     seq_file = a fasta file of sequences input to IMGT | 
|  | 629 | 
|  | 630     Returns: | 
|  | 631     a dictionary of {truncated ID: full seq description} | 
|  | 632     """ | 
|  | 633 | 
|  | 634     # Create a seq_dict ID translation using IDs truncate up to space or 50 chars | 
|  | 635     ids = {} | 
|  | 636     for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)): | 
|  | 637         if len(rec.description) <= 50: | 
|  | 638             id_key = rec.description | 
|  | 639         else: | 
|  | 640             id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50]) | 
|  | 641         ids.update({id_key:rec.description}) | 
|  | 642 | 
|  | 643     return ids | 
|  | 644 | 
|  | 645 | 
|  | 646 def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True, | 
|  | 647             score_fields=False, region_fields=False, out_args=default_out_args): | 
|  | 648     """ | 
|  | 649     Writes tab-delimited database file in output directory | 
|  | 650 | 
|  | 651     Arguments: | 
|  | 652     db_gen = a generator of IgRecord objects containing alignment data | 
|  | 653     file_prefix = directory and prefix for CLIP tab-delim file | 
|  | 654     total_count = number of records (for progress bar) | 
|  | 655     id_dict = a dictionary of {IMGT ID: full seq description} | 
|  | 656     no_parse = if ID is to be parsed for pRESTO output with default delimiters | 
|  | 657     score_fields = if True add alignment score fields to output file | 
|  | 658     region_fields = if True add FWR and CDR region fields to output file | 
|  | 659     out_args = common output argument dictionary from parseCommonArgs | 
|  | 660 | 
|  | 661     Returns: | 
|  | 662     None | 
|  | 663     """ | 
|  | 664     pass_file = "%s_db-pass.tab" % file_prefix | 
|  | 665     fail_file = "%s_db-fail.tab" % file_prefix | 
|  | 666     ordered_fields = ['SEQUENCE_ID', | 
|  | 667                       'SEQUENCE_INPUT', | 
|  | 668                       'FUNCTIONAL', | 
|  | 669                       'IN_FRAME', | 
|  | 670                       'STOP', | 
|  | 671                       'MUTATED_INVARIANT', | 
|  | 672                       'INDELS', | 
|  | 673                       'V_CALL', | 
|  | 674                       'D_CALL', | 
|  | 675                       'J_CALL', | 
|  | 676                       'SEQUENCE_VDJ', | 
|  | 677                       'SEQUENCE_IMGT', | 
|  | 678                       'V_SEQ_START', | 
|  | 679                       'V_SEQ_LENGTH', | 
|  | 680                       'V_GERM_START_VDJ', | 
|  | 681                       'V_GERM_LENGTH_VDJ', | 
|  | 682                       'V_GERM_START_IMGT', | 
|  | 683                       'V_GERM_LENGTH_IMGT', | 
|  | 684                       'N1_LENGTH', | 
|  | 685                       'D_SEQ_START', | 
|  | 686                       'D_SEQ_LENGTH', | 
|  | 687                       'D_GERM_START', | 
|  | 688                       'D_GERM_LENGTH', | 
|  | 689                       'N2_LENGTH', | 
|  | 690                       'J_SEQ_START', | 
|  | 691                       'J_SEQ_LENGTH', | 
|  | 692                       'J_GERM_START', | 
|  | 693                       'J_GERM_LENGTH', | 
|  | 694                       'JUNCTION_LENGTH', | 
|  | 695                       'JUNCTION'] | 
|  | 696 | 
|  | 697     if score_fields: | 
|  | 698         ordered_fields.extend(['V_SCORE', | 
|  | 699                                'V_IDENTITY', | 
|  | 700                                'V_EVALUE', | 
|  | 701                                'V_BTOP', | 
|  | 702                                'J_SCORE', | 
|  | 703                                'J_IDENTITY', | 
|  | 704                                'J_EVALUE', | 
|  | 705                                'J_BTOP']) | 
|  | 706 | 
|  | 707     if region_fields: | 
|  | 708         ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT', | 
|  | 709                                'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT']) | 
|  | 710 | 
|  | 711 | 
|  | 712     # TODO:  This is not the best approach. should pass in output fields. | 
|  | 713     # Initiate passed handle | 
|  | 714     pass_handle = None | 
|  | 715 | 
|  | 716     # Open failed file | 
|  | 717     if out_args['failed']: | 
|  | 718         fail_handle = open(fail_file, 'wt') | 
|  | 719         fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT']) | 
|  | 720     else: | 
|  | 721         fail_handle = None | 
|  | 722         fail_writer = None | 
|  | 723 | 
|  | 724     # Initialize counters and file | 
|  | 725     pass_writer = None | 
|  | 726     start_time = time() | 
|  | 727     rec_count = pass_count = fail_count = 0 | 
|  | 728     for record in db_gen: | 
|  | 729         #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) | 
|  | 730         printProgress(rec_count, total_count, 0.05, start_time) | 
|  | 731         rec_count += 1 | 
|  | 732 | 
|  | 733         # Count pass or fail | 
|  | 734         if (record.v_call == 'None' and record.j_call == 'None') or \ | 
|  | 735                 record.functional is None or \ | 
|  | 736                 not record.seq_vdj or \ | 
|  | 737                 not record.junction: | 
|  | 738             # print(record.v_call, record.j_call, record.functional, record.junction) | 
|  | 739             fail_count += 1 | 
|  | 740             if fail_writer is not None: fail_writer.writerow(record.toDict()) | 
|  | 741             continue | 
|  | 742         else: | 
|  | 743             pass_count += 1 | 
|  | 744 | 
|  | 745         # Build sample sequence description | 
|  | 746         if record.id in id_dict: | 
|  | 747             record.id = id_dict[record.id] | 
|  | 748 | 
|  | 749         # Parse sequence description into new columns | 
|  | 750         if not no_parse: | 
|  | 751             record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) | 
|  | 752             record.id = record.annotations['ID'] | 
|  | 753             del record.annotations['ID'] | 
|  | 754 | 
|  | 755         # TODO:  This is not the best approach. should pass in output fields. | 
|  | 756         # If first sequence, use parsed description to create new columns and initialize writer | 
|  | 757         if pass_writer is None: | 
|  | 758             if not no_parse:  ordered_fields.extend(list(record.annotations.keys())) | 
|  | 759             pass_handle = open(pass_file, 'wt') | 
|  | 760             pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields) | 
|  | 761 | 
|  | 762         # Write row to tab-delim CLIP file | 
|  | 763         pass_writer.writerow(record.toDict()) | 
|  | 764 | 
|  | 765     # Print log | 
|  | 766     #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) | 
|  | 767     printProgress(rec_count, total_count, 0.05, start_time) | 
|  | 768 | 
|  | 769     log = OrderedDict() | 
|  | 770     log['OUTPUT'] = pass_file | 
|  | 771     log['PASS'] = pass_count | 
|  | 772     log['FAIL'] = fail_count | 
|  | 773     log['END'] = 'MakeDb' | 
|  | 774     printLog(log) | 
|  | 775 | 
|  | 776     if pass_handle is not None: pass_handle.close() | 
|  | 777     if fail_handle is not None: fail_handle.close() | 
|  | 778 | 
|  | 779 | 
|  | 780 # TODO:  may be able to merge with parseIMGT | 
|  | 781 def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False, | 
|  | 782                  region_fields=False, out_args=default_out_args): | 
|  | 783     """ | 
|  | 784     Main for IgBlast aligned sample sequences | 
|  | 785 | 
|  | 786     Arguments: | 
|  | 787     igblast_output = IgBlast output file to process | 
|  | 788     seq_file = fasta file input to IgBlast (from which to get sequence) | 
|  | 789     repo = folder with germline repertoire files | 
|  | 790     no_parse = if ID is to be parsed for pRESTO output with default delimiters | 
|  | 791     score_fields = if True add alignment score fields to output file | 
|  | 792     region_fields = if True add FWR and CDR region fields to output file | 
|  | 793     out_args = common output argument dictionary from parseCommonArgs | 
|  | 794 | 
|  | 795     Returns: | 
|  | 796     None | 
|  | 797     """ | 
|  | 798     # Print parameter info | 
|  | 799     log = OrderedDict() | 
|  | 800     log['START'] = 'MakeDB' | 
|  | 801     log['ALIGNER'] = 'IgBlast' | 
|  | 802     log['ALIGN_RESULTS'] = os.path.basename(igblast_output) | 
|  | 803     log['SEQ_FILE'] = os.path.basename(seq_file) | 
|  | 804     log['NO_PARSE'] = no_parse | 
|  | 805     log['SCORE_FIELDS'] = score_fields | 
|  | 806     log['REGION_FIELDS'] = region_fields | 
|  | 807     printLog(log) | 
|  | 808 | 
|  | 809     # Get input sequence dictionary | 
|  | 810     seq_dict = getSeqforIgBlast(seq_file) | 
|  | 811 | 
|  | 812     # Formalize out_dir and file-prefix | 
|  | 813     if not out_args['out_dir']: | 
|  | 814         out_dir = os.path.split(igblast_output)[0] | 
|  | 815     else: | 
|  | 816         out_dir = os.path.abspath(out_args['out_dir']) | 
|  | 817         if not os.path.exists(out_dir):  os.mkdir(out_dir) | 
|  | 818     if out_args['out_name']: | 
|  | 819         file_prefix = out_args['out_name'] | 
|  | 820     else: | 
|  | 821         file_prefix = os.path.basename(os.path.splitext(igblast_output)[0]) | 
|  | 822     file_prefix = os.path.join(out_dir, file_prefix) | 
|  | 823 | 
|  | 824     total_count = countSeqFile(seq_file) | 
|  | 825 | 
|  | 826     # Create | 
|  | 827     repo_dict = getRepo(repo) | 
|  | 828     igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict, | 
|  | 829                                score_fields=score_fields, region_fields=region_fields) | 
|  | 830     writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse, | 
|  | 831             score_fields=score_fields, region_fields=region_fields, out_args=out_args) | 
|  | 832 | 
|  | 833 | 
|  | 834 # TODO:  may be able to merge with parseIgBlast | 
|  | 835 def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False, | 
|  | 836               region_fields=False, out_args=default_out_args): | 
|  | 837     """ | 
|  | 838     Main for IMGT aligned sample sequences | 
|  | 839 | 
|  | 840     Arguments: | 
|  | 841     imgt_output = zipped file or unzipped folder output by IMGT | 
|  | 842     seq_file = FASTA file input to IMGT (from which to get seqID) | 
|  | 843     no_parse = if ID is to be parsed for pRESTO output with default delimiters | 
|  | 844     score_fields = if True add alignment score fields to output file | 
|  | 845     region_fields = if True add FWR and CDR region fields to output file | 
|  | 846     out_args = common output argument dictionary from parseCommonArgs | 
|  | 847 | 
|  | 848     Returns: | 
|  | 849     None | 
|  | 850     """ | 
|  | 851     # Print parameter info | 
|  | 852     log = OrderedDict() | 
|  | 853     log['START'] = 'MakeDb' | 
|  | 854     log['ALIGNER'] = 'IMGT' | 
|  | 855     log['ALIGN_RESULTS'] = imgt_output | 
|  | 856     log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' | 
|  | 857     log['NO_PARSE'] = no_parse | 
|  | 858     log['SCORE_FIELDS'] = score_fields | 
|  | 859     log['REGION_FIELDS'] = region_fields | 
|  | 860     printLog(log) | 
|  | 861 | 
|  | 862     # Get individual IMGT result files | 
|  | 863     temp_dir, imgt_files = extractIMGT(imgt_output) | 
|  | 864 | 
|  | 865     # Formalize out_dir and file-prefix | 
|  | 866     if not out_args['out_dir']: | 
|  | 867         out_dir = os.path.dirname(os.path.abspath(imgt_output)) | 
|  | 868     else: | 
|  | 869         out_dir = os.path.abspath(out_args['out_dir']) | 
|  | 870         if not os.path.exists(out_dir):  os.mkdir(out_dir) | 
|  | 871     if out_args['out_name']: | 
|  | 872         file_prefix = out_args['out_name'] | 
|  | 873     else: | 
|  | 874         file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0] | 
|  | 875     file_prefix = os.path.join(out_dir, file_prefix) | 
|  | 876 | 
|  | 877     total_count = countDbFile(imgt_files[0]) | 
|  | 878 | 
|  | 879     # Get (parsed) IDs from fasta file submitted to IMGT | 
|  | 880     id_dict = getIDforIMGT(seq_file) if seq_file else {} | 
|  | 881 | 
|  | 882     # Create | 
|  | 883     imgt_dict = readIMGT(imgt_files, score_fields=score_fields, | 
|  | 884                          region_fields=region_fields) | 
|  | 885     writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse, | 
|  | 886             score_fields=score_fields, region_fields=region_fields, out_args=out_args) | 
|  | 887 | 
|  | 888     # Delete temp directory | 
|  | 889     rmtree(temp_dir) | 
|  | 890 | 
|  | 891 | 
|  | 892 def getArgParser(): | 
|  | 893     """ | 
|  | 894     Defines the ArgumentParser | 
|  | 895 | 
|  | 896     Arguments: | 
|  | 897     None | 
|  | 898 | 
|  | 899     Returns: | 
|  | 900     an ArgumentParser object | 
|  | 901     """ | 
|  | 902     fields = dedent( | 
|  | 903              ''' | 
|  | 904               output files: | 
|  | 905                   db-pass | 
|  | 906                       database of parsed alignment records. | 
|  | 907                   db-fail | 
|  | 908                       database with records failing alignment. | 
|  | 909 | 
|  | 910               output fields: | 
|  | 911                   SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, | 
|  | 912                   INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT, | 
|  | 913                   V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT, | 
|  | 914                   V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH, | 
|  | 915                   D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH, | 
|  | 916                   J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH, | 
|  | 917                   JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP, | 
|  | 918                   J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, | 
|  | 919                   FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT | 
|  | 920               ''') | 
|  | 921 | 
|  | 922     # Define ArgumentParser | 
|  | 923     parser = ArgumentParser(description=__doc__, epilog=fields, | 
|  | 924                             formatter_class=CommonHelpFormatter) | 
|  | 925     parser.add_argument('--version', action='version', | 
|  | 926                         version='%(prog)s:' + ' %s-%s' %(__version__, __date__)) | 
|  | 927     subparsers = parser.add_subparsers(title='subcommands', dest='command', | 
|  | 928                                        help='Aligner used', metavar='') | 
|  | 929     # TODO:  This is a temporary fix for Python issue 9253 | 
|  | 930     subparsers.required = True | 
|  | 931 | 
|  | 932     # Parent parser | 
|  | 933     parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False) | 
|  | 934 | 
|  | 935     # IgBlast Aligner | 
|  | 936     parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output', | 
|  | 937                                            parents=[parser_parent], | 
|  | 938                                            formatter_class=CommonHelpFormatter) | 
|  | 939     parser_igblast.set_defaults(func=parseIgBlast) | 
|  | 940     parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files', | 
|  | 941                                 required=True, | 
|  | 942                                 help='''IgBLAST output files in format 7 with query sequence | 
|  | 943                                      (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''') | 
|  | 944     parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True, | 
|  | 945                                 help='''List of folders and/or fasta files containing | 
|  | 946                                      IMGT-gapped germline sequences corresponding to the | 
|  | 947                                      set of germlines used in the IgBLAST alignment.''') | 
|  | 948     parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', | 
|  | 949                                 required=True, | 
|  | 950                                 help='List of input FASTA files containing sequences') | 
|  | 951     parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse', | 
|  | 952                                 help='''Specify if input IDs should not be parsed to add | 
|  | 953                                      new columns to database.''') | 
|  | 954     parser_igblast.add_argument('--scores', action='store_true', dest='score_fields', | 
|  | 955                                 help='''Specify if alignment score metrics should be | 
|  | 956                                      included in the output. Adds the V_SCORE, V_IDENTITY, | 
|  | 957                                      V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY, | 
|  | 958                                      J_BTOP, and J_EVALUE columns.''') | 
|  | 959     parser_igblast.add_argument('--regions', action='store_true', dest='region_fields', | 
|  | 960                                 help='''Specify if IMGT framework and CDR regions should be | 
|  | 961                                      included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | 
|  | 962                                      FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | 
|  | 963                                      CDR3_IMGT columns.''') | 
|  | 964 | 
|  | 965     # IMGT aligner | 
|  | 966     parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output', | 
|  | 967                                         parents=[parser_parent], | 
|  | 968                                         formatter_class=CommonHelpFormatter) | 
|  | 969     imgt_arg_group =  parser_imgt.add_mutually_exclusive_group(required=True) | 
|  | 970     imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files', | 
|  | 971                                 help='''Either zipped IMGT output files (.zip) or a folder | 
|  | 972                                      containing unzipped IMGT output files (which must | 
|  | 973                                      include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, | 
|  | 974                                      and 6_Junction).''') | 
|  | 975     parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', | 
|  | 976                              required=False, | 
|  | 977                              help='List of input FASTA files containing sequences') | 
|  | 978     parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', | 
|  | 979                              help='''Specify if input IDs should not be parsed to add new | 
|  | 980                                   columns to database.''') | 
|  | 981     parser_imgt.add_argument('--scores', action='store_true', dest='score_fields', | 
|  | 982                              help='''Specify if alignment score metrics should be | 
|  | 983                                   included in the output. Adds the V_SCORE, V_IDENTITY, | 
|  | 984                                   J_SCORE and J_IDENTITY. Note, this will also add | 
|  | 985                                   the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP, | 
|  | 986                                   but they will be empty for IMGT output.''') | 
|  | 987     parser_imgt.add_argument('--regions', action='store_true', dest='region_fields', | 
|  | 988                              help='''Specify if IMGT framework and CDR regions should be | 
|  | 989                                   included in the output. Adds the FWR1_IMGT, FWR2_IMGT, | 
|  | 990                                   FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and | 
|  | 991                                   CDR3_IMGT columns.''') | 
|  | 992     parser_imgt.set_defaults(func=parseIMGT) | 
|  | 993 | 
|  | 994     return parser | 
|  | 995 | 
|  | 996 | 
|  | 997 if __name__ == "__main__": | 
|  | 998     """ | 
|  | 999     Parses command line arguments and calls main | 
|  | 1000     """ | 
|  | 1001     parser = getArgParser() | 
|  | 1002     args = parser.parse_args() | 
|  | 1003     args_dict = parseCommonArgs(args, in_arg='aligner_files') | 
|  | 1004 | 
|  | 1005     # Set no ID parsing if sequence files are not provided | 
|  | 1006     if 'seq_files' in args_dict and not args_dict['seq_files']: | 
|  | 1007         args_dict['no_parse'] = True | 
|  | 1008 | 
|  | 1009     # Delete | 
|  | 1010     if 'seq_files' in args_dict: del args_dict['seq_files'] | 
|  | 1011     if 'aligner_files' in args_dict: del args_dict['aligner_files'] | 
|  | 1012     if 'command' in args_dict: del args_dict['command'] | 
|  | 1013     if 'func' in args_dict: del args_dict['func'] | 
|  | 1014 | 
|  | 1015     if args.command == 'imgt': | 
|  | 1016         for i in range(len(args.__dict__['aligner_files'])): | 
|  | 1017             args_dict['imgt_output'] = args.__dict__['aligner_files'][i] | 
|  | 1018             args_dict['seq_file'] = args.__dict__['seq_files'][i] \ | 
|  | 1019                                     if args.__dict__['seq_files'] else None | 
|  | 1020             args.func(**args_dict) | 
|  | 1021     elif args.command == 'igblast': | 
|  | 1022         for i in range(len(args.__dict__['aligner_files'])): | 
|  | 1023             args_dict['igblast_output'] =  args.__dict__['aligner_files'][i] | 
|  | 1024             args_dict['seq_file'] = args.__dict__['seq_files'][i] | 
|  | 1025             args.func(**args_dict) |