changeset 120:613278c1bde0 draft

Uploaded
author davidvanzessen
date Tue, 16 Aug 2016 09:10:50 -0400
parents 626a956f3811
children 31cca6d3722a
files change_o/DefineClones.py change_o/MakeDb.py change_o/define_clones.r change_o/define_clones.sh change_o/makedb.sh mutation_analysis.r wrapper.sh
diffstat 7 files changed, 2186 insertions(+), 37 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/DefineClones.py	Tue Aug 16 09:10:50 2016 -0400
@@ -0,0 +1,1052 @@
+#!/usr/bin/env python3
+"""
+Assign Ig sequences into clones
+"""
+# Info
+__author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman'
+from changeo import __version__, __date__
+
+# Imports
+import os
+import re
+import sys
+import numpy as np
+from argparse import ArgumentParser
+from collections import OrderedDict
+from itertools import chain
+from textwrap import dedent
+from time import time
+from Bio import pairwise2
+from Bio.Seq import translate
+
+# Presto and changeo imports
+from presto.Defaults import default_out_args
+from presto.IO import getFileType, getOutputHandle, printLog, printProgress
+from presto.Multiprocessing import manageProcesses
+from presto.Sequence import getDNAScoreDict
+from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
+from changeo.Distance import getDNADistMatrix, getAADistMatrix, \
+                             hs1f_model, m1n_model, hs5f_model, \
+                             calcDistances, formClusters
+from changeo.IO import getDbWriter, readDbFile, countDbFile
+from changeo.Multiprocessing import DbData, DbResult
+
+# Defaults
+default_translate = False
+default_distance = 0.0
+default_bygroup_model = 'hs1f'
+default_hclust_model = 'chen2010'
+default_seq_field = 'JUNCTION'
+default_norm = 'len'
+default_sym = 'avg'
+default_linkage = 'single'
+
+# TODO:  should be in Distance, but needs to be after function definitions
+# Amino acid Hamming distance
+aa_model = getAADistMatrix(mask_dist=1, gap_dist=0)
+
+# DNA Hamming distance
+ham_model = getDNADistMatrix(mask_dist=0, gap_dist=0)
+
+
+# TODO:  this function is an abstraction to facilitate later cleanup
+def getModelMatrix(model):
+    """
+    Simple wrapper to get distance matrix from model name
+
+    Arguments:
+    model = model name
+
+    Return:
+    a pandas.DataFrame containing the character distance matrix
+    """
+    if model == 'aa':
+        return(aa_model)
+    elif model == 'ham':
+        return(ham_model)
+    elif model == 'm1n':
+        return(m1n_model)
+    elif model == 'hs1f':
+        return(hs1f_model)
+    elif model == 'hs5f':
+        return(hs5f_model)
+    else:
+        sys.stderr.write('Unrecognized distance model: %s.\n' % model)
+
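+# A minimal usage sketch (illustrative only; assumes the model matrices are
+# pandas.DataFrames indexed by sequence characters, as constructed above):
+#
+#   dist_mat = getModelMatrix('hs1f')
+#   d = dist_mat.at['A', 'G']   # per-character distance between A and G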
+
+def indexJunctions(db_iter, fields=None, mode='gene', action='first'):
+    """
+    Identifies preclonal groups by V, J and junction length
+
+    Arguments: 
+    db_iter = an iterator of IgRecords defined by readDbFile
+    fields = additional annotation fields to use to group preclones;
+             if None use only V, J and junction length
+    mode = specificity of alignment call to use for assigning preclones;
+           one of ('allele', 'gene')
+    action = how to handle multiple value fields when assigning preclones;
+             one of ('first', 'set')
+    
+    Returns: 
+    a dictionary of {(V, J, junction length):[IgRecords]}
+    """
+    # Define functions for grouping keys
+    if mode == 'allele' and fields is None:
+        def _get_key(rec, act):
+            return (rec.getVAllele(act), rec.getJAllele(act),
+                    None if rec.junction is None else len(rec.junction))
+    elif mode == 'gene' and fields is None:
+        def _get_key(rec, act):  
+            return (rec.getVGene(act), rec.getJGene(act),
+                    None if rec.junction is None else len(rec.junction))
+    elif mode == 'allele' and fields is not None:
+        def _get_key(rec, act):
+            vdj = [rec.getVAllele(act), rec.getJAllele(act),
+                    None if rec.junction is None else len(rec.junction)]
+            ann = [rec.toDict().get(k, None) for k in fields]
+            return tuple(chain(vdj, ann))
+    elif mode == 'gene' and fields is not None:
+        def _get_key(rec, act):
+            vdj = [rec.getVGene(act), rec.getJGene(act),
+                    None if rec.junction is None else len(rec.junction)]
+            ann = [rec.toDict().get(k, None) for k in fields]
+            return tuple(chain(vdj, ann))
+
+    start_time = time()
+    clone_index = {}
+    rec_count = 0
+    for rec in db_iter:
+        key = _get_key(rec, action)
+
+        # Print progress
+        if rec_count == 0:
+            print('PROGRESS> Grouping sequences')
+
+        printProgress(rec_count, step=1000, start_time=start_time)
+        rec_count += 1
+
+        # Assign passed preclone records to their key; index failed records under None
+        if all([k is not None and k != '' for k in key]):
+            # TODO:  the 'set' action is slow and should be optimized.
+            if action == 'set':
+                
+                f_range = list(range(2, 3 + (len(fields) if fields else 0)))
+                vdj_range = list(range(2))
+                
+                # Check for any keys that have matching columns and junction length and overlapping genes/alleles
+                to_remove = []
+                if len(clone_index) > (1 if None in clone_index else 0) and key not in clone_index:
+                    key = list(key)
+                    for k in clone_index:
+                        if k is not None and all([key[i] == k[i] for i in f_range]):
+                            if all([not set(key[i]).isdisjoint(set(k[i])) for i in vdj_range]):
+                                for i in vdj_range:  key[i] = tuple(set(key[i]).union(set(k[i])))
+                                to_remove.append(k)
+                
+                # Remove original keys, replace with union of all genes/alleles and append values to new key
+                val = [rec]
+                val += list(chain(*(clone_index.pop(k) for k in to_remove)))
+                clone_index[tuple(key)] = clone_index.get(tuple(key),[]) + val 
+
+            elif action == 'first':
+                clone_index.setdefault(key, []).append(rec)
+        else:
+            clone_index.setdefault(None, []).append(rec)
+
+    printProgress(rec_count, step=1000, start_time=start_time, end=True)
+
+    return clone_index
+
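+# Illustrative shape of the index built above (assuming mode='gene',
+# action='first', and no extra fields): records are binned by a
+# (V gene, J gene, junction length) tuple, e.g.
+#
+#   ('IGHV3-23', 'IGHJ4', 48) -> [IgRecord, IgRecord, ...]
+#
+# while records missing any key component are collected under the None key.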
+
+def distanceClones(records, model=default_bygroup_model, distance=default_distance,
+                   dist_mat=None, norm=default_norm, sym=default_sym,
+                   linkage=default_linkage, seq_field=default_seq_field):
+    """
+    Separates a set of IgRecords into clones
+
+    Arguments: 
+    records = an iterator of IgRecords
+    model = substitution model used to calculate distance
+    distance = the distance threshold to assign clonal groups
+    dist_mat = pandas DataFrame of pairwise nucleotide or amino acid distances
+    norm = normalization method
+    sym = symmetry method
+    linkage = type of linkage
+    seq_field = sequence field used to calculate distance between records
+
+    Returns: 
+    a dictionary of lists defining {clone number: [IgRecords clonal group]}
+    """
+    # Get distance matrix if not provided
+    if dist_mat is None:  dist_mat = getModelMatrix(model)
+
+    # Determine length of n-mers
+    if model in ['hs1f', 'm1n', 'aa', 'ham']:
+        nmer_len = 1
+    elif model in ['hs5f']:
+        nmer_len = 5
+    else:
+        sys.stderr.write('Unrecognized distance model: %s.\n' % model)
+        # Bail out; nmer_len would otherwise be undefined below
+        return None
+
+    # Define unique junction mapping
+    seq_map = {}
+    for ig in records:
+        seq = ig.getSeqField(seq_field)
+        # Check if sequence length is 0
+        if len(seq) == 0:
+            return None
+
+        seq = re.sub(r'[.-]', 'N', str(seq))
+        if model == 'aa':  seq = translate(seq)
+
+        seq_map.setdefault(seq, []).append(ig)
+
+    # Process records
+    if len(seq_map) == 1:
+        return {1:records}
+
+    # Define sequences
+    seqs = list(seq_map.keys())
+
+    # Calculate pairwise distance matrix
+    dists = calcDistances(seqs, nmer_len, dist_mat, norm, sym)
+
+    # Perform hierarchical clustering
+    clusters = formClusters(dists, linkage, distance)
+
+    # Turn clusters into clone dictionary
+    clone_dict = {}
+    for i, c in enumerate(clusters):
+        clone_dict.setdefault(c, []).extend(seq_map[seqs[i]])
+
+    return clone_dict
+
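+# A hedged usage sketch (parameter values are illustrative, not defaults):
+# with norm='len' the threshold is a fraction of junction length, so
+# distance=0.17 allows roughly 17% of weighted junction positions to differ
+# within a clone.
+#
+#   clones = distanceClones(records, model='hs1f', distance=0.17,
+#                           dist_mat=getModelMatrix('hs1f'))
+#   # clones maps a cluster number to its list of IgRecords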
+
+def distChen2010(records):
+    """
+    Calculate pairwise distances as defined in Chen 2010
+    
+    Arguments:
+    records = list of IgRecords where first is query to be compared to others in list
+    
+    Returns:
+    list of distances
+    """
+    # Pull out query sequence and V/J information (first record is the query)
+    query = records[0]
+    rest = records[1:]
+    query_cdr3 = query.junction[3:-3]
+    query_v_allele = query.getVAllele()
+    query_v_gene = query.getVGene()
+    query_v_family = query.getVFamily()
+    query_j_allele = query.getJAllele()
+    query_j_gene = query.getJGene()
+    # Create alignment scoring dictionary
+    score_dict = getDNAScoreDict()
+
+    scores = [0]*len(rest)
+    for i in range(len(rest)):
+        # Score the best global CDR3 alignment
+        ld = pairwise2.align.globalds(query_cdr3, rest[i].junction[3:-3],
+                                      score_dict, -1, -1, one_alignment_only=True)[0][2]
+        # Check V similarity
+        if rest[i].getVAllele() == query_v_allele: ld += 0
+        elif rest[i].getVGene() == query_v_gene: ld += 1
+        elif rest[i].getVFamily() == query_v_family: ld += 3
+        else: ld += 5
+        # Check J similarity
+        if rest[i].getJAllele() == query_j_allele: ld += 0
+        elif rest[i].getJGene() == query_j_gene: ld += 1
+        else: ld += 3
+        # Normalize by the longer CDR3 length
+        scores[i] = ld/max(len(rest[i].junction[3:-3]), len(query_cdr3))
+
+    return scores
+
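+# Illustrative arithmetic for the distance above: the CDR3 alignment term
+# plus a V penalty (same allele 0, same gene 1, same family 3, otherwise 5)
+# plus a J penalty (same allele 0, same gene 1, otherwise 3), normalized by
+# the longer CDR3 length. E.g. a V-family match (3) and a J-gene match (1)
+# add 4 to the alignment term before normalization.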
+
+def distAdemokun2011(records):
+    """
+    Calculate pairwise distances as defined in Ademokun 2011
+    
+    Arguments:
+    records = list of IgRecords where first is query to be compared to others in list
+    
+    Returns:
+    list of distances
+    """
+    # Pull out query sequence and V family information (first record is the query)
+    query = records[0]
+    rest = records[1:]
+    query_cdr3 = query.junction[3:-3]
+    query_v_family = query.getVFamily()
+    # Create alignment scoring dictionary
+    score_dict = getDNAScoreDict()
+
+    scores = [0]*len(rest)
+    for i in range(len(rest)):
+        if abs(len(query_cdr3) - len(rest[i].junction[3:-3])) > 10:
+            scores[i] = 1
+        elif query_v_family != rest[i].getVFamily():
+            scores[i] = 1
+        else:
+            # Score the best global CDR3 alignment
+            ld = pairwise2.align.globalds(query_cdr3, rest[i].junction[3:-3],
+                                          score_dict, -1, -1, one_alignment_only=True)[0][2]
+            # Normalize by the shorter CDR3 length
+            scores[i] = ld/min(len(rest[i].junction[3:-3]), len(query_cdr3))
+
+    return scores
+
+
+def hierClust(dist_mat, method='chen2010'):
+    """
+    Calculate hierarchical clustering
+    
+    Arguments:
+    dist_mat = square-formed distance matrix of pairwise CDR3 comparisons
+    method = clustering method; one of ('chen2010', 'ademokun2011')
+    
+    Returns:
+    list of cluster ids
+    """
+    if method == 'chen2010':
+        clusters = formClusters(dist_mat, 'average', 0.32)
+    elif method == 'ademokun2011':
+        clusters = formClusters(dist_mat, 'complete', 0.25)
+    else: clusters = np.ones(dist_mat.shape[0])
+        
+    return clusters
+
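+# The fixed cutoffs above (average linkage at 0.32 for chen2010, complete
+# linkage at 0.25 for ademokun2011) are tied to their respective distance
+# metrics. A hedged usage sketch:
+#
+#   clusters = hierClust(dist_mat, method='chen2010')
+#   # clusters[i] is the cluster id assigned to row i of dist_mat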
+# TODO:  Merge duplicate feed, process and collect functions.
+def feedQueue(alive, data_queue, db_file, group_func, group_args={}):
+    """
+    Feeds the data queue with Ig records
+
+    Arguments: 
+    alive = a multiprocessing.Value boolean controlling whether processing continues
+            if False exit process
+    data_queue = a multiprocessing.Queue to hold data for processing
+    db_file = the Ig record database file
+    group_func = the function to use for assigning preclones
+    group_args = a dictionary of arguments to pass to group_func
+    
+    Returns: 
+    None
+    """
+    # Open input file and perform grouping
+    try:
+        # Iterate over Ig records and assign groups
+        db_iter = readDbFile(db_file)
+        clone_dict = group_func(db_iter, **group_args)
+    except:
+        #sys.stderr.write('Exception in feeder grouping step\n')
+        alive.value = False
+        raise
+    
+    # Add groups to data queue
+    try:
+        #print 'START FEED', alive.value
+        # Iterate over groups and feed data queue
+        clone_iter = iter(clone_dict.items())
+        while alive.value:
+            # Get data from queue
+            if data_queue.full():  continue
+            else:  data = next(clone_iter, None)
+            # Exit upon reaching end of iterator
+            if data is None:  break
+            #print "FEED", alive.value, k
+            
+            # Feed queue
+            data_queue.put(DbData(*data))
+        else:
+            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
+                             % os.getpid())
+            return None
+    except:
+        #sys.stderr.write('Exception in feeder queue feeding step\n')
+        alive.value = False
+        raise
+
+    return None
+
+
+def feedQueueClust(alive, data_queue, db_file, group_func=None, group_args={}):
+    """
+    Feeds the data queue with Ig records
+
+    Arguments: 
+    alive = a multiprocessing.Value boolean controlling whether processing continues
+            if False exit process
+    data_queue = a multiprocessing.Queue to hold data for processing
+    db_file = the Ig record database file
+    
+    Returns: 
+    None
+    """
+    # Open input file and perform grouping
+    try:
+        # Iterate over Ig records and order by junction length
+        records = {}
+        db_iter = readDbFile(db_file)
+        for rec in db_iter:
+            records[rec.id] = rec
+        records = OrderedDict(sorted(list(records.items()), key=lambda i: i[1].junction_length))
+        dist_dict = {}
+        for __ in range(len(records)):
+            k, v = records.popitem(last=False)
+            # Pair each record with all remaining (longer-junction) records
+            dist_dict[k] = [v] + list(records.values())
+    except:
+        #sys.stderr.write('Exception in feeder grouping step\n')
+        alive.value = False
+        raise
+    
+    # Add groups to data queue
+    try:
+        # print 'START FEED', alive.value
+        # Iterate over groups and feed data queue
+        dist_iter = iter(dist_dict.items())
+        while alive.value:
+            # Get data from queue
+            if data_queue.full():  continue
+            else:  data = next(dist_iter, None)
+            # Exit upon reaching end of iterator
+            if data is None:  break
+            #print "FEED", alive.value, k
+            
+            # Feed queue
+            data_queue.put(DbData(*data))
+        else:
+            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
+                             % os.getpid())
+            return None
+    except:
+        #sys.stderr.write('Exception in feeder queue feeding step\n')
+        alive.value = False
+        raise
+
+    return None
+
+
+def processQueue(alive, data_queue, result_queue, clone_func, clone_args):
+    """
+    Pulls from data queue, performs calculations, and feeds results queue
+
+    Arguments: 
+    alive = a multiprocessing.Value boolean controlling whether processing continues
+            if False exit process
+    data_queue = a multiprocessing.Queue holding data to process
+    result_queue = a multiprocessing.Queue to hold processed results
+    clone_func = the function to call for clonal assignment
+    clone_args = a dictionary of arguments to pass to clone_func
+
+    Returns: 
+    None
+    """
+    try:
+        # Iterate over the data queue until the sentinel object is reached
+        while alive.value:
+            # Get data from queue
+            if data_queue.empty():  continue
+            else:  data = data_queue.get()
+            # Exit upon reaching sentinel
+            if data is None:  break
+
+            # Define result object for iteration and get data records
+            records = data.data
+            result = DbResult(data.id, records)
+
+            # Check for invalid data (due to failed indexing) and add failed result
+            if not data:
+                result_queue.put(result)
+                continue
+
+            # Add V(D)J to log
+            result.log['ID'] = ','.join([str(x) for x in data.id])
+            result.log['VALLELE'] = ','.join(set([(r.getVAllele() or '') for r in records]))
+            result.log['DALLELE'] = ','.join(set([(r.getDAllele() or '') for r in records]))
+            result.log['JALLELE'] = ','.join(set([(r.getJAllele() or '') for r in records]))
+            result.log['JUNCLEN'] = ','.join(set([(str(len(r.junction)) or '0') for r in records]))
+            result.log['SEQUENCES'] = len(records)
+             
+            # Check for preclone failure and assign clones
+            clones = clone_func(records, **clone_args) if data else None
+
+            # import cProfile
+            # prof = cProfile.Profile()
+            # clones = prof.runcall(clone_func, records, **clone_args)
+            # prof.dump_stats('worker-%d.prof' % os.getpid())
+
+            if clones is not None:
+                result.results = clones
+                result.valid = True
+                result.log['CLONES'] = len(clones)
+            else:
+                result.log['CLONES'] = 0
+  
+            # Feed results to result queue
+            result_queue.put(result)
+        else:
+            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
+                             % os.getpid())
+            return None
+    except:
+        #sys.stderr.write('Exception in worker\n')
+        alive.value = False
+        raise
+    
+    return None
+
+
+def processQueueClust(alive, data_queue, result_queue, clone_func, clone_args):
+    """
+    Pulls from data queue, performs calculations, and feeds results queue
+
+    Arguments: 
+    alive = a multiprocessing.Value boolean controlling whether processing continues
+            if False exit process
+    data_queue = a multiprocessing.Queue holding data to process
+    result_queue = a multiprocessing.Queue to hold processed results
+    clone_func = the function to call for calculating pairwise distances between sequences
+    clone_args = a dictionary of arguments to pass to clone_func
+
+    Returns: 
+    None
+    """
+    
+    try:
+        # print 'START WORK', alive.value
+        # Iterate over the data queue until the sentinel object is reached
+        while alive.value:
+            # Get data from queue
+            if data_queue.empty():  continue
+            else:  data = data_queue.get()
+            # Exit upon reaching sentinel
+            if data is None:  break
+            # print "WORK", alive.value, data['id']
+
+            # Define result object for iteration and get data records
+            records = data.data
+            result = DbResult(data.id, records)
+             
+            # Create row of distance matrix and check for error
+            dist_row = clone_func(records, **clone_args) if data else None
+            if dist_row is not None:
+                result.results = dist_row
+                result.valid = True
+  
+            # Feed results to result queue
+            result_queue.put(result)
+        else:
+            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
+                             % os.getpid())
+            return None
+    except:
+        #sys.stderr.write('Exception in worker\n')
+        alive.value = False
+        raise
+    
+    return None
+
+
+def collectQueue(alive, result_queue, collect_queue, db_file, out_args, cluster_func=None, cluster_args={}):
+    """
+    Assembles results from a queue of individual sequence results and manages log/file I/O
+
+    Arguments: 
+    alive = a multiprocessing.Value boolean controlling whether processing continues
+            if False exit process
+    result_queue = a multiprocessing.Queue holding processQueue results
+    collect_queue = a multiprocessing.Queue to store collector return values
+    db_file = the input database file name
+    out_args = common output argument dictionary from parseCommonArgs
+    cluster_func = the function to call for carrying out clustering on distance matrix
+    cluster_args = a dictionary of arguments to pass to cluster_func
+    
+    Returns: 
+    None
+    (adds 'log' and 'out_files' to collect_dict)
+    """
+    # Open output files
+    try:
+        # Count records and define output format 
+        out_type = getFileType(db_file) if out_args['out_type'] is None \
+                   else out_args['out_type']
+        result_count = countDbFile(db_file)
+        
+        # Define successful output handle
+        pass_handle = getOutputHandle(db_file, 
+                                      out_label='clone-pass', 
+                                      out_dir=out_args['out_dir'], 
+                                      out_name=out_args['out_name'], 
+                                      out_type=out_type)
+        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')
+        
+        # Define failed alignment output handle
+        if out_args['failed']:
+            fail_handle = getOutputHandle(db_file,
+                                          out_label='clone-fail', 
+                                          out_dir=out_args['out_dir'], 
+                                          out_name=out_args['out_name'], 
+                                          out_type=out_type)
+            fail_writer = getDbWriter(fail_handle, db_file)
+        else:
+            fail_handle = None
+            fail_writer = None
+
+        # Define log handle
+        if out_args['log_file'] is None:  
+            log_handle = None
+        else:  
+            log_handle = open(out_args['log_file'], 'w')
+    except:
+        #sys.stderr.write('Exception in collector file opening step\n')
+        alive.value = False
+        raise
+
+    # Get results from queue and write to files
+    try:
+        #print 'START COLLECT', alive.value
+        # Iterate over the results queue until the sentinel object is reached
+        start_time = time()
+        rec_count = clone_count = pass_count = fail_count = 0
+        while alive.value:
+            # Get result from queue
+            if result_queue.empty():  continue
+            else:  result = result_queue.get()
+            # Exit upon reaching sentinel
+            if result is None:  break
+            #print "COLLECT", alive.value, result['id']
+            
+            # Print progress for previous iteration and update record count
+            if rec_count == 0:
+                print('PROGRESS> Assigning clones')
+            printProgress(rec_count, result_count, 0.05, start_time) 
+            rec_count += len(result.data)
+            
+            # Write passed and failed records
+            if result:
+                for clone in result.results.values():
+                    clone_count += 1
+                    for i, rec in enumerate(clone):
+                        rec.annotations['CLONE'] = clone_count
+                        pass_writer.writerow(rec.toDict())
+                        pass_count += 1
+                        result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction)
+    
+            else:
+                for i, rec in enumerate(result.data):
+                    if fail_writer is not None: fail_writer.writerow(rec.toDict())
+                    fail_count += 1
+                    result.log['CLONE0-%i' % (i + 1)] = str(rec.junction)
+                    
+            # Write log
+            printLog(result.log, handle=log_handle)
+        else:
+            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
+                             % os.getpid())
+            return None
+        
+        # Print total counts
+        printProgress(rec_count, result_count, 0.05, start_time)
+
+        # Close file handles
+        pass_handle.close()
+        if fail_handle is not None:  fail_handle.close()
+        if log_handle is not None:  log_handle.close()
+                
+        # Update return list
+        log = OrderedDict()
+        log['OUTPUT'] = os.path.basename(pass_handle.name)
+        log['CLONES'] = clone_count
+        log['RECORDS'] = rec_count
+        log['PASS'] = pass_count
+        log['FAIL'] = fail_count
+        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
+        collect_queue.put(collect_dict)
+    except:
+        #sys.stderr.write('Exception in collector result processing step\n')
+        alive.value = False
+        raise
+
+    return None
+
+
+def collectQueueClust(alive, result_queue, collect_queue, db_file, out_args, cluster_func, cluster_args):
+    """
+    Assembles results from a queue of individual sequence results and manages log/file I/O
+
+    Arguments: 
+    alive = a multiprocessing.Value boolean controlling whether processing continues
+            if False exit process
+    result_queue = a multiprocessing.Queue holding processQueue results
+    collect_queue = a multiprocessing.Queue to store collector return values
+    db_file = the input database file name
+    out_args = common output argument dictionary from parseCommonArgs
+    cluster_func = the function to call for carrying out clustering on distance matrix
+    cluster_args = a dictionary of arguments to pass to cluster_func
+    
+    Returns: 
+    None
+    (adds 'log' and 'out_files' to collect_dict)
+    """
+    # Open output files
+    try:
+               
+        # Iterate over Ig records to count and order by junction length
+        result_count = 0
+        records = {}
+        # print 'Reading file...'
+        db_iter = readDbFile(db_file)
+        for rec in db_iter:
+            records[rec.id] = rec
+            result_count += 1
+        records = OrderedDict(sorted(list(records.items()), key=lambda i: i[1].junction_length))
+                
+        # Define empty matrix to store assembled results
+        dist_mat = np.zeros((result_count,result_count))
+        
+        # Count records and define output format 
+        out_type = getFileType(db_file) if out_args['out_type'] is None \
+                   else out_args['out_type']
+                   
+        # Define successful output handle
+        pass_handle = getOutputHandle(db_file, 
+                                      out_label='clone-pass', 
+                                      out_dir=out_args['out_dir'], 
+                                      out_name=out_args['out_name'], 
+                                      out_type=out_type)
+        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')
+        
+        # Define failed cloning output handle
+        if out_args['failed']:
+            fail_handle = getOutputHandle(db_file,
+                                          out_label='clone-fail', 
+                                          out_dir=out_args['out_dir'], 
+                                          out_name=out_args['out_name'], 
+                                          out_type=out_type)
+            fail_writer = getDbWriter(fail_handle, db_file)
+        else:
+            fail_handle = None
+            fail_writer = None
+
+        # Open log file
+        if out_args['log_file'] is None:
+            log_handle = None
+        else:
+            log_handle = open(out_args['log_file'], 'w')
+    except:
+        alive.value = False
+        raise
+    
+    try:
+        # Iterate over the results queue until the sentinel object is reached
+        start_time = time()
+        row_count = rec_count = 0
+        while alive.value:
+            # Get result from queue
+            if result_queue.empty():  continue
+            else:  result = result_queue.get()
+            # Exit upon reaching sentinel
+            if result is None:  break
+
+            # Print progress for previous iteration
+            if row_count == 0:
+                print('PROGRESS> Assigning clones')
+            printProgress(row_count, result_count, 0.05, start_time)
+            
+            # Update counts for iteration
+            row_count += 1
+            rec_count += len(result)
+            
+            # Add result row to distance matrix
+            if result:
+                dist_mat[list(range(result_count-len(result),result_count)),result_count-len(result)] = result.results
+                
+        else:
+            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
+                             % os.getpid())
+            return None    
+        
+        # Calculate linkage and carry out clustering
+        # print dist_mat
+        clusters = cluster_func(dist_mat, **cluster_args) if dist_mat is not None else None
+        clones = {}
+        # print clusters
+        for i, c in enumerate(clusters):
+            clones.setdefault(c, []).append(records[list(records.keys())[i]])
+        
+        # Write passed and failed records
+        clone_count = pass_count = fail_count = 0
+        if clones:
+            for clone in clones.values():
+                clone_count += 1
+                for i, rec in enumerate(clone):
+                    rec.annotations['CLONE'] = clone_count
+                    pass_writer.writerow(rec.toDict())
+                    pass_count += 1
+                    #result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction)
+
+        else:
+            for i, rec in enumerate(result.data):
+                if fail_writer is not None: fail_writer.writerow(rec.toDict())
+                fail_count += 1
+                #result.log['CLONE0-%i' % (i + 1)] = str(rec.junction)
+        
+        # Print final progress
+        printProgress(row_count, result_count, 0.05, start_time)
+    
+        # Close file handles
+        pass_handle.close()
+        if fail_handle is not None:  fail_handle.close()
+        if log_handle is not None:  log_handle.close()
+                
+        # Update return list
+        log = OrderedDict()
+        log['OUTPUT'] = os.path.basename(pass_handle.name)
+        log['CLONES'] = clone_count
+        log['RECORDS'] = rec_count
+        log['PASS'] = pass_count
+        log['FAIL'] = fail_count
+        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
+        collect_queue.put(collect_dict)
+    except:
+        alive.value = False
+        raise
+    
+    return None
+
+
+def defineClones(db_file, feed_func, work_func, collect_func, clone_func, cluster_func=None,
+                 group_func=None, group_args={}, clone_args={}, cluster_args={}, 
+                 out_args=default_out_args, nproc=None, queue_size=None):
+    """
+    Define clonally related sequences
+    
+    Arguments:
+    db_file = filename of input database
+    feed_func = the function that feeds the queue
+    work_func = the worker function that will run on each CPU
+    collect_func = the function that collects results from the workers
+    group_func = the function to use for assigning preclones
+    clone_func = the function to use for determining clones within preclonal groups
+    group_args = a dictionary of arguments to pass to group_func
+    clone_args = a dictionary of arguments to pass to clone_func
+    cluster_func = the function to use for clustering the distance matrix (hclust mode)
+    cluster_args = a dictionary of arguments to pass to cluster_func
+    out_args = common output argument dictionary from parseCommonArgs
+    nproc = the number of processQueue processes;
+            if None defaults to the number of CPUs
+    queue_size = maximum size of the argument queue;
+                 if None defaults to 2*nproc    
+    
+    Returns:
+    a list of successful output file names
+    """
+    # Print parameter info
+    log = OrderedDict()
+    log['START'] = 'DefineClones'
+    log['DB_FILE'] = os.path.basename(db_file)
+    if group_func is not None:
+        log['GROUP_FUNC'] = group_func.__name__
+        log['GROUP_ARGS'] = group_args
+    log['CLONE_FUNC'] = clone_func.__name__
+
+    # TODO:  this is yucky, but can be fixed by using a model class
+    clone_log = clone_args.copy()
+    if 'dist_mat' in clone_log:  del clone_log['dist_mat']
+    log['CLONE_ARGS'] = clone_log
+
+    if cluster_func is not None:
+        log['CLUSTER_FUNC'] = cluster_func.__name__
+        log['CLUSTER_ARGS'] = cluster_args
+    log['NPROC'] = nproc
+    printLog(log)
+    
+    # Define feeder function and arguments
+    feed_args = {'db_file': db_file,
+                 'group_func': group_func, 
+                 'group_args': group_args}
+    # Define worker function and arguments
+    work_args = {'clone_func': clone_func, 
+                 'clone_args': clone_args}
+    # Define collector function and arguments
+    collect_args = {'db_file': db_file,
+                    'out_args': out_args,
+                    'cluster_func': cluster_func,
+                    'cluster_args': cluster_args}
+    
+    # Call process manager
+    result = manageProcesses(feed_func, work_func, collect_func, 
+                             feed_args, work_args, collect_args, 
+                             nproc, queue_size)
+        
+    # Print log
+    result['log']['END'] = 'DefineClones'
+    printLog(result['log'])
+    
+    return result['out_files']
+
+
+def getArgParser():
+    """
+    Defines the ArgumentParser
+
+    Arguments: 
+    None
+                      
+    Returns: 
+    an ArgumentParser object
+    """
+    # Define input and output fields
+    fields = dedent(
+             '''
+             output files:
+                 clone-pass
+                     database with assigned clonal group numbers.
+                 clone-fail
+                     database with records failing clonal grouping.
+
+             required fields:
+                 SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION_LENGTH
+
+                 <field>
+                     sequence field specified by the --sf parameter
+                
+             output fields:
+                 CLONE
+              ''')
+
+    # Define ArgumentParser
+    parser = ArgumentParser(description=__doc__, epilog=fields,
+                            formatter_class=CommonHelpFormatter)
+    parser.add_argument('--version', action='version',
+                        version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
+    subparsers = parser.add_subparsers(title='subcommands', dest='command', metavar='',
+                                       help='Cloning method')
+    # TODO:  This is a temporary fix for Python issue 9253
+    subparsers.required = True
+    
+    # Parent parser    
+    parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True, 
+                                       multiproc=True)
+    
+    # Distance cloning method
+    parser_bygroup = subparsers.add_parser('bygroup', parents=[parser_parent],
+                                        formatter_class=CommonHelpFormatter,
+                                        help='''Defines clones as having same V assignment,
+                                              J assignment, and junction length with
+                                              specified substitution distance model.''')
+    parser_bygroup.add_argument('-f', nargs='+', action='store', dest='fields', default=None,
+                             help='Additional fields to use for grouping clones (non VDJ)')
+    parser_bygroup.add_argument('--mode', action='store', dest='mode', 
+                             choices=('allele', 'gene'), default='gene', 
+                             help='''Specifies whether to use the V(D)J allele or gene for
+                                  initial grouping.''')
+    parser_bygroup.add_argument('--act', action='store', dest='action', default='set',
+                             choices=('first', 'set'),
+                             help='''Specifies how to handle multiple V(D)J assignments
+                                  for initial grouping.''')
+    parser_bygroup.add_argument('--model', action='store', dest='model', 
+                             choices=('aa', 'ham', 'm1n', 'hs1f', 'hs5f'),
+                             default=default_bygroup_model,
+                             help='''Specifies which substitution model to use for
+                                  calculating distance between sequences. Where m1n is the
+                                  mouse single nucleotide transition/transversion model
+                                  of Smith et al, 1996; hs1f is the human single
+                                  nucleotide model derived from Yaari et al, 2013; hs5f
+                                  is the human S5F model of Yaari et al, 2013; ham is
+                                  nucleotide Hamming distance; and aa is amino acid
+                                  Hamming distance. The hs5f data should be
+                                  considered experimental.''')
+    parser_bygroup.add_argument('--dist', action='store', dest='distance', type=float, 
+                             default=default_distance,
+                             help='The distance threshold for clonal grouping')
+    parser_bygroup.add_argument('--norm', action='store', dest='norm',
+                             choices=('len', 'mut', 'none'), default=default_norm,
+                             help='''Specifies how to normalize distances. One of none
+                                  (do not normalize), len (normalize by length),
+                                  or mut (normalize by number of mutations between sequences).''')
+    parser_bygroup.add_argument('--sym', action='store', dest='sym',
+                             choices=('avg', 'min'), default=default_sym,
+                             help='''Specifies how to combine asymmetric distances. One of avg
+                                  (average of A->B and B->A) or min (minimum of A->B and B->A).''')
+    parser_bygroup.add_argument('--link', action='store', dest='linkage',
+                             choices=('single', 'average', 'complete'), default=default_linkage,
+                             help='''Type of linkage to use for hierarchical clustering.''')
+    parser_bygroup.add_argument('--sf', action='store', dest='seq_field',
+                                default=default_seq_field,
+                                help='''The name of the field to be used to calculate
+                                     distance between records''')
+    parser_bygroup.set_defaults(feed_func=feedQueue)
+    parser_bygroup.set_defaults(work_func=processQueue)
+    parser_bygroup.set_defaults(collect_func=collectQueue)  
+    parser_bygroup.set_defaults(group_func=indexJunctions)  
+    parser_bygroup.set_defaults(clone_func=distanceClones)
+    
+    
+    # Hierarchical clustering cloning method
+    parser_hclust = subparsers.add_parser('hclust', parents=[parser_parent],
+                                        formatter_class=CommonHelpFormatter,
+                                        help='Defines clones by a specified distance metric on CDR3s and \
+                                              cutting of the hierarchical clustering tree')
+#     parser_hclust.add_argument('-f', nargs='+', action='store', dest='fields', default=None,
+#                              help='Fields to use for grouping clones (non VDJ)')
+    parser_hclust.add_argument('--method', action='store', dest='method', 
+                             choices=('chen2010', 'ademokun2011'), default=default_hclust_model, 
+                             help='Specifies which cloning method to use for calculating distance \
+                                   between CDR3s, computing linkage, and cutting clusters')
+    parser_hclust.set_defaults(feed_func=feedQueueClust)
+    parser_hclust.set_defaults(work_func=processQueueClust)
+    parser_hclust.set_defaults(collect_func=collectQueueClust)
+    parser_hclust.set_defaults(cluster_func=hierClust)
+        
+    return parser
+
+
+if __name__ == '__main__':
+    """
+    Parses command line arguments and calls main function
+    """
+    # Parse arguments
+    parser = getArgParser()
+    args = parser.parse_args()
+    args_dict = parseCommonArgs(args)
+    # Convert case of fields
+    if 'seq_field' in args_dict:
+        args_dict['seq_field'] = args_dict['seq_field'].upper()
+    if 'fields' in args_dict and args_dict['fields'] is not None:  
+        args_dict['fields'] = [f.upper() for f in args_dict['fields']]
+    
+    # Define clone_args
+    if args.command == 'bygroup':
+        args_dict['group_args'] = {'fields': args_dict['fields'],
+                                   'action': args_dict['action'], 
+                                   'mode':args_dict['mode']}
+        args_dict['clone_args'] = {'model':  args_dict['model'],
+                                   'distance':  args_dict['distance'],
+                                   'norm': args_dict['norm'],
+                                   'sym': args_dict['sym'],
+                                   'linkage': args_dict['linkage'],
+                                   'seq_field': args_dict['seq_field']}
+
+        # TODO:  can be cleaned up with abstract model class
+        args_dict['clone_args']['dist_mat'] = getModelMatrix(args_dict['model'])
+
+        del args_dict['fields']
+        del args_dict['action']
+        del args_dict['mode']
+        del args_dict['model']
+        del args_dict['distance']
+        del args_dict['norm']
+        del args_dict['sym']
+        del args_dict['linkage']
+        del args_dict['seq_field']
+
+    # Define clone_args
+    if args.command == 'hclust':
+        dist_funcs = {'chen2010':distChen2010, 'ademokun2011':distAdemokun2011}
+        args_dict['clone_func'] = dist_funcs[args_dict['method']]
+        args_dict['cluster_args'] = {'method':  args_dict['method']}
+        #del args_dict['fields']
+        del args_dict['method']
+    
+    # Call defineClones
+    del args_dict['command']
+    del args_dict['db_files']
+    for f in args.__dict__['db_files']:
+        args_dict['db_file'] = f
+        defineClones(**args_dict)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/MakeDb.py	Tue Aug 16 09:10:50 2016 -0400
@@ -0,0 +1,1025 @@
+#!/usr/bin/env python3
+"""
+Create tab-delimited database file to store sequence alignment information
+"""
+# Info
+__author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
+from changeo import __version__, __date__
+
+# Imports
+import csv
+import os
+import re
+import sys
+import pandas as pd
+import tarfile
+import zipfile
+from argparse import ArgumentParser
+from collections import OrderedDict
+from itertools import groupby
+from shutil import rmtree
+from tempfile import mkdtemp
+from textwrap import dedent
+from time import time
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+
+# Presto and changeo imports
+from presto.Defaults import default_out_args
+from presto.Annotation import parseAnnotation
+from presto.IO import countSeqFile, printLog, printProgress
+from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
+from changeo.IO import getDbWriter, countDbFile, getRepo
+from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \
+                             j_allele_regex
+
+# Default parameters
+default_delimiter = ('\t', ',', '-')
+
+
+def gapV(ig_dict, repo_dict):
+    """
+    Insert gaps into V region and update alignment information
+
+    Arguments:
+      ig_dict : Dictionary of parsed IgBlast output
+      repo_dict : Dictionary of IMGT gapped germline sequences
+
+    Returns:
+      dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields
+    """
+
+    seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ']
+
+    # Find gapped germline V segment
+    vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first')
+    vkey = (vgene, )
+    #TODO: Figure out else case
+    if vkey in repo_dict:
+        vgap = repo_dict[vkey]
+        # Iterate over gaps in the germline segment
+        gaps = re.finditer(r'\.', vgap)
+        gapcount = int(ig_dict['V_GERM_START_VDJ'])-1
+        for gap in gaps:
+            i = gap.start()
+            # Break if gap begins after V region
+            if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount:
+                break
+            # Insert gap into IMGT sequence
+            seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]
+            # Update gap counter
+            gapcount += 1
+        ig_dict['SEQUENCE_IMGT'] = seq_imgt
+        # Update IMGT positioning information for V
+        ig_dict['V_GERM_START_IMGT'] = 1
+        ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount
+
+    return ig_dict
+
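+# Illustrative toy example of the gap insertion above (sequences are
+# hypothetical): if the IMGT-gapped germline is 'ACGT..ACG' and the query
+# aligns from germline position 1, the two germline gap characters are
+# inserted into the query at the same offsets, so 'ACGTACGNN' becomes
+# 'ACGT..ACGNN' and V_GERM_LENGTH_IMGT grows by the two gaps counted.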
+
+def getIMGTJunc(ig_dict, repo_dict):
+    """
+    Identify junction region by IMGT definition
+
+    Arguments:
+      ig_dict : Dictionary of parsed IgBlast output
+      repo_dict : Dictionary of IMGT gapped germline sequences
+
+    Returns:
+      dict : Updated with JUNCTION and JUNCTION_LENGTH fields
+    """
+    # Find germline J segment
+    jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first')
+    jkey = (jgene, )
+    #TODO: Figure out else case
+    if jkey in repo_dict:
+        # Get germline J sequence
+        jgerm = repo_dict[jkey]
+        jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1]
+        # Look for (F|W)GXG aa motif in nt sequence
+        motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm)
+        aa_end = len(ig_dict['SEQUENCE_IMGT'])
+        #TODO: Figure out else case
+        if motif:
+            # print('\n', motif.group())
+            aa_end = motif.start() - len(jgerm) + 3
+        # Add fields to dict
+        ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end]
+        ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION'])
+
+    return ig_dict
+
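+# Note on the constants above: in IMGT unique numbering the junction begins
+# at nucleotide position 310 (the conserved 2nd-CYS, codon 104), hence the
+# SEQUENCE_IMGT[309:aa_end] slice; aa_end is a negative offset that locates
+# the conserved J-TRP/J-PHE motif relative to the end of the J germline.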
+
+def getRegions(ig_dict):
+    """
+    Identify FWR and CDR regions by IMGT definition
+
+    Arguments:
+      ig_dict : Dictionary of parsed alignment output
+
+    Returns:
+      dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
+             CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields
+    """
+    try:
+        seq_len = len(ig_dict['SEQUENCE_IMGT'])
+        ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)]
+    except (KeyError, IndexError):
+        return ig_dict
+
+    try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)]
+    except (IndexError): return ig_dict
+
+    try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)]
+    except (IndexError): return ig_dict
+
+    try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)]
+    except (IndexError): return ig_dict
+
+    try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)]
+    except (IndexError): return ig_dict
+
+    try:
+        cdr3_end = 306 + ig_dict['JUNCTION_LENGTH']
+        ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end]
+        ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:]
+    except (KeyError, IndexError):
+        return ig_dict
+
+    return ig_dict
+
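+# The slice boundaries above correspond to the IMGT unique numbering regions
+# (1-based nucleotide positions):
+#   FWR1 1-78, CDR1 79-114, FWR2 115-165, CDR2 166-195, FWR3 196-312,
+#   CDR3 from 313 to the junction-derived end, FWR4 thereafter.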
+
+def getSeqforIgBlast(seq_file):
+    """
+    Fetch input sequences for IgBlast queries
+
+    Arguments:
+    seq_file = a fasta file of sequences input to IgBlast
+
+    Returns:
+    a dictionary of {ID:Seq}
+    """
+
+    seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna)
+
+    # Build a simple {description: sequence string} translation of seq_dict
+    seqs = {}
+    for seq in seq_dict.values():
+        seqs.update({seq.description:str(seq.seq)})
+
+    return seqs
+
+
+def findLine(handle, query):
+    """
+    Finds line with query string in file
+
+    Arguments:
+    handle = file handle in which to search for line
+    query = query string for which to search in file
+
+    Returns:
+    line from handle in which query string was found
+    """
+    for line in handle:
+        if re.match(query, line):
+            return line
+
+
+def extractIMGT(imgt_output):
+    """
+    Extract necessary files from IMGT results, zipped or unzipped
+    
+    Arguments:
+    imgt_output = zipped file or unzipped folder output by IMGT
+    
+    Returns:
+    a tuple of (temporary directory, sorted list of filenames from which information will be read)
+    """
+    #file_ext = os.path.splitext(imgt_output)[1].lower()
+    imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction')
+    temp_dir = mkdtemp()
+    if zipfile.is_zipfile(imgt_output):
+        # Open zip file
+        imgt_zip = zipfile.ZipFile(imgt_output, 'r')
+        # Extract required files
+        imgt_files = sorted([n for n in imgt_zip.namelist() \
+                             if os.path.basename(n).startswith(imgt_flags)])
+        imgt_zip.extractall(temp_dir, imgt_files)
+        # Define file list
+        imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
+    elif os.path.isdir(imgt_output):
+        # Find required files in folder
+        folder_files = []
+        for root, dirs, files in os.walk(imgt_output):
+            folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files])
+        # Define file list
+        imgt_files = sorted([n for n in folder_files \
+                             if os.path.basename(n).startswith(imgt_flags)])
+    elif tarfile.is_tarfile(imgt_output):
+        # Open tar file
+        imgt_tar = tarfile.open(imgt_output, 'r')
+        # Extract required files
+        imgt_files = sorted([n for n in imgt_tar.getnames() \
+                             if os.path.basename(n).startswith(imgt_flags)])
+        imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files])
+        # Define file list
+        imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
+    else:
+        sys.exit('ERROR: Unsupported IMGT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.')
+    
+    if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files
+        sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output)
+    elif len(imgt_files) < len(imgt_flags):
+        sys.exit('ERROR: Missing necessary file in IMGT output %s.' % imgt_output)
+        
+    return temp_dir, imgt_files
+
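+# A hedged usage sketch (path is hypothetical); callers are responsible for
+# removing the returned temporary directory once the files have been read:
+#
+#   temp_dir, imgt_files = extractIMGT('imgt_results.zip')
+#   try:
+#       pass  # parse the four extracted IMGT files
+#   finally:
+#       rmtree(temp_dir)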
+
+# TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks
+def readOneIgBlastResult(block):
+    """
+    Parse a single IgBLAST query result
+
+    Arguments:
+    block =  itertools groupby object of single result
+
+    Returns:
+    None if no results, otherwise list of DataFrames for each result block
+    """
+    results = list()
+    i = 0
+    for match, subblock in groupby(block, lambda l: l=='\n'):
+        if not match:
+            # Strip whitespace and comments
+            sub = [s.strip() for s in subblock if not s.startswith('#')]
+
+            # Continue on empty block
+            if not sub:  continue
+            else:  i += 1
+
+            # Split by tabs
+            sub = [s.split('\t') for s in sub]
+
+            # Append list for "V-(D)-J rearrangement summary" (i == 1)
+            # And "V-(D)-J junction details" (i == 2)
+            # Otherwise append DataFrame of subblock
+            if i == 1 or i == 2:
+                results.append(sub[0])
+            else:
+                df = pd.DataFrame(sub)
+                if not df.empty: results.append(df)
+
+    return results if results else None
+
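+# For reference, a single IgBLAST '-outfmt 7' query result is a sequence of
+# comment-delimited blocks, roughly:
+#   # Query: <id>
+#   V-(D)-J rearrangement summary  -> parsed as a single row (i == 1)
+#   V-(D)-J junction details       -> parsed as a single row (i == 2)
+#   alignment summary / hit table  -> parsed as DataFrames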
+
+# TODO:  needs optimization; pandas parsing overhead is the likely bottleneck.
+def readIgBlast(igblast_output, seq_dict, repo_dict,
+                score_fields=False, region_fields=False):
+    """
+    Reads IgBlast output
+
+    Arguments:
+    igblast_output = IgBlast output file (format 7)
+    seq_dict = a dictionary of {ID:Seq} from input fasta file
+    repo_dict = dictionary of IMGT gapped germline sequences
+    score_fields = if True parse alignment scores
+    region_fields = if True add FWR and CDR region fields
+
+    Returns:
+    a generator of dictionaries containing alignment data
+    """
+
+    # Open IgBlast output file
+    with open(igblast_output) as f:
+        # Iterate over individual results (separated by # IGBLASTN)
+        for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)):
+            block = list(block)
+            if not k1:
+                # TODO: move query name extraction into block parser readOneIgBlastResult().
+                # Extract sequence ID
+                query_name = ' '.join(block[0].strip().split(' ')[2:])
+                # Initialize db_gen to have ID and input sequence
+                db_gen = {'SEQUENCE_ID':     query_name,
+                          'SEQUENCE_INPUT':  seq_dict[query_name]}
+
+                # Parse further sub-blocks
+                block_list = readOneIgBlastResult(block)
+
+                # TODO: this is indented pretty far.  should be a separate function. or several functions.
+                # If results exist, parse further to obtain full db_gen
+                if block_list is not None:
+                    # Parse quality information
+                    db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F'
+                    db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F'
+                    db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F'
+                    if block_list[0][-1] == '-':
+                        db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'],
+                                                           IUPAC.ambiguous_dna).reverse_complement())
+
+                    # Parse V, D, and J calls
+                    call_str = ' '.join(block_list[0])
+                    v_call = parseAllele(call_str, v_allele_regex, action='list')
+                    d_call = parseAllele(call_str, d_allele_regex, action='list')
+                    j_call = parseAllele(call_str, j_allele_regex, action='list')
+                    db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None'
+                    db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None'
+                    db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None'
+
+                    # Parse junction sequence
+                    # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1]))
+                    # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ'])
+
+                    # TODO:  IgBLAST sometimes fails to output block #3 (junction details); unclear why.
+                    # TODO:  consider failing such records, as their results look unreliable.
+                    #pd.set_option('display.width', 500)
+                    #print query_name, len(block_list), hit_idx
+                    #for i, x in enumerate(block_list):
+                    #    print '[%i]' % i
+                    #    print x
+
+                    # Parse segment start and stop positions
+                    hit_df = block_list[-1]
+
+                    # Alignment info block
+                    #  0:  segment
+                    #  1:  query id
+                    #  2:  subject id
+                    #  3:  % identity
+                    #  4:  alignment length
+                    #  5:  mismatches
+                    #  6:  gap opens
+                    #  7:  gaps
+                    #  8:  q. start
+                    #  9:  q. end
+                    # 10:  s. start
+                    # 11:  s. end
+                    # 12:  evalue
+                    # 13:  bit score
+                    # 14:  query seq
+                    # 15:  subject seq
+                    # 16:  btop
+
+                    # If V call exists, parse V alignment information
+                    seq_vdj = ''
+                    if v_call is not None:
+                        v_align = hit_df[hit_df[0] == 'V'].iloc[0]
+                        # Germline positions
+                        db_gen['V_GERM_START_VDJ'] = int(v_align[10])
+                        db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1
+                        # Query sequence positions
+                        db_gen['V_SEQ_START'] = int(v_align[8])
+                        db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1
+
+                        if int(v_align[6]) == 0:
+                            db_gen['INDELS'] = 'F'
+                        else:
+                            db_gen['INDELS'] = 'T'
+                            # Set functional to none so record gets tossed (junction will be wrong)
+                            # db_gen['FUNCTIONAL'] = None
+
+                        # V alignment scores
+                        if score_fields:
+                            try: db_gen['V_SCORE'] = float(v_align[13])
+                            except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'
+
+                            try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0
+                            except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'
+
+                            try: db_gen['V_EVALUE'] = float(v_align[12])
+                            except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None'
+
+                            try: db_gen['V_BTOP'] = v_align[16]
+                            except (TypeError, ValueError): db_gen['V_BTOP'] = 'None'
+
+                        # Update VDJ sequence, removing insertions
+                        start = 0
+                        for m in re.finditer(r'-', v_align[15]):
+                            ins = m.start()
+                            seq_vdj += v_align[14][start:ins]
+                            start = ins + 1
+                        seq_vdj += v_align[14][start:]
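+                        # The loop above skips query bases aligned to subject
+                        # gaps; e.g. (illustrative) subject 'AC-GT' against
+                        # query 'ACTGT' appends 'ACGT' to seq_vdj, dropping
+                        # the inserted 'T'.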
+
+                    # TODO:  needs to check that the V results are present before trying to determine N1_LENGTH from them.
+                    # If D call exists, parse D alignment information
+                    if d_call is not None:
+                        d_align = hit_df[hit_df[0] == 'D'].iloc[0]
+
+                        # TODO:  this overlap correction is awkward, but a cleaner fix for the alignment overlap problem is not obvious.
+                        # Determine N-region length and amount of J overlap with V or D alignment
+                        overlap = 0
+                        if v_call is not None:
+                            n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
+                            if n1_len < 0:
+                                db_gen['N1_LENGTH'] = 0
+                                overlap = abs(n1_len)
+                            else:
+                                db_gen['N1_LENGTH'] = n1_len
+                                n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1)
+                                n1_end = int(d_align[8])-1
+                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
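+                            # Worked example (illustrative values): with
+                            # V_SEQ_START=1 and V_SEQ_LENGTH=296, a D hit at
+                            # q. start=300 gives n1_len = 300 - 297 = 3, so
+                            # query positions 297-299 form the N1 region; a D
+                            # hit at q. start=295 gives n1_len=-2, hence
+                            # N1_LENGTH=0 and overlap=2.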
+
+                        # Query sequence positions
+                        db_gen['D_SEQ_START'] = int(d_align[8]) + overlap
+                        db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0)
+
+                        # Germline positions
+                        db_gen['D_GERM_START'] = int(d_align[10]) + overlap
+                        db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0)
+
+                        # Update VDJ sequence, removing insertions
+                        start = overlap
+                        for m in re.finditer(r'-', d_align[15]):
+                            ins = m.start()
+                            seq_vdj += d_align[14][start:ins]
+                            start = ins + 1
+                        seq_vdj += d_align[14][start:]
+
+                    # TODO:  needs to check that the V results are present before trying to determine N1_LENGTH from them.
+                    # If J call exists, parse J alignment information
+                    if j_call is not None:
+                        j_align = hit_df[hit_df[0] == 'J'].iloc[0]
+
+                        # TODO:  this overlap correction is awkward, but a cleaner fix for the alignment overlap problem is not obvious.
+                        # Determine N-region length and amount of J overlap with V or D alignment
+                        overlap = 0
+                        if d_call is not None:
+                            n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH'])
+                            if n2_len < 0:
+                                db_gen['N2_LENGTH'] = 0
+                                overlap = abs(n2_len)
+                            else:
+                                db_gen['N2_LENGTH'] = n2_len
+                                n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1)
+                                n2_end = int(j_align[8])-1
+                                seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end]
+                        elif v_call is not None:
+                            n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
+                            if n1_len < 0:
+                                db_gen['N1_LENGTH'] = 0
+                                overlap = abs(n1_len)
+                            else:
+                                db_gen['N1_LENGTH'] = n1_len
+                                n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1)
+                                n1_end = int(j_align[8])-1
+                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
+                        else:
+                            db_gen['N1_LENGTH'] = 0
+
+                        # Query positions
+                        db_gen['J_SEQ_START'] = int(j_align[8]) + overlap
+                        db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0)
+
+                        # Germline positions
+                        db_gen['J_GERM_START'] = int(j_align[10]) + overlap
+                        db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0)
+
+                        # J alignment scores
+                        if score_fields:
+                            try: db_gen['J_SCORE'] = float(j_align[13])
+                            except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'
+
+                            try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0
+                            except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'
+
+                            try: db_gen['J_EVALUE'] = float(j_align[12])
+                            except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None'
+
+                            try: db_gen['J_BTOP'] = j_align[16]
+                            except (TypeError, ValueError): db_gen['J_BTOP'] = 'None'
+
+                        # Update VDJ sequence, removing insertions
+                        start = overlap
+                        for m in re.finditer(r'-', j_align[15]):
+                            ins = m.start()
+                            seq_vdj += j_align[14][start:ins]
+                            start = ins + 1
+                        seq_vdj += j_align[14][start:]
+
+                    db_gen['SEQUENCE_VDJ'] = seq_vdj
+
+                    # Create IMGT-gapped sequence and infer IMGT junction
+                    if v_call is not None:
+                        db_gen = gapV(db_gen, repo_dict)
+                        if j_call is not None:
+                            db_gen = getIMGTJunc(db_gen, repo_dict)
+
+                    # FWR and CDR regions
+                    if region_fields: getRegions(db_gen)
+
+                yield IgRecord(db_gen)
+
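+# A minimal sketch of driving the IgBlast record generator above (hypothetical
+# file names; parseIgBlast below wires these calls together the same way):
+#
+#   seq_dict = getSeqforIgBlast('input.fasta')
+#   repo_dict = getRepo('germlines/')
+#   for rec in readIgBlast('igblast.fmt7', seq_dict, repo_dict):
+#       print(rec.id, rec.v_call)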
+
+# TODO:  should be more readable
+def readIMGT(imgt_files, score_fields=False, region_fields=False):
+    """
+    Reads IMGT/HighV-Quest output
+
+    Arguments: 
+    imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6
+    score_fields = if True parse alignment scores
+    region_fields = if True add FWR and CDR region fields
+    
+    Returns: 
+    a generator of dictionaries containing alignment data
+    """
+    imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files]
+    # Create a dictionary for each sequence alignment and yield its generator
+    for sm, gp, nt, jn in zip(*imgt_iters):
+        if len(set([sm['Sequence ID'],
+                    gp['Sequence ID'],
+                    nt['Sequence ID'],
+                    jn['Sequence ID']])) != 1:
+            sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \
+                     % sm['Sequence ID'])
+
+        db_gen = {'SEQUENCE_ID': sm['Sequence ID'],
+                  'SEQUENCE_INPUT': sm['Sequence']}
+
+        if 'No results' not in sm['Functionality']:
+            # Each flag is selected by summing two boolean tests: no match
+            # gives the first value, one match the second, two the third.
+            # Note 'productive' is a substring of 'unproductive', so both
+            # tests fire for unproductive records.
+            db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) +
+                                                 ('unprod' in sm['Functionality'])]
+            db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) +
+                                               ('out-of-frame' in sm['JUNCTION frame'])]
+            db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) +
+                                           ('unprod' in sm['Functionality'])]
+            db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'],
+                                                         'missing' in sm['V-REGION potential ins/del']))) +
+                                                         ('unprod' in sm['Functionality'])]
+            db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'],
+                                              sm['V-REGION insertions'],
+                                              sm['V-REGION deletions']))]
+
+            db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION']
+            db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION']
+
+            db_gen['V_CALL'] = re.sub(r'\sor\s', ',', re.sub(',', '', gp['V-GENE and allele']))
+            db_gen['D_CALL'] = re.sub(r'\sor\s', ',', re.sub(',', '', gp['D-GENE and allele']))
+            db_gen['J_CALL'] = re.sub(r'\sor\s', ',', re.sub(',', '', gp['J-GENE and allele']))
+
+            v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0
+            db_gen['V_SEQ_START'] = nt['V-REGION start']
+            db_gen['V_SEQ_LENGTH'] = v_seq_length
+            db_gen['V_GERM_START_IMGT'] = 1
+            db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0
+
+            db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"],
+                                                       jn['N-REGION-nt nb'],
+                                                       jn['N1-REGION-nt nb'],
+                                                       jn["P5'D-nt nb"]] if i)
+            db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length,
+                                                         jn["P3'V-nt nb"],
+                                                         jn['N-REGION-nt nb'],
+                                                         jn['N1-REGION-nt nb'],
+                                                         jn["P5'D-nt nb"]] if i)
+            db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
+            db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1
+            db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
+            db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"],
+                                                       jn['N2-REGION-nt nb'],
+                                                       jn["P5'J-nt nb"]] if i)
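+            # e.g. (illustrative values): with a 296 nt V-REGION and 4 nt of
+            # P3'V/N-REGION material, D_SEQ_START = 1 + 296 + 4 = 301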
+
+            db_gen['J_SEQ_START'] = sum(int(i) for i in [1, v_seq_length,
+                                                         jn["P3'V-nt nb"],
+                                                         jn['N-REGION-nt nb'],
+                                                         jn['N1-REGION-nt nb'],
+                                                         jn["P5'D-nt nb"],
+                                                         jn["D-REGION-nt nb"],
+                                                         jn["P3'D-nt nb"],
+                                                         jn['N2-REGION-nt nb'],
+                                                         jn["P5'J-nt nb"]] if i)
+            db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0
+            db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1
+            db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0
+
+            db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0
+            db_gen['JUNCTION'] = jn['JUNCTION']
+
+            # Alignment scores
+            if score_fields:
+                try:  db_gen['V_SCORE'] = float(sm['V-REGION score'])
+                except (TypeError, ValueError):  db_gen['V_SCORE'] = 'None'
+
+                try:  db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0
+                except (TypeError, ValueError):  db_gen['V_IDENTITY'] = 'None'
+
+                try:  db_gen['J_SCORE'] = float(sm['J-REGION score'])
+                except (TypeError, ValueError):  db_gen['J_SCORE'] = 'None'
+
+                try:  db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0
+                except (TypeError, ValueError):  db_gen['J_IDENTITY'] = 'None'
+
+            # FWR and CDR regions
+            if region_fields: getRegions(db_gen)
+        else:
+            db_gen['V_CALL'] = 'None'
+            db_gen['D_CALL'] = 'None'
+            db_gen['J_CALL'] = 'None'
+
+        yield IgRecord(db_gen)
+
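+# A minimal sketch of driving readIMGT (hypothetical file names; the four
+# files must come from the same IMGT/HighV-Quest run, in 1/2/3/6 order):
+#
+#   imgt_files = ['1_Summary.txt', '2_IMGT-gapped-nt-sequences.txt',
+#                 '3_Nt-sequences.txt', '6_Junction.txt']
+#   for rec in readIMGT(imgt_files, score_fields=True):
+#       print(rec.id, rec.junction)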
+    
+def getIDforIMGT(seq_file):
+    """
+    Create a sequence ID translation using IMGT truncation
+    
+    Arguments: 
+    seq_file = a fasta file of sequences input to IMGT
+                    
+    Returns: 
+    a dictionary of {truncated ID: full seq description} 
+    """
+    
+    # Build an ID translation dictionary: IDs longer than 50 characters are
+    # truncated to 50 with special characters replaced, mirroring IMGT's ID handling
+    ids = {}
+    for rec in SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna):
+        if len(rec.description) <= 50:
+            id_key = rec.description
+        else:
+            id_key = re.sub(r'\||\s|!|&|\*|<|>|\?', '_', rec.description[:50])
+        ids[id_key] = rec.description
+
+    return ids
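+# Illustrative example (hypothetical description): the 58-character FASTA
+# description 'seq_1|CONSCOUNT=42|PRCONS=IGHM|DUPCOUNT=3|SAMPLE=day7_rep1'
+# is keyed by its first 50 characters with '|' replaced by '_', so the full
+# description can be restored after IMGT truncates the ID.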
+
+
+def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True,
+            score_fields=False, region_fields=False, out_args=default_out_args):
+    """
+    Writes tab-delimited database file in output directory
+    
+    Arguments:
+    db_gen = a generator of IgRecord objects containing alignment data
+    file_prefix = directory and prefix for CLIP tab-delim file
+    total_count = number of records (for progress bar)
+    id_dict = a dictionary of {IMGT ID: full seq description}
+    no_parse = if True, do not parse sequence IDs for pRESTO annotations
+    score_fields = if True add alignment score fields to output file
+    region_fields = if True add FWR and CDR region fields to output file
+    out_args = common output argument dictionary from parseCommonArgs
+
+    Returns:
+    None
+    """
+    pass_file = "%s_db-pass.tab" % file_prefix
+    fail_file = "%s_db-fail.tab" % file_prefix
+    ordered_fields = ['SEQUENCE_ID',
+                      'SEQUENCE_INPUT',
+                      'FUNCTIONAL',
+                      'IN_FRAME',
+                      'STOP',
+                      'MUTATED_INVARIANT',
+                      'INDELS',
+                      'V_CALL',
+                      'D_CALL',
+                      'J_CALL',
+                      'SEQUENCE_VDJ',
+                      'SEQUENCE_IMGT',
+                      'V_SEQ_START',
+                      'V_SEQ_LENGTH',
+                      'V_GERM_START_VDJ',
+                      'V_GERM_LENGTH_VDJ',
+                      'V_GERM_START_IMGT',
+                      'V_GERM_LENGTH_IMGT',
+                      'N1_LENGTH',
+                      'D_SEQ_START',
+                      'D_SEQ_LENGTH',
+                      'D_GERM_START',
+                      'D_GERM_LENGTH',
+                      'N2_LENGTH',
+                      'J_SEQ_START',
+                      'J_SEQ_LENGTH',
+                      'J_GERM_START',
+                      'J_GERM_LENGTH',
+                      'JUNCTION_LENGTH',
+                      'JUNCTION']
+
+    if score_fields:
+        ordered_fields.extend(['V_SCORE',
+                               'V_IDENTITY',
+                               'V_EVALUE',
+                               'V_BTOP',
+                               'J_SCORE',
+                               'J_IDENTITY',
+                               'J_EVALUE',
+                               'J_BTOP'])
+
+    if region_fields:
+        ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT',
+                               'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT'])
+
+
+    # TODO:  This is not the best approach. should pass in output fields.
+    # Initiate passed handle
+    pass_handle = None
+
+    # Open failed file
+    if out_args['failed']:
+        fail_handle = open(fail_file, 'wt')
+        fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT'])
+    else:
+        fail_handle = None
+        fail_writer = None
+
+    # Initialize counters and file
+    pass_writer = None
+    start_time = time()
+    rec_count = pass_count = fail_count = 0
+    for record in db_gen:
+        printProgress(rec_count, total_count, 0.05, start_time)
+        rec_count += 1
+
+        # Count pass or fail
+        if (record.v_call == 'None' and record.j_call == 'None') or \
+                record.functional is None or \
+                not record.seq_vdj or \
+                not record.junction:
+            # print(record.v_call, record.j_call, record.functional, record.junction)
+            fail_count += 1
+            if fail_writer is not None: fail_writer.writerow(record.toDict())
+            continue
+        else: 
+            pass_count += 1
+            
+        # Build sample sequence description
+        if record.id in id_dict:
+            record.id = id_dict[record.id]
+
+        # Parse sequence description into new columns
+        if not no_parse:
+            record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
+            record.id = record.annotations['ID']
+            del record.annotations['ID']
+
+        # TODO:  This is not the best approach. should pass in output fields.
+        # If first sequence, use parsed description to create new columns and initialize writer
+        if pass_writer is None:
+            if not no_parse:  ordered_fields.extend(list(record.annotations.keys()))
+            pass_handle = open(pass_file, 'wt')
+            pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields)
+
+        # Write row to tab-delim CLIP file
+        pass_writer.writerow(record.toDict())
+    
+    # Print log
+    printProgress(rec_count, total_count, 0.05, start_time)
+
+    log = OrderedDict()
+    log['OUTPUT'] = pass_file
+    log['PASS'] = pass_count
+    log['FAIL'] = fail_count
+    log['END'] = 'MakeDb'
+    printLog(log)
+    
+    if pass_handle is not None: pass_handle.close()
+    if fail_handle is not None: fail_handle.close()
+
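+# A minimal sketch of calling writeDb (hypothetical names), mirroring how
+# parseIMGT below drives it:
+#
+#   records = readIMGT(imgt_files)
+#   writeDb(records, file_prefix='/tmp/run1', total_count=1000)
+#
+# This writes /tmp/run1_db-pass.tab, plus /tmp/run1_db-fail.tab when
+# out_args['failed'] is set.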
+
+# TODO:  may be able to merge with parseIMGT
+def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False,
+                 region_fields=False, out_args=default_out_args):
+    """
+    Main for IgBlast aligned sample sequences
+
+    Arguments:
+    igblast_output = IgBlast output file to process
+    seq_file = fasta file input to IgBlast (from which to get sequence)
+    repo = folder with germline repertoire files
+    no_parse = if True, do not parse sequence IDs for pRESTO annotations
+    score_fields = if True add alignment score fields to output file
+    region_fields = if True add FWR and CDR region fields to output file
+    out_args = common output argument dictionary from parseCommonArgs
+
+    Returns:
+    None
+    """
+    # Print parameter info
+    log = OrderedDict()
+    log['START'] = 'MakeDb'
+    log['ALIGNER'] = 'IgBlast'
+    log['ALIGN_RESULTS'] = os.path.basename(igblast_output)
+    log['SEQ_FILE'] = os.path.basename(seq_file)
+    log['NO_PARSE'] = no_parse
+    log['SCORE_FIELDS'] = score_fields
+    log['REGION_FIELDS'] = region_fields
+    printLog(log)
+
+    # Get input sequence dictionary
+    seq_dict = getSeqforIgBlast(seq_file)
+
+    # Formalize out_dir and file-prefix
+    if not out_args['out_dir']:
+        out_dir = os.path.split(igblast_output)[0]
+    else:
+        out_dir = os.path.abspath(out_args['out_dir'])
+        if not os.path.exists(out_dir):  os.mkdir(out_dir)
+    if out_args['out_name']:
+        file_prefix = out_args['out_name']
+    else:
+        file_prefix = os.path.basename(os.path.splitext(igblast_output)[0])
+    file_prefix = os.path.join(out_dir, file_prefix)
+
+    total_count = countSeqFile(seq_file)
+
+    # Create the record generator and write the database
+    repo_dict = getRepo(repo)
+    igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict,
+                               score_fields=score_fields, region_fields=region_fields)
+    writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse,
+            score_fields=score_fields, region_fields=region_fields, out_args=out_args)
+
+
+# TODO:  may be able to merge with parseIgBlast
+def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False,
+              region_fields=False, out_args=default_out_args):
+    """
+    Main for IMGT aligned sample sequences
+
+    Arguments:
+    imgt_output = zipped file or unzipped folder output by IMGT
+    seq_file = FASTA file input to IMGT (from which to get seqID)
+    no_parse = if True, do not parse sequence IDs for pRESTO annotations
+    score_fields = if True add alignment score fields to output file
+    region_fields = if True add FWR and CDR region fields to output file
+    out_args = common output argument dictionary from parseCommonArgs
+        
+    Returns: 
+    None
+    """
+    # Print parameter info
+    log = OrderedDict()
+    log['START'] = 'MakeDb'
+    log['ALIGNER'] = 'IMGT'
+    log['ALIGN_RESULTS'] = imgt_output
+    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
+    log['NO_PARSE'] = no_parse
+    log['SCORE_FIELDS'] = score_fields
+    log['REGION_FIELDS'] = region_fields
+    printLog(log)
+    
+    # Get individual IMGT result files
+    temp_dir, imgt_files = extractIMGT(imgt_output)
+        
+    # Formalize out_dir and file-prefix
+    if not out_args['out_dir']:
+        out_dir = os.path.dirname(os.path.abspath(imgt_output))
+    else:
+        out_dir = os.path.abspath(out_args['out_dir'])
+        if not os.path.exists(out_dir):  os.mkdir(out_dir)
+    if out_args['out_name']:
+        file_prefix = out_args['out_name']
+    else:
+        file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0]
+    file_prefix = os.path.join(out_dir, file_prefix)
+
+    total_count = countDbFile(imgt_files[0])
+    
+    # Get (parsed) IDs from fasta file submitted to IMGT
+    id_dict = getIDforIMGT(seq_file) if seq_file else {}
+    
+    # Create the record generator and write the database
+    imgt_dict = readIMGT(imgt_files, score_fields=score_fields,
+                         region_fields=region_fields)
+    writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse,
+            score_fields=score_fields, region_fields=region_fields, out_args=out_args)
+
+    # Delete temp directory
+    rmtree(temp_dir)
+
+
+def getArgParser():
+    """
+    Defines the ArgumentParser
+
+    Arguments: 
+    None
+                      
+    Returns: 
+    an ArgumentParser object
+    """
+    fields = dedent(
+             '''
+              output files:
+                  db-pass
+                      database of parsed alignment records.
+                  db-fail
+                      database with records failing alignment.
+
+              output fields:
+                  SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT,
+                  INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT,
+                  V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT,
+                  V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH,
+                  D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH,
+                  J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
+                  JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP,
+                  J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT,
+                  FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
+              ''')
+                
+    # Define ArgumentParser
+    parser = ArgumentParser(description=__doc__, epilog=fields,
+                            formatter_class=CommonHelpFormatter)
+    parser.add_argument('--version', action='version',
+                        version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
+    subparsers = parser.add_subparsers(title='subcommands', dest='command',
+                                       help='Aligner used', metavar='')
+    # TODO:  This is a temporary fix for Python issue 9253
+    subparsers.required = True
+
+    # Parent parser    
+    parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)
+
+    # IgBlast Aligner
+    parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output',
+                                           parents=[parser_parent],
+                                           formatter_class=CommonHelpFormatter)
+    parser_igblast.set_defaults(func=parseIgBlast)
+    parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+                                required=True,
+                                help='''IgBLAST output files in format 7 with query sequence
+                                     (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
+    parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
+                                help='''List of folders and/or fasta files containing
+                                     IMGT-gapped germline sequences corresponding to the
+                                     set of germlines used in the IgBLAST alignment.''')
+    parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
+                                required=True,
+                                help='List of input FASTA files containing sequences')
+    parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
+                                help='''Specify if input IDs should not be parsed to add
+                                     new columns to database.''')
+    parser_igblast.add_argument('--scores', action='store_true', dest='score_fields',
+                                help='''Specify if alignment score metrics should be
+                                     included in the output. Adds the V_SCORE, V_IDENTITY,
+                                     V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
+                                     J_BTOP, and J_EVALUE columns.''')
+    parser_igblast.add_argument('--regions', action='store_true', dest='region_fields',
+                                help='''Specify if IMGT framework and CDR regions should be
+                                     included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
+                                     FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
+                                     CDR3_IMGT columns.''')
+    
+    # IMGT aligner
+    parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output', 
+                                        parents=[parser_parent], 
+                                        formatter_class=CommonHelpFormatter)
+    imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True)
+    imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+                                help='''Either zipped IMGT output files (.zip) or a folder
+                                     containing unzipped IMGT output files (which must
+                                     include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
+                                     and 6_Junction).''')
+    parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
+                             required=False,
+                             help='List of input FASTA files containing sequences')
+    parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', 
+                             help='''Specify if input IDs should not be parsed to add new
+                                  columns to database.''')
+    parser_imgt.add_argument('--scores', action='store_true', dest='score_fields',
+                             help='''Specify if alignment score metrics should be
+                                  included in the output. Adds the V_SCORE, V_IDENTITY,
+                                  J_SCORE and J_IDENTITY columns. Note that this will
+                                  also add the V_EVALUE, V_BTOP, J_EVALUE and J_BTOP
+                                  columns, but they will be empty for IMGT output.''')
+    parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
+                             help='''Specify if IMGT framework and CDR regions should be
+                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
+                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
+                                  CDR3_IMGT columns.''')
+    parser_imgt.set_defaults(func=parseIMGT)
+
+    return parser
+    
+    
+if __name__ == "__main__":
+    """
+    Parses command line arguments and calls main
+    """
+    parser = getArgParser()    
+    args = parser.parse_args()
+    args_dict = parseCommonArgs(args, in_arg='aligner_files')
+
+    # Set no ID parsing if sequence files are not provided
+    if 'seq_files' in args_dict and not args_dict['seq_files']:
+        args_dict['no_parse'] = True
+
+    # Delete arguments that are not passed to the aligner parser functions
+    if 'seq_files' in args_dict: del args_dict['seq_files']
+    if 'aligner_files' in args_dict: del args_dict['aligner_files']
+    if 'command' in args_dict: del args_dict['command']
+    if 'func' in args_dict: del args_dict['func']           
+    
+    if args.command == 'imgt':
+        for i in range(len(args.__dict__['aligner_files'])):
+            args_dict['imgt_output'] = args.__dict__['aligner_files'][i]
+            args_dict['seq_file'] = args.__dict__['seq_files'][i] \
+                                    if args.__dict__['seq_files'] else None
+            args.func(**args_dict)
+    elif args.command == 'igblast':
+        for i in range(len(args.__dict__['aligner_files'])):
+            args_dict['igblast_output'] =  args.__dict__['aligner_files'][i]
+            args_dict['seq_file'] = args.__dict__['seq_files'][i]
+            args.func(**args_dict)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/define_clones.r	Tue Aug 16 09:10:50 2016 -0400
@@ -0,0 +1,15 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+input=args[1]
+output=args[2]
+
+change.o = read.table(input, header=T, sep="\t", quote="", stringsAsFactors=F)
+
+freq = data.frame(table(change.o$CLONE)) # number of sequences per clone
+freq2 = data.frame(table(freq$Freq))     # number of clones per clone size
+
+freq2$final = as.numeric(freq2$Freq) * as.numeric(as.character(freq2$Var1))
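+# e.g. (illustrative) clones {A: 3 sequences, B: 3, C: 1} yield the rows
+# (Clone size=1, Nr of clones=1, Nr of sequences=1) and
+# (Clone size=3, Nr of clones=2, Nr of sequences=6)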
+
+names(freq2) = c("Clone size", "Nr of clones", "Nr of sequences")
+
+write.table(x=freq2, file=output, sep="\t",quote=F,row.names=F,col.names=T)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/define_clones.sh	Tue Aug 16 09:10:50 2016 -0400
@@ -0,0 +1,39 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+# Usage:
+#   define_clones.sh bygroup $input $mode $act $model $norm $sym $link $dist $output $output2
+#   define_clones.sh hclust $input $method $output $output2
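+# Example (hypothetical paths), matching the bygroup call made from wrapper.sh:
+#   bash define_clones.sh bygroup change-o-db.txt gene first ham none min complete 3.0 clones.txt clones-summary.txt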
+
+type=$1
+input=$2
+
+mkdir -p $PWD/outdir
+
+cp $input $PWD/input.tab #file has to have a ".tab" extension
+
+if [ "bygroup" == "$type" ] ; then	
+	mode=$3
+	act=$4
+	model=$5
+	norm=$6
+	sym=$7
+	link=$8
+	dist=$9
+	output=${10}
+	output2=${11}
+	
+	/data/users/david/anaconda3/bin/python $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+	#/home/galaxy/anaconda3/bin/python $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+	
+	Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
+else
+	method=$3
+	output=$4
+	output2=$5
+	
+	/data/users/david/anaconda3/bin/python $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
+	#/home/galaxy/anaconda3/bin/python $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
+	
+	Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
+fi
+
+cp $PWD/outdir/output_clone-pass.tab $output
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/makedb.sh	Tue Aug 16 09:10:50 2016 -0400
@@ -0,0 +1,35 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+
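+# Usage: makedb.sh $input $noparse $scores $regions $output
+# Example (hypothetical paths), matching the call made from wrapper.sh:
+#   bash makedb.sh imgt_output.zip false false false change-o-db.txt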
+input=$1
+noparse=$2
+scores=$3
+regions=$4
+output=$5
+
+if [ "true" == "$noparse" ] ; then
+	noparse="--noparse"
+else
+	noparse=""
+fi
+
+if [ "true" == "$scores" ] ; then
+	scores="--scores"
+else
+	scores=""
+fi
+
+if [ "true" == "$regions" ] ; then
+	regions="--regions"
+else
+	regions=""
+fi
+
+mkdir -p $PWD/outdir
+
+echo "makedb: $PWD/outdir"
+
+/data/users/david/anaconda3/bin/python $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
+#/home/galaxy/anaconda3/bin/python $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
+
+mv $PWD/outdir/output_db-pass.tab $output
--- a/mutation_analysis.r	Thu Aug 11 10:35:52 2016 -0400
+++ b/mutation_analysis.r	Tue Aug 16 09:10:50 2016 -0400
@@ -280,8 +280,6 @@
 
 			print("Plotting stacked transition")
 
-			print(transition2)
-
 			png(filename=paste("transitions_stacked_", name, ".png", sep=""))
 			p = ggplot(transition2, aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity") #stacked bar
 			p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + guides(fill=guide_legend(title=NULL))
--- a/wrapper.sh	Thu Aug 11 10:35:52 2016 -0400
+++ b/wrapper.sh	Tue Aug 16 09:10:50 2016 -0400
@@ -45,14 +45,6 @@
 cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
 cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
 
-#cat $PWD/files/*/1_* > $PWD/summary.txt
-#cat $PWD/files/*/3_* > $PWD/sequences.txt
-#cat $PWD/files/*/5_* > $PWD/aa.txt
-#cat $PWD/files/*/6_* > $PWD/junction.txt
-#cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
-#cat $PWD/files/*/8_* > $PWD/mutationstats.txt
-#cat $PWD/files/*/10_* > $PWD/hotspots.txt
-
 if [[ ${#BLASTN_DIR} -ge 5 ]] ; then
 	echo "On server, using BLASTN_DIR env: ${BLASTN_DIR}"
 else
@@ -66,25 +58,11 @@
 if [[ "${method}" == "custom" ]] ; then
 	python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
 else
-	#ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
-	#ID_index=$((ID_index+1))
-	#sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
-	#sequence_index=$((sequence_index+1))
-	
-	#echo "${ID_index}, ${sequence_index}"
-	
-	#cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.tmp
-	#cat $PWD/summary.txt | tail -n+2 | awk -v id="${ID_index}" -v seq="${sequence_index}" 'BEGIN{FS="\t"} if(NF>10 && length($seq) > 0) {print ">" $id "\n" $seq} {}' > $PWD/sequences.fasta
-	
-	#cat $PWD/sequences.tmp | grep -B1 -vE ">.*|^$" | grep -v "^\-\-$" > sequences.fasta #filter out empty sequences
-	
 	echo "---------------- summary_to_fasta.py ----------------"
 	echo "---------------- summary_to_fasta.py ----------------<br />" >> $log
 	
 	python $dir/summary_to_fasta.py --input $PWD/summary.txt --fasta $PWD/sequences.fasta
 	
-	#rm $PWD/sequences.tmp
-	
 	echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
 	${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
 fi
@@ -397,10 +375,6 @@
 echo "<tr><td>Baseline cg data</td><td><a href='baseline_cg.txt'>Download</a></td></tr>" >> $output
 echo "<tr><td>Baseline cm PDF</td><td><a href='baseline_cm.pdf'>Download</a></td></tr>" >> $output
 echo "<tr><td>Baseline cm data</td><td><a href='baseline_cm.txt'>Download</a></td></tr>" >> $output
-#echo "<tr><td></td><td><a href='IgAT.zip'>IgAT zip</a></td></tr>" >> $output
-#echo "<tr><td></td><td><a href='IgAT_ca.zip'>IgAT ca zip</a></td></tr>" >> $output
-#echo "<tr><td></td><td><a href='IgAT_cg.zip'>IgAT cg zip</a></td></tr>" >> $output
-#echo "<tr><td></td><td><a href='IgAT_cm.zip'>IgAT cm zip</a></td></tr>" >> $output
 echo "<tr><td>An IMGT archive with just the matched and filtered sequences</td><td><a href='new_IMGT.txz'>Download</a></td></tr>" >> $output
 echo "<tr><td>An IMGT archive with just the matched and filtered ca sequences</td><td><a href='new_IMGT_ca.txz'>Download</a></td></tr>" >> $output
 echo "<tr><td>An IMGT archive with just the matched and filtered ca1 sequences</td><td><a href='new_IMGT_ca1.txz'>Download</a></td></tr>" >> $output
@@ -411,6 +385,8 @@
 echo "<tr><td>An IMGT archive with just the matched and filtered cg3 sequences</td><td><a href='new_IMGT_cg3.txz'>Download</a></td></tr>" >> $output
 echo "<tr><td>An IMGT archive with just the matched and filtered cg4 sequences</td><td><a href='new_IMGT_cg4.txz'>Download</a></td></tr>" >> $output
 echo "<tr><td>An IMGT archive with just the matched and filtered cm sequences</td><td><a href='new_IMGT_cm.txz'>Download</a></td></tr>" >> $output
+echo "<tr><td>The Change-O DB file with defined clones</td><td><a href='change_o/change-o-db-defined_clones.txt'>Download</a></td></tr>" >> $output
+echo "<tr><td>The Change-O DB defined clones summary file</td><td><a href='change_o/change-o-defined_clones-summary.txt'>Download</a></td></tr>" >> $output
 echo "</table>" >> $output
 
 echo "</div>" >> $output #downloads tab end
@@ -465,15 +441,6 @@
 
 if [[ "$naive_output" != "None" ]]
 then
-	#echo "---------------- imgt_loader.r ----------------"
-	#echo "---------------- imgt_loader.r ----------------<br />" >> $log
-	#python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output
-	#Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt $outdir/loader_output.txt 2>&1
-
-	#echo "---------------- naive_output.r ----------------"
-	#echo "---------------- naive_output.r ----------------<br />" >> $log
-	#Rscript $dir/naive_output.r $outdir/loader_output.txt $outdir/merged.txt ${naive_output_ca} ${naive_output_cg} ${naive_output_cm} $outdir/ntoverview.txt $outdir/ntsum.txt 2>&1
-	
 	cp $outdir/new_IMGT_ca.txz ${naive_output_ca}
 	cp $outdir/new_IMGT_cg.txz ${naive_output_cg}
 	cp $outdir/new_IMGT_cm.txz ${naive_output_cm}
@@ -481,6 +448,24 @@
 
 echo "</table>" >> $outdir/base_overview.html
 
+echo "---------------- change-o MakeDB ----------------"
+echo "---------------- change-o MakeDB ----------------<br />" >> $log
+
+mkdir $outdir/change_o
+
+tmp="$PWD"
+
+cd $outdir/change_o
+
+bash $dir/change_o/makedb.sh $input false false false $outdir/change_o/change-o-db.txt
+
+echo "---------------- change-o DefineClones ----------------"
+echo "---------------- change-o DefineClones ----------------<br />" >> $log
+
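+# define_clones.sh arguments: type input mode act model norm sym link dist clones-output summary-output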
+bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones.txt $outdir/change_o/change-o-defined_clones-summary.txt
+
+cd "$tmp" # restore the working directory (assigning PWD does not change it)
+
 mv $log $outdir/log.html
 
 echo "<html><center><h1><a href='index.html'>Click here for the results</a></h1>Tip: Open it in a new tab (middle mouse button or right mouse button -> 'open in new tab' on the link above)<br />" > $log