mutation_analysis: changeset 120:613278c1bde0 (draft)
Uploaded
| author | davidvanzessen |
| --- | --- |
| date | Tue, 16 Aug 2016 09:10:50 -0400 |
| parents | 626a956f3811 |
| children | 31cca6d3722a |
| files | change_o/DefineClones.py change_o/MakeDb.py change_o/define_clones.r change_o/define_clones.sh change_o/makedb.sh mutation_analysis.r wrapper.sh |
| diffstat | 7 files changed, 2186 insertions(+), 37 deletions(-) |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/change_o/DefineClones.py Tue Aug 16 09:10:50 2016 -0400 @@ -0,0 +1,1052 @@ +#!/usr/bin/env python3 +""" +Assign Ig sequences into clones +""" +# Info +__author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman' +from changeo import __version__, __date__ + +# Imports +import os +import re +import sys +import numpy as np +from argparse import ArgumentParser +from collections import OrderedDict +from itertools import chain +from textwrap import dedent +from time import time +from Bio import pairwise2 +from Bio.Seq import translate + +# Presto and changeo imports +from presto.Defaults import default_out_args +from presto.IO import getFileType, getOutputHandle, printLog, printProgress +from presto.Multiprocessing import manageProcesses +from presto.Sequence import getDNAScoreDict +from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs +from changeo.Distance import getDNADistMatrix, getAADistMatrix, \ + hs1f_model, m1n_model, hs5f_model, \ + calcDistances, formClusters +from changeo.IO import getDbWriter, readDbFile, countDbFile +from changeo.Multiprocessing import DbData, DbResult + +# Defaults +default_translate = False +default_distance = 0.0 +default_bygroup_model = 'hs1f' +default_hclust_model = 'chen2010' +default_seq_field = 'JUNCTION' +default_norm = 'len' +default_sym = 'avg' +default_linkage = 'single' + +# TODO: should be in Distance, but need to be after function definitions +# Amino acid Hamming distance +aa_model = getAADistMatrix(mask_dist=1, gap_dist=0) + +# DNA Hamming distance +ham_model = getDNADistMatrix(mask_dist=0, gap_dist=0) + + +# TODO: this function is an abstraction to facilitate later cleanup +def getModelMatrix(model): + """ + Simple wrapper to get distance matrix from model name + + Arguments: + model = model name + + Return: + a pandas.DataFrame containing the character distance matrix + """ + if model == 'aa': + return(aa_model) + elif model == 'ham': + return(ham_model) + elif model == 'm1n': + return(m1n_model) + elif model == 'hs1f': + return(hs1f_model) + elif model == 'hs5f': + return(hs5f_model) + else: + sys.stderr.write('Unrecognized distance model: %s.\n' % model) + + +def indexJunctions(db_iter, fields=None, mode='gene', action='first'): + """ + Identifies preclonal groups by V, J and junction length + + Arguments: + db_iter = an iterator of IgRecords defined by readDbFile + fields = additional annotation fields to use to group preclones; + if None use only V, J and junction length + mode = specificity of alignment call to use for assigning preclones; + one of ('allele', 'gene') + action = how to handle multiple value fields when assigning preclones; + one of ('first', 'set') + + Returns: + a dictionary of {(V, J, junction length):[IgRecords]} + """ + # Define functions for grouping keys + if mode == 'allele' and fields is None: + def _get_key(rec, act): + return (rec.getVAllele(act), rec.getJAllele(act), + None if rec.junction is None else len(rec.junction)) + elif mode == 'gene' and fields is None: + def _get_key(rec, act): + return (rec.getVGene(act), rec.getJGene(act), + None if rec.junction is None else len(rec.junction)) + elif mode == 'allele' and fields is not None: + def _get_key(rec, act): + vdj = [rec.getVAllele(act), rec.getJAllele(act), + None if rec.junction is None else len(rec.junction)] + ann = [rec.toDict().get(k, None) for k in fields] + return tuple(chain(vdj, ann)) + elif mode == 'gene' and fields is not None: + def 
_get_key(rec, act): + vdj = [rec.getVGene(act), rec.getJGene(act), + None if rec.junction is None else len(rec.junction)] + ann = [rec.toDict().get(k, None) for k in fields] + return tuple(chain(vdj, ann)) + + start_time = time() + clone_index = {} + rec_count = 0 + for rec in db_iter: + key = _get_key(rec, action) + + # Print progress + if rec_count == 0: + print('PROGRESS> Grouping sequences') + + printProgress(rec_count, step=1000, start_time=start_time) + rec_count += 1 + + # Assigned passed preclone records to key and failed to index None + if all([k is not None and k != '' for k in key]): + #print key + # TODO: Has much slow. Should have less slow. + if action == 'set': + + f_range = list(range(2, 3 + (len(fields) if fields else 0))) + vdj_range = list(range(2)) + + # Check for any keys that have matching columns and junction length and overlapping genes/alleles + to_remove = [] + if len(clone_index) > (1 if None in clone_index else 0) and key not in clone_index: + key = list(key) + for k in clone_index: + if k is not None and all([key[i] == k[i] for i in f_range]): + if all([not set(key[i]).isdisjoint(set(k[i])) for i in vdj_range]): + for i in vdj_range: key[i] = tuple(set(key[i]).union(set(k[i]))) + to_remove.append(k) + + # Remove original keys, replace with union of all genes/alleles and append values to new key + val = [rec] + val += list(chain(*(clone_index.pop(k) for k in to_remove))) + clone_index[tuple(key)] = clone_index.get(tuple(key),[]) + val + + elif action == 'first': + clone_index.setdefault(key, []).append(rec) + else: + clone_index.setdefault(None, []).append(rec) + + printProgress(rec_count, step=1000, start_time=start_time, end=True) + + return clone_index + + +def distanceClones(records, model=default_bygroup_model, distance=default_distance, + dist_mat=None, norm=default_norm, sym=default_sym, + linkage=default_linkage, seq_field=default_seq_field): + """ + Separates a set of IgRecords into clones + + Arguments: + records = an iterator of IgRecords + model = substitution model used to calculate distance + distance = the distance threshold to assign clonal groups + dist_mat = pandas DataFrame of pairwise nucleotide or amino acid distances + norm = normalization method + sym = symmetry method + linkage = type of linkage + seq_field = sequence field used to calculate distance between records + + Returns: + a dictionary of lists defining {clone number: [IgRecords clonal group]} + """ + # Get distance matrix if not provided + if dist_mat is None: dist_mat = getModelMatrix(model) + + # Determine length of n-mers + if model in ['hs1f', 'm1n', 'aa', 'ham']: + nmer_len = 1 + elif model in ['hs5f']: + nmer_len = 5 + else: + sys.stderr.write('Unrecognized distance model: %s.\n' % model) + + # Define unique junction mapping + seq_map = {} + for ig in records: + seq = ig.getSeqField(seq_field) + # Check if sequence length is 0 + if len(seq) == 0: + return None + + seq = re.sub('[\.-]','N', str(seq)) + if model == 'aa': seq = translate(seq) + + seq_map.setdefault(seq, []).append(ig) + + # Process records + if len(seq_map) == 1: + return {1:records} + + # Define sequences + seqs = list(seq_map.keys()) + + # Calculate pairwise distance matrix + dists = calcDistances(seqs, nmer_len, dist_mat, norm, sym) + + # Perform hierarchical clustering + clusters = formClusters(dists, linkage, distance) + + # Turn clusters into clone dictionary + clone_dict = {} + for i, c in enumerate(clusters): + clone_dict.setdefault(c, []).extend(seq_map[seqs[i]]) + + return clone_dict + + +def 
distChen2010(records):
+    """
+    Calculate pairwise distances as defined in Chen 2010
+
+    Arguments:
+    records = list of IgRecords where first is query to be compared to others in list
+
+    Returns:
+    list of distances
+    """
+    # Pull out query sequence and V/J information
+    # (records is a list with the query record first, per the docstring)
+    query, rest = records[0], records[1:]
+    query_cdr3 = query.junction[3:-3]
+    query_v_allele = query.getVAllele()
+    query_v_gene = query.getVGene()
+    query_v_family = query.getVFamily()
+    query_j_allele = query.getJAllele()
+    query_j_gene = query.getJGene()
+    # Create alignment scoring dictionary
+    score_dict = getDNAScoreDict()
+
+    scores = [0] * len(rest)
+    for i in range(len(rest)):
+        # globalds() returns a list of alignments; the score is element 2
+        ld = pairwise2.align.globalds(query_cdr3, rest[i].junction[3:-3],
+                                      score_dict, -1, -1, one_alignment_only=True)[0][2]
+        # Check V similarity
+        if rest[i].getVAllele() == query_v_allele: ld += 0
+        elif rest[i].getVGene() == query_v_gene: ld += 1
+        elif rest[i].getVFamily() == query_v_family: ld += 3
+        else: ld += 5
+        # Check J similarity
+        if rest[i].getJAllele() == query_j_allele: ld += 0
+        elif rest[i].getJGene() == query_j_gene: ld += 1
+        else: ld += 3
+        # Divide by the length of the longer CDR3
+        scores[i] = ld / max(len(rest[i].junction[3:-3]), len(query_cdr3))
+
+    return scores
+
+
+def distAdemokun2011(records):
+    """
+    Calculate pairwise distances as defined in Ademokun 2011
+
+    Arguments:
+    records = list of IgRecords where first is query to be compared to others in list
+
+    Returns:
+    list of distances
+    """
+    # Pull out query sequence and V family information
+    query, rest = records[0], records[1:]
+    query_cdr3 = query.junction[3:-3]
+    query_v_family = query.getVFamily()
+    # Create alignment scoring dictionary
+    score_dict = getDNAScoreDict()
+
+    scores = [0] * len(rest)
+    for i in range(len(rest)):
+
+        if abs(len(query_cdr3) - len(rest[i].junction[3:-3])) > 10:
+            scores[i] = 1
+        elif query_v_family != rest[i].getVFamily():
+            scores[i] = 1
+        else:
+            # globalds() returns a list of alignments; the score is element 2
+            ld = pairwise2.align.globalds(query_cdr3, rest[i].junction[3:-3],
+                                          score_dict, -1, -1, one_alignment_only=True)[0][2]
+            # Divide by the length of the shorter CDR3
+            scores[i] = ld / min(len(rest[i].junction[3:-3]), len(query_cdr3))
+
+    return scores
+
+
+def hierClust(dist_mat, method='chen2010'):
+    """
+    Calculate hierarchical clustering
+
+    Arguments:
+    dist_mat = square-formed distance matrix of pairwise CDR3 comparisons
+    method = clustering method; one of ('chen2010', 'ademokun2011')
+
+    Returns:
+    list of cluster ids
+    """
+    if method == 'chen2010':
+        clusters = formClusters(dist_mat, 'average', 0.32)
+    elif method == 'ademokun2011':
+        clusters = formClusters(dist_mat, 'complete', 0.25)
+    else: clusters = np.ones(dist_mat.shape[0])
+
+    return clusters
+
+# TODO: Merge duplicate feed, process and collect functions.
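A minimal sketch (editorial, not part of this changeset) of how the bygroup pieces above compose: group records by V/J gene and junction length with indexJunctions(), then split each group into clones with distanceClones(). The input file name and the 0.2 distance threshold are placeholders.

    from changeo.IO import readDbFile

    db_iter = readDbFile('sample_db-pass.tab')   # hypothetical Change-O database
    groups = indexJunctions(db_iter, mode='gene', action='first')
    for key, group in groups.items():
        if key is None:
            continue  # records that failed V/J/junction-length indexing
        clones = distanceClones(group, model='hs1f', distance=0.2)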
+def feedQueue(alive, data_queue, db_file, group_func, group_args={}):
+    """
+    Feeds the data queue with Ig records
+
+    Arguments:
+    alive = a multiprocessing.Value boolean controlling whether processing continues
+            if False exit process
+    data_queue = a multiprocessing.Queue to hold data for processing
+    db_file = the Ig record database file
+    group_func = the function to use for assigning preclones
+    group_args = a dictionary of arguments to pass to group_func
+
+    Returns:
+    None
+    """
+    # Open input file and perform grouping
+    try:
+        # Iterate over Ig records and assign groups
+        db_iter = readDbFile(db_file)
+        clone_dict = group_func(db_iter, **group_args)
+    except:
+        #sys.stderr.write('Exception in feeder grouping step\n')
+        alive.value = False
+        raise
+
+    # Add groups to data queue
+    try:
+        #print 'START FEED', alive.value
+        # Iterate over groups and feed data queue
+        clone_iter = iter(clone_dict.items())
+        while alive.value:
+            # Get data from queue
+            if data_queue.full(): continue
+            else: data = next(clone_iter, None)
+            # Exit upon reaching end of iterator
+            if data is None: break
+            #print "FEED", alive.value, k
+
+            # Feed queue
+            data_queue.put(DbData(*data))
+        else:
+            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
+                             % os.getpid())
+            return None
+    except:
+        #sys.stderr.write('Exception in feeder queue feeding step\n')
+        alive.value = False
+        raise
+
+    return None
+
+
+def feedQueueClust(alive, data_queue, db_file, group_func=None, group_args={}):
+    """
+    Feeds the data queue with Ig records
+
+    Arguments:
+    alive = a multiprocessing.Value boolean controlling whether processing continues
+            if False exit process
+    data_queue = a multiprocessing.Queue to hold data for processing
+    db_file = the Ig record database file
+
+    Returns:
+    None
+    """
+    # Open input file and perform grouping
+    try:
+        # Iterate over Ig records and order by junction length
+        records = {}
+        db_iter = readDbFile(db_file)
+        for rec in db_iter:
+            records[rec.id] = rec
+        records = OrderedDict(sorted(list(records.items()), key=lambda i: i[1].junction_length))
+        dist_dict = {}
+        for __ in range(len(records)):
+            k, v = records.popitem(last=False)
+            # Concatenate (list.append returns None); the query record goes first
+            dist_dict[k] = [v] + list(records.values())
+    except:
+        #sys.stderr.write('Exception in feeder grouping step\n')
+        alive.value = False
+        raise
+
+    # Add groups to data queue
+    try:
+        # print 'START FEED', alive.value
+        # Iterate over groups and feed data queue
+        dist_iter = iter(dist_dict.items())
+        while alive.value:
+            # Get data from queue
+            if data_queue.full(): continue
+            else: data = next(dist_iter, None)
+            # Exit upon reaching end of iterator
+            if data is None: break
+            #print "FEED", alive.value, k
+
+            # Feed queue
+            data_queue.put(DbData(*data))
+        else:
+            sys.stderr.write('PID %s: Error in sibling process detected. 
Cleaning up.\n' \ + % os.getpid()) + return None + except: + #sys.stderr.write('Exception in feeder queue feeding step\n') + alive.value = False + raise + + return None + + +def processQueue(alive, data_queue, result_queue, clone_func, clone_args): + """ + Pulls from data queue, performs calculations, and feeds results queue + + Arguments: + alive = a multiprocessing.Value boolean controlling whether processing continues + if False exit process + data_queue = a multiprocessing.Queue holding data to process + result_queue = a multiprocessing.Queue to hold processed results + clone_func = the function to call for clonal assignment + clone_args = a dictionary of arguments to pass to clone_func + + Returns: + None + """ + try: + # Iterator over data queue until sentinel object reached + while alive.value: + # Get data from queue + if data_queue.empty(): continue + else: data = data_queue.get() + # Exit upon reaching sentinel + if data is None: break + + # Define result object for iteration and get data records + records = data.data + result = DbResult(data.id, records) + + # Check for invalid data (due to failed indexing) and add failed result + if not data: + result_queue.put(result) + continue + + # Add V(D)J to log + result.log['ID'] = ','.join([str(x) for x in data.id]) + result.log['VALLELE'] = ','.join(set([(r.getVAllele() or '') for r in records])) + result.log['DALLELE'] = ','.join(set([(r.getDAllele() or '') for r in records])) + result.log['JALLELE'] = ','.join(set([(r.getJAllele() or '') for r in records])) + result.log['JUNCLEN'] = ','.join(set([(str(len(r.junction)) or '0') for r in records])) + result.log['SEQUENCES'] = len(records) + + # Checking for preclone failure and assign clones + clones = clone_func(records, **clone_args) if data else None + + # import cProfile + # prof = cProfile.Profile() + # clones = prof.runcall(clone_func, records, **clone_args) + # prof.dump_stats('worker-%d.prof' % os.getpid()) + + if clones is not None: + result.results = clones + result.valid = True + result.log['CLONES'] = len(clones) + else: + result.log['CLONES'] = 0 + + # Feed results to result queue + result_queue.put(result) + else: + sys.stderr.write('PID %s: Error in sibling process detected. 
Cleaning up.\n' \ + % os.getpid()) + return None + except: + #sys.stderr.write('Exception in worker\n') + alive.value = False + raise + + return None + + +def processQueueClust(alive, data_queue, result_queue, clone_func, clone_args): + """ + Pulls from data queue, performs calculations, and feeds results queue + + Arguments: + alive = a multiprocessing.Value boolean controlling whether processing continues + if False exit process + data_queue = a multiprocessing.Queue holding data to process + result_queue = a multiprocessing.Queue to hold processed results + clone_func = the function to call for calculating pairwise distances between sequences + clone_args = a dictionary of arguments to pass to clone_func + + Returns: + None + """ + + try: + # print 'START WORK', alive.value + # Iterator over data queue until sentinel object reached + while alive.value: + # Get data from queue + if data_queue.empty(): continue + else: data = data_queue.get() + # Exit upon reaching sentinel + if data is None: break + # print "WORK", alive.value, data['id'] + + # Define result object for iteration and get data records + records = data.data + result = DbResult(data.id, records) + + # Create row of distance matrix and check for error + dist_row = clone_func(records, **clone_args) if data else None + if dist_row is not None: + result.results = dist_row + result.valid = True + + # Feed results to result queue + result_queue.put(result) + else: + sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \ + % os.getpid()) + return None + except: + #sys.stderr.write('Exception in worker\n') + alive.value = False + raise + + return None + + +def collectQueue(alive, result_queue, collect_queue, db_file, out_args, cluster_func=None, cluster_args={}): + """ + Assembles results from a queue of individual sequence results and manages log/file I/O + + Arguments: + alive = a multiprocessing.Value boolean controlling whether processing continues + if False exit process + result_queue = a multiprocessing.Queue holding processQueue results + collect_queue = a multiprocessing.Queue to store collector return values + db_file = the input database file name + out_args = common output argument dictionary from parseCommonArgs + cluster_func = the function to call for carrying out clustering on distance matrix + cluster_args = a dictionary of arguments to pass to cluster_func + + Returns: + None + (adds 'log' and 'out_files' to collect_dict) + """ + # Open output files + try: + # Count records and define output format + out_type = getFileType(db_file) if out_args['out_type'] is None \ + else out_args['out_type'] + result_count = countDbFile(db_file) + + # Defined successful output handle + pass_handle = getOutputHandle(db_file, + out_label='clone-pass', + out_dir=out_args['out_dir'], + out_name=out_args['out_name'], + out_type=out_type) + pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE') + + # Defined failed alignment output handle + if out_args['failed']: + fail_handle = getOutputHandle(db_file, + out_label='clone-fail', + out_dir=out_args['out_dir'], + out_name=out_args['out_name'], + out_type=out_type) + fail_writer = getDbWriter(fail_handle, db_file) + else: + fail_handle = None + fail_writer = None + + # Define log handle + if out_args['log_file'] is None: + log_handle = None + else: + log_handle = open(out_args['log_file'], 'w') + except: + #sys.stderr.write('Exception in collector file opening step\n') + alive.value = False + raise + + # Get results from queue and write to files + try: 
+ #print 'START COLLECT', alive.value + # Iterator over results queue until sentinel object reached + start_time = time() + rec_count = clone_count = pass_count = fail_count = 0 + while alive.value: + # Get result from queue + if result_queue.empty(): continue + else: result = result_queue.get() + # Exit upon reaching sentinel + if result is None: break + #print "COLLECT", alive.value, result['id'] + + # Print progress for previous iteration and update record count + if rec_count == 0: + print('PROGRESS> Assigning clones') + printProgress(rec_count, result_count, 0.05, start_time) + rec_count += len(result.data) + + # Write passed and failed records + if result: + for clone in result.results.values(): + clone_count += 1 + for i, rec in enumerate(clone): + rec.annotations['CLONE'] = clone_count + pass_writer.writerow(rec.toDict()) + pass_count += 1 + result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction) + + else: + for i, rec in enumerate(result.data): + if fail_writer is not None: fail_writer.writerow(rec.toDict()) + fail_count += 1 + result.log['CLONE0-%i' % (i + 1)] = str(rec.junction) + + # Write log + printLog(result.log, handle=log_handle) + else: + sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \ + % os.getpid()) + return None + + # Print total counts + printProgress(rec_count, result_count, 0.05, start_time) + + # Close file handles + pass_handle.close() + if fail_handle is not None: fail_handle.close() + if log_handle is not None: log_handle.close() + + # Update return list + log = OrderedDict() + log['OUTPUT'] = os.path.basename(pass_handle.name) + log['CLONES'] = clone_count + log['RECORDS'] = rec_count + log['PASS'] = pass_count + log['FAIL'] = fail_count + collect_dict = {'log':log, 'out_files': [pass_handle.name]} + collect_queue.put(collect_dict) + except: + #sys.stderr.write('Exception in collector result processing step\n') + alive.value = False + raise + + return None + + +def collectQueueClust(alive, result_queue, collect_queue, db_file, out_args, cluster_func, cluster_args): + """ + Assembles results from a queue of individual sequence results and manages log/file I/O + + Arguments: + alive = a multiprocessing.Value boolean controlling whether processing continues + if False exit process + result_queue = a multiprocessing.Queue holding processQueue results + collect_queue = a multiprocessing.Queue to store collector return values + db_file = the input database file name + out_args = common output argument dictionary from parseCommonArgs + cluster_func = the function to call for carrying out clustering on distance matrix + cluster_args = a dictionary of arguments to pass to cluster_func + + Returns: + None + (adds 'log' and 'out_files' to collect_dict) + """ + # Open output files + try: + + # Iterate over Ig records to count and order by junction length + result_count = 0 + records = {} + # print 'Reading file...' 
+ db_iter = readDbFile(db_file) + for rec in db_iter: + records[rec.id] = rec + result_count += 1 + records = OrderedDict(sorted(list(records.items()), key=lambda i: i[1].junction_length)) + + # Define empty matrix to store assembled results + dist_mat = np.zeros((result_count,result_count)) + + # Count records and define output format + out_type = getFileType(db_file) if out_args['out_type'] is None \ + else out_args['out_type'] + + # Defined successful output handle + pass_handle = getOutputHandle(db_file, + out_label='clone-pass', + out_dir=out_args['out_dir'], + out_name=out_args['out_name'], + out_type=out_type) + pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE') + + # Defined failed cloning output handle + if out_args['failed']: + fail_handle = getOutputHandle(db_file, + out_label='clone-fail', + out_dir=out_args['out_dir'], + out_name=out_args['out_name'], + out_type=out_type) + fail_writer = getDbWriter(fail_handle, db_file) + else: + fail_handle = None + fail_writer = None + + # Open log file + if out_args['log_file'] is None: + log_handle = None + else: + log_handle = open(out_args['log_file'], 'w') + except: + alive.value = False + raise + + try: + # Iterator over results queue until sentinel object reached + start_time = time() + row_count = rec_count = 0 + while alive.value: + # Get result from queue + if result_queue.empty(): continue + else: result = result_queue.get() + # Exit upon reaching sentinel + if result is None: break + + # Print progress for previous iteration + if row_count == 0: + print('PROGRESS> Assigning clones') + printProgress(row_count, result_count, 0.05, start_time) + + # Update counts for iteration + row_count += 1 + rec_count += len(result) + + # Add result row to distance matrix + if result: + dist_mat[list(range(result_count-len(result),result_count)),result_count-len(result)] = result.results + + else: + sys.stderr.write('PID %s: Error in sibling process detected. 
Cleaning up.\n' \ + % os.getpid()) + return None + + # Calculate linkage and carry out clustering + # print dist_mat + clusters = cluster_func(dist_mat, **cluster_args) if dist_mat is not None else None + clones = {} + # print clusters + for i, c in enumerate(clusters): + clones.setdefault(c, []).append(records[list(records.keys())[i]]) + + # Write passed and failed records + clone_count = pass_count = fail_count = 0 + if clones: + for clone in clones.values(): + clone_count += 1 + for i, rec in enumerate(clone): + rec.annotations['CLONE'] = clone_count + pass_writer.writerow(rec.toDict()) + pass_count += 1 + #result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction) + + else: + for i, rec in enumerate(result.data): + fail_writer.writerow(rec.toDict()) + fail_count += 1 + #result.log['CLONE0-%i' % (i + 1)] = str(rec.junction) + + # Print final progress + printProgress(row_count, result_count, 0.05, start_time) + + # Close file handles + pass_handle.close() + if fail_handle is not None: fail_handle.close() + if log_handle is not None: log_handle.close() + + # Update return list + log = OrderedDict() + log['OUTPUT'] = os.path.basename(pass_handle.name) + log['CLONES'] = clone_count + log['RECORDS'] = rec_count + log['PASS'] = pass_count + log['FAIL'] = fail_count + collect_dict = {'log':log, 'out_files': [pass_handle.name]} + collect_queue.put(collect_dict) + except: + alive.value = False + raise + + return None + + +def defineClones(db_file, feed_func, work_func, collect_func, clone_func, cluster_func=None, + group_func=None, group_args={}, clone_args={}, cluster_args={}, + out_args=default_out_args, nproc=None, queue_size=None): + """ + Define clonally related sequences + + Arguments: + db_file = filename of input database + feed_func = the function that feeds the queue + work_func = the worker function that will run on each CPU + collect_func = the function that collects results from the workers + group_func = the function to use for assigning preclones + clone_func = the function to use for determining clones within preclonal groups + group_args = a dictionary of arguments to pass to group_func + clone_args = a dictionary of arguments to pass to clone_func + out_args = common output argument dictionary from parseCommonArgs + nproc = the number of processQueue processes; + if None defaults to the number of CPUs + queue_size = maximum size of the argument queue; + if None defaults to 2*nproc + + Returns: + a list of successful output file names + """ + # Print parameter info + log = OrderedDict() + log['START'] = 'DefineClones' + log['DB_FILE'] = os.path.basename(db_file) + if group_func is not None: + log['GROUP_FUNC'] = group_func.__name__ + log['GROUP_ARGS'] = group_args + log['CLONE_FUNC'] = clone_func.__name__ + + # TODO: this is yucky, but can be fixed by using a model class + clone_log = clone_args.copy() + if 'dist_mat' in clone_log: del clone_log['dist_mat'] + log['CLONE_ARGS'] = clone_log + + if cluster_func is not None: + log['CLUSTER_FUNC'] = cluster_func.__name__ + log['CLUSTER_ARGS'] = cluster_args + log['NPROC'] = nproc + printLog(log) + + # Define feeder function and arguments + feed_args = {'db_file': db_file, + 'group_func': group_func, + 'group_args': group_args} + # Define worker function and arguments + work_args = {'clone_func': clone_func, + 'clone_args': clone_args} + # Define collector function and arguments + collect_args = {'db_file': db_file, + 'out_args': out_args, + 'cluster_func': cluster_func, + 'cluster_args': cluster_args} + + # Call process 
manager + result = manageProcesses(feed_func, work_func, collect_func, + feed_args, work_args, collect_args, + nproc, queue_size) + + # Print log + result['log']['END'] = 'DefineClones' + printLog(result['log']) + + return result['out_files'] + + +def getArgParser(): + """ + Defines the ArgumentParser + + Arguments: + None + + Returns: + an ArgumentParser object + """ + # Define input and output fields + fields = dedent( + ''' + output files: + clone-pass + database with assigned clonal group numbers. + clone-fail + database with records failing clonal grouping. + + required fields: + SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION_LENGTH + + <field> + sequence field specified by the --sf parameter + + output fields: + CLONE + ''') + + # Define ArgumentParser + parser = ArgumentParser(description=__doc__, epilog=fields, + formatter_class=CommonHelpFormatter) + parser.add_argument('--version', action='version', + version='%(prog)s:' + ' %s-%s' %(__version__, __date__)) + subparsers = parser.add_subparsers(title='subcommands', dest='command', metavar='', + help='Cloning method') + # TODO: This is a temporary fix for Python issue 9253 + subparsers.required = True + + # Parent parser + parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True, + multiproc=True) + + # Distance cloning method + parser_bygroup = subparsers.add_parser('bygroup', parents=[parser_parent], + formatter_class=CommonHelpFormatter, + help='''Defines clones as having same V assignment, + J assignment, and junction length with + specified substitution distance model.''') + parser_bygroup.add_argument('-f', nargs='+', action='store', dest='fields', default=None, + help='Additional fields to use for grouping clones (non VDJ)') + parser_bygroup.add_argument('--mode', action='store', dest='mode', + choices=('allele', 'gene'), default='gene', + help='''Specifies whether to use the V(D)J allele or gene for + initial grouping.''') + parser_bygroup.add_argument('--act', action='store', dest='action', default='set', + choices=('first', 'set'), + help='''Specifies how to handle multiple V(D)J assignments + for initial grouping.''') + parser_bygroup.add_argument('--model', action='store', dest='model', + choices=('aa', 'ham', 'm1n', 'hs1f', 'hs5f'), + default=default_bygroup_model, + help='''Specifies which substitution model to use for + calculating distance between sequences. Where m1n is the + mouse single nucleotide transition/trasversion model + of Smith et al, 1996; hs1f is the human single + nucleotide model derived from Yaari et al, 2013; hs5f + is the human S5F model of Yaari et al, 2013; ham is + nucleotide Hamming distance; and aa is amino acid + Hamming distance. The hs5f data should be + considered experimental.''') + parser_bygroup.add_argument('--dist', action='store', dest='distance', type=float, + default=default_distance, + help='The distance threshold for clonal grouping') + parser_bygroup.add_argument('--norm', action='store', dest='norm', + choices=('len', 'mut', 'none'), default=default_norm, + help='''Specifies how to normalize distances. One of none + (do not normalize), len (normalize by length), + or mut (normalize by number of mutations between sequences).''') + parser_bygroup.add_argument('--sym', action='store', dest='sym', + choices=('avg', 'min'), default=default_sym, + help='''Specifies how to combine asymmetric distances. 
One of avg + (average of A->B and B->A) or min (minimum of A->B and B->A).''') + parser_bygroup.add_argument('--link', action='store', dest='linkage', + choices=('single', 'average', 'complete'), default=default_linkage, + help='''Type of linkage to use for hierarchical clustering.''') + parser_bygroup.add_argument('--sf', action='store', dest='seq_field', + default=default_seq_field, + help='''The name of the field to be used to calculate + distance between records''') + parser_bygroup.set_defaults(feed_func=feedQueue) + parser_bygroup.set_defaults(work_func=processQueue) + parser_bygroup.set_defaults(collect_func=collectQueue) + parser_bygroup.set_defaults(group_func=indexJunctions) + parser_bygroup.set_defaults(clone_func=distanceClones) + + + # Hierarchical clustering cloning method + parser_hclust = subparsers.add_parser('hclust', parents=[parser_parent], + formatter_class=CommonHelpFormatter, + help='Defines clones by specified distance metric on CDR3s and \ + cutting of hierarchical clustering tree') +# parser_hclust.add_argument('-f', nargs='+', action='store', dest='fields', default=None, +# help='Fields to use for grouping clones (non VDJ)') + parser_hclust.add_argument('--method', action='store', dest='method', + choices=('chen2010', 'ademokun2011'), default=default_hclust_model, + help='Specifies which cloning method to use for calculating distance \ + between CDR3s, computing linkage, and cutting clusters') + parser_hclust.set_defaults(feed_func=feedQueueClust) + parser_hclust.set_defaults(work_func=processQueueClust) + parser_hclust.set_defaults(collect_func=collectQueueClust) + parser_hclust.set_defaults(cluster_func=hierClust) + + return parser + + +if __name__ == '__main__': + """ + Parses command line arguments and calls main function + """ + # Parse arguments + parser = getArgParser() + args = parser.parse_args() + args_dict = parseCommonArgs(args) + # Convert case of fields + if 'seq_field' in args_dict: + args_dict['seq_field'] = args_dict['seq_field'].upper() + if 'fields' in args_dict and args_dict['fields'] is not None: + args_dict['fields'] = [f.upper() for f in args_dict['fields']] + + # Define clone_args + if args.command == 'bygroup': + args_dict['group_args'] = {'fields': args_dict['fields'], + 'action': args_dict['action'], + 'mode':args_dict['mode']} + args_dict['clone_args'] = {'model': args_dict['model'], + 'distance': args_dict['distance'], + 'norm': args_dict['norm'], + 'sym': args_dict['sym'], + 'linkage': args_dict['linkage'], + 'seq_field': args_dict['seq_field']} + + # TODO: can be cleaned up with abstract model class + args_dict['clone_args']['dist_mat'] = getModelMatrix(args_dict['model']) + + del args_dict['fields'] + del args_dict['action'] + del args_dict['mode'] + del args_dict['model'] + del args_dict['distance'] + del args_dict['norm'] + del args_dict['sym'] + del args_dict['linkage'] + del args_dict['seq_field'] + + # Define clone_args + if args.command == 'hclust': + dist_funcs = {'chen2010':distChen2010, 'ademokun2011':distAdemokun2011} + args_dict['clone_func'] = dist_funcs[args_dict['method']] + args_dict['cluster_args'] = {'method': args_dict['method']} + #del args_dict['fields'] + del args_dict['method'] + + # Call defineClones + del args_dict['command'] + del args_dict['db_files'] + for f in args.__dict__['db_files']: + args_dict['db_file'] = f + defineClones(**args_dict) \ No newline at end of file
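For reference, a hypothetical invocation of the bygroup subcommand defined above; the -d input flag is assumed to come from getCommonArgParser (not shown in this changeset) and the database name is a placeholder:

    parser = getArgParser()
    args = parser.parse_args(['bygroup', '-d', 'sample_db-pass.tab',
                              '--model', 'hs1f', '--dist', '0.2',
                              '--mode', 'gene', '--act', 'set'])

The __main__ block then routes these arguments into defineClones(), which writes a *_clone-pass.tab database with an added CLONE column.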
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/change_o/MakeDb.py Tue Aug 16 09:10:50 2016 -0400 @@ -0,0 +1,1025 @@ +#!/usr/bin/env python3 +""" +Create tab-delimited database file to store sequence alignment information +""" +# Info +__author__ = 'Namita Gupta, Jason Anthony Vander Heiden' +from changeo import __version__, __date__ + +# Imports +import csv +import os +import re +import sys +import pandas as pd +import tarfile +import zipfile +from argparse import ArgumentParser +from collections import OrderedDict +from itertools import groupby +from shutil import rmtree +from tempfile import mkdtemp +from textwrap import dedent +from time import time +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.Alphabet import IUPAC + +# Presto and changeo imports +from presto.Defaults import default_out_args +from presto.Annotation import parseAnnotation +from presto.IO import countSeqFile, printLog, printProgress +from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs +from changeo.IO import getDbWriter, countDbFile, getRepo +from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \ + j_allele_regex + +# Default parameters +default_delimiter = ('\t', ',', '-') + + +def gapV(ig_dict, repo_dict): + """ + Insert gaps into V region and update alignment information + + Arguments: + ig_dict : Dictionary of parsed IgBlast output + repo_dict : Dictionary of IMGT gapped germline sequences + + Returns: + dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields + """ + + seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ'] + + # Find gapped germline V segment + vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first') + vkey = (vgene, ) + #TODO: Figure out else case + if vkey in repo_dict: + vgap = repo_dict[vkey] + # Iterate over gaps in the germline segment + gaps = re.finditer(r'\.', vgap) + gapcount = int(ig_dict['V_GERM_START_VDJ'])-1 + for gap in gaps: + i = gap.start() + # Break if gap begins after V region + if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount: + break + # Insert gap into IMGT sequence + seq_imgt = seq_imgt[:i] + '.' 
+ seq_imgt[i:] + # Update gap counter + gapcount += 1 + ig_dict['SEQUENCE_IMGT'] = seq_imgt + # Update IMGT positioning information for V + ig_dict['V_GERM_START_IMGT'] = 1 + ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount + + return ig_dict + + +def getIMGTJunc(ig_dict, repo_dict): + """ + Identify junction region by IMGT definition + + Arguments: + ig_dict : Dictionary of parsed IgBlast output + repo_dict : Dictionary of IMGT gapped germline sequences + + Returns: + dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields + """ + # Find germline J segment + jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first') + jkey = (jgene, ) + #TODO: Figure out else case + if jkey in repo_dict: + # Get germline J sequence + jgerm = repo_dict[jkey] + jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1] + # Look for (F|W)GXG aa motif in nt sequence + motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm) + aa_end = len(ig_dict['SEQUENCE_IMGT']) + #TODO: Figure out else case + if motif: + # print('\n', motif.group()) + aa_end = motif.start() - len(jgerm) + 3 + # Add fields to dict + ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end] + ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION']) + + return ig_dict + + +def getRegions(ig_dict): + """ + Identify FWR and CDR regions by IMGT definition + + Arguments: + ig_dict : Dictionary of parsed alignment output + + Returns: + dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, + CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields + """ + try: + seq_len = len(ig_dict['SEQUENCE_IMGT']) + ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)] + except (KeyError, IndexError): + return ig_dict + + try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)] + except (IndexError): return ig_dict + + try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)] + except (IndexError): return ig_dict + + try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)] + except (IndexError): return ig_dict + + try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)] + except (IndexError): return ig_dict + + try: + cdr3_end = 306 + ig_dict['JUNCTION_LENGTH'] + ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end] + ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:] + except (KeyError, IndexError): + return ig_dict + + return ig_dict + + +def getSeqforIgBlast(seq_file): + """ + Fetch input sequences for IgBlast queries + + Arguments: + seq_file = a fasta file of sequences input to IgBlast + + Returns: + a dictionary of {ID:Seq} + """ + + seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna) + + # Create a seq_dict ID translation using IDs truncate up to space or 50 chars + seqs = {} + for seq in seq_dict.values(): + seqs.update({seq.description:str(seq.seq)}) + + return seqs + + +def findLine(handle, query): + """ + Finds line with query string in file + + Arguments: + handle = file handle in which to search for line + query = query string for which to search in file + + Returns: + line from handle in which query string was found + """ + for line in handle: + if(re.match(query, line)): + return line + + +def extractIMGT(imgt_output): + """ + Extract necessary files from IMGT results, zipped or unzipped + + Arguments: + imgt_output = zipped file or unzipped folder output by IMGT + + Returns: + sorted list of filenames from which information will be read + """ + #file_ext = 
os.path.splitext(imgt_output)[1].lower() + imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction') + temp_dir = mkdtemp() + if zipfile.is_zipfile(imgt_output): + # Open zip file + imgt_zip = zipfile.ZipFile(imgt_output, 'r') + # Extract required files + imgt_files = sorted([n for n in imgt_zip.namelist() \ + if os.path.basename(n).startswith(imgt_flags)]) + imgt_zip.extractall(temp_dir, imgt_files) + # Define file list + imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] + elif os.path.isdir(imgt_output): + # Find required files in folder + folder_files = [] + for root, dirs, files in os.walk(imgt_output): + folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files]) + # Define file list + imgt_files = sorted([n for n in folder_files \ + if os.path.basename(n).startswith(imgt_flags)]) + elif tarfile.is_tarfile(imgt_output): + # Open zip file + imgt_tar = tarfile.open(imgt_output, 'r') + # Extract required files + imgt_files = sorted([n for n in imgt_tar.getnames() \ + if os.path.basename(n).startswith(imgt_flags)]) + imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files]) + # Define file list + imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] + else: + sys.exit('ERROR: Unsupported IGMT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.') + + if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files + sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output) + elif len(imgt_files) < len(imgt_flags): + sys.exit('ERROR: Missing necessary file IMGT output %s.' % imgt_output) + + return temp_dir, imgt_files + + +# TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks +def readOneIgBlastResult(block): + """ + Parse a single IgBLAST query result + + Arguments: + block = itertools groupby object of single result + + Returns: + None if no results, otherwise list of DataFrames for each result block + """ + results = list() + i = 0 + for match, subblock in groupby(block, lambda l: l=='\n'): + if not match: + # Strip whitespace and comments + sub = [s.strip() for s in subblock if not s.startswith('#')] + + # Continue on empty block + if not sub: continue + else: i += 1 + + # Split by tabs + sub = [s.split('\t') for s in sub] + + # Append list for "V-(D)-J rearrangement summary" (i == 1) + # And "V-(D)-J junction details" (i == 2) + # Otherwise append DataFrame of subblock + if i == 1 or i == 2: + results.append(sub[0]) + else: + df = pd.DataFrame(sub) + if not df.empty: results.append(df) + + return results if results else None + + +# TODO: needs more speeds. pandas is probably to blame. +def readIgBlast(igblast_output, seq_dict, repo_dict, + score_fields=False, region_fields=False): + """ + Reads IgBlast output + + Arguments: + igblast_output = IgBlast output file (format 7) + seq_dict = a dictionary of {ID:Seq} from input fasta file + repo_dict = dictionary of IMGT gapped germline sequences + score_fields = if True parse alignment scores + region_fields = if True add FWR and CDR region fields + + Returns: + a generator of dictionaries containing alignment data + """ + + # Open IgBlast output file + with open(igblast_output) as f: + # Iterate over individual results (separated by # IGBLASTN) + for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)): + block = list(block) + if not k1: + # TODO: move query name extraction into block parser readOneIgBlastResult(). 
+ # Extract sequence ID + query_name = ' '.join(block[0].strip().split(' ')[2:]) + # Initialize db_gen to have ID and input sequence + db_gen = {'SEQUENCE_ID': query_name, + 'SEQUENCE_INPUT': seq_dict[query_name]} + + # Parse further sub-blocks + block_list = readOneIgBlastResult(block) + + # TODO: this is indented pretty far. should be a separate function. or several functions. + # If results exist, parse further to obtain full db_gen + if block_list is not None: + # Parse quality information + db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F' + db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F' + db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F' + if block_list[0][-1] == '-': + db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'], + IUPAC.ambiguous_dna).reverse_complement()) + + # Parse V, D, and J calls + call_str = ' '.join(block_list[0]) + v_call = parseAllele(call_str, v_allele_regex, action='list') + d_call = parseAllele(call_str, d_allele_regex, action='list') + j_call = parseAllele(call_str, j_allele_regex, action='list') + db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None' + db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None' + db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None' + + # Parse junction sequence + # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1])) + # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ']) + + # TODO: IgBLAST does a stupid and doesn't output block #3 sometimes. why? + # TODO: maybe we should fail these. they look craptastic. + #pd.set_option('display.width', 500) + #print query_name, len(block_list), hit_idx + #for i, x in enumerate(block_list): + # print '[%i]' % i + # print x + + # Parse segment start and stop positions + hit_df = block_list[-1] + + # Alignment info block + # 0: segment + # 1: query id + # 2: subject id + # 3: % identity + # 4: alignment length + # 5: mismatches + # 6: gap opens + # 7: gaps + # 8: q. start + # 9: q. end + # 10: s. start + # 11: s. 
end + # 12: evalue + # 13: bit score + # 14: query seq + # 15: subject seq + # 16: btop + + # If V call exists, parse V alignment information + seq_vdj = '' + if v_call is not None: + v_align = hit_df[hit_df[0] == 'V'].iloc[0] + # Germline positions + db_gen['V_GERM_START_VDJ'] = int(v_align[10]) + db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1 + # Query sequence positions + db_gen['V_SEQ_START'] = int(v_align[8]) + db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1 + + if int(v_align[6]) == 0: + db_gen['INDELS'] = 'F' + else: + db_gen['INDELS'] = 'T' + # Set functional to none so record gets tossed (junction will be wrong) + # db_gen['FUNCTIONAL'] = None + + # V alignment scores + if score_fields: + try: db_gen['V_SCORE'] = float(v_align[13]) + except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' + + try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0 + except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' + + try: db_gen['V_EVALUE'] = float(v_align[12]) + except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None' + + try: db_gen['V_BTOP'] = v_align[16] + except (TypeError, ValueError): db_gen['V_BTOP'] = 'None' + + # Update VDJ sequence, removing insertions + start = 0 + for m in re.finditer(r'-', v_align[15]): + ins = m.start() + seq_vdj += v_align[14][start:ins] + start = ins + 1 + seq_vdj += v_align[14][start:] + + # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. + # If D call exists, parse D alignment information + if d_call is not None: + d_align = hit_df[hit_df[0] == 'D'].iloc[0] + + # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. + # Determine N-region length and amount of J overlap with V or D alignment + overlap = 0 + if v_call is not None: + n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) + if n1_len < 0: + db_gen['N1_LENGTH'] = 0 + overlap = abs(n1_len) + else: + db_gen['N1_LENGTH'] = n1_len + n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1) + n1_end = int(d_align[8])-1 + seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] + + # Query sequence positions + db_gen['D_SEQ_START'] = int(d_align[8]) + overlap + db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0) + + # Germline positions + db_gen['D_GERM_START'] = int(d_align[10]) + overlap + db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0) + + # Update VDJ sequence, removing insertions + start = overlap + for m in re.finditer(r'-', d_align[15]): + ins = m.start() + seq_vdj += d_align[14][start:ins] + start = ins + 1 + seq_vdj += d_align[14][start:] + + # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. + # If J call exists, parse J alignment information + if j_call is not None: + j_align = hit_df[hit_df[0] == 'J'].iloc[0] + + # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. 
+ # Determine N-region length and amount of J overlap with V or D alignment + overlap = 0 + if d_call is not None: + n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH']) + if n2_len < 0: + db_gen['N2_LENGTH'] = 0 + overlap = abs(n2_len) + else: + db_gen['N2_LENGTH'] = n2_len + n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1) + n2_end = int(j_align[8])-1 + seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end] + elif v_call is not None: + n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) + if n1_len < 0: + db_gen['N1_LENGTH'] = 0 + overlap = abs(n1_len) + else: + db_gen['N1_LENGTH'] = n1_len + n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1) + n1_end = int(j_align[8])-1 + seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] + else: + db_gen['N1_LENGTH'] = 0 + + # Query positions + db_gen['J_SEQ_START'] = int(j_align[8]) + overlap + db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0) + + # Germline positions + db_gen['J_GERM_START'] = int(j_align[10]) + overlap + db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0) + + # J alignment scores + if score_fields: + try: db_gen['J_SCORE'] = float(j_align[13]) + except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' + + try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0 + except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' + + try: db_gen['J_EVALUE'] = float(j_align[12]) + except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None' + + try: db_gen['J_BTOP'] = j_align[16] + except (TypeError, ValueError): db_gen['J_BTOP'] = 'None' + + # Update VDJ sequence, removing insertions + start = overlap + for m in re.finditer(r'-', j_align[15]): + ins = m.start() + seq_vdj += j_align[14][start:ins] + start = ins + 1 + seq_vdj += j_align[14][start:] + + db_gen['SEQUENCE_VDJ'] = seq_vdj + + # Create IMGT-gapped sequence and infer IMGT junction + if v_call is not None: + db_gen = gapV(db_gen, repo_dict) + if j_call is not None: + db_gen = getIMGTJunc(db_gen, repo_dict) + + # FWR and CDR regions + if region_fields: getRegions(db_gen) + + yield IgRecord(db_gen) + + +# TODO: should be more readable +def readIMGT(imgt_files, score_fields=False, region_fields=False): + """ + Reads IMGT/HighV-Quest output + + Arguments: + imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6 + score_fields = if True parse alignment scores + region_fields = if True add FWR and CDR region fields + + Returns: + a generator of dictionaries containing alignment data + """ + imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files] + # Create a dictionary for each sequence alignment and yield its generator + for sm, gp, nt, jn in zip(*imgt_iters): + if len(set([sm['Sequence ID'], + gp['Sequence ID'], + nt['Sequence ID'], + jn['Sequence ID']])) != 1: + sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \ + % sm['Sequence ID']) + + db_gen = {'SEQUENCE_ID': sm['Sequence ID'], + 'SEQUENCE_INPUT': sm['Sequence']} + + if 'No results' not in sm['Functionality']: + db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) + + ('unprod' in sm['Functionality'])] + db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) + + ('out-of-frame' in sm['JUNCTION frame'])], + db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) + + ('unprod' in sm['Functionality'])] + db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'], + 
'missing' in sm['V-REGION potential ins/del']))) + + ('unprod' in sm['Functionality'])] + db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'], + sm['V-REGION insertions'], + sm['V-REGION deletions']))] + + db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION'] + db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION'] + + db_gen['V_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['V-GENE and allele'])) + db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele'])) + db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele'])) + + v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0 + db_gen['V_SEQ_START'] = nt['V-REGION start'] + db_gen['V_SEQ_LENGTH'] = v_seq_length + db_gen['V_GERM_START_IMGT'] = 1 + db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0 + + db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"], + jn['N-REGION-nt nb'], + jn['N1-REGION-nt nb'], + jn["P5'D-nt nb"]] if i) + db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length, + jn["P3'V-nt nb"], + jn['N-REGION-nt nb'], + jn['N1-REGION-nt nb'], + jn["P5'D-nt nb"]] if i) + db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) + db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1 + db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) + db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"], + jn['N2-REGION-nt nb'], + jn["P5'J-nt nb"]] if i) + + db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length, + jn["P3'V-nt nb"], + jn['N-REGION-nt nb'], + jn['N1-REGION-nt nb'], + jn["P5'D-nt nb"], + jn["D-REGION-nt nb"], + jn["P3'D-nt nb"], + jn['N2-REGION-nt nb'], + jn["P5'J-nt nb"]] if i) + db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0 + db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1 + db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0 + + db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0 + db_gen['JUNCTION'] = jn['JUNCTION'] + + # Alignment scores + if score_fields: + try: db_gen['V_SCORE'] = float(sm['V-REGION score']) + except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' + + try: db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0 + except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' + + try: db_gen['J_SCORE'] = float(sm['J-REGION score']) + except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' + + try: db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0 + except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' + + # FWR and CDR regions + if region_fields: getRegions(db_gen) + else: + db_gen['V_CALL'] = 'None' + db_gen['D_CALL'] = 'None' + db_gen['J_CALL'] = 'None' + + yield IgRecord(db_gen) + + +def getIDforIMGT(seq_file): + """ + Create a sequence ID translation using IMGT truncation + + Arguments: + seq_file = a fasta file of sequences input to IMGT + + Returns: + a dictionary of {truncated ID: full seq description} + """ + + # Create a seq_dict ID translation using IDs truncate up to space or 50 chars + ids = {} + for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)): + if len(rec.description) <= 50: + id_key = rec.description + else: + id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50]) + ids.update({id_key:rec.description}) + + return ids + + +def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True, + score_fields=False, 
region_fields=False, out_args=default_out_args): + """ + Writes tab-delimited database file in output directory + + Arguments: + db_gen = a generator of IgRecord objects containing alignment data + file_prefix = directory and prefix for CLIP tab-delim file + total_count = number of records (for progress bar) + id_dict = a dictionary of {IMGT ID: full seq description} + no_parse = if ID is to be parsed for pRESTO output with default delimiters + score_fields = if True add alignment score fields to output file + region_fields = if True add FWR and CDR region fields to output file + out_args = common output argument dictionary from parseCommonArgs + + Returns: + None + """ + pass_file = "%s_db-pass.tab" % file_prefix + fail_file = "%s_db-fail.tab" % file_prefix + ordered_fields = ['SEQUENCE_ID', + 'SEQUENCE_INPUT', + 'FUNCTIONAL', + 'IN_FRAME', + 'STOP', + 'MUTATED_INVARIANT', + 'INDELS', + 'V_CALL', + 'D_CALL', + 'J_CALL', + 'SEQUENCE_VDJ', + 'SEQUENCE_IMGT', + 'V_SEQ_START', + 'V_SEQ_LENGTH', + 'V_GERM_START_VDJ', + 'V_GERM_LENGTH_VDJ', + 'V_GERM_START_IMGT', + 'V_GERM_LENGTH_IMGT', + 'N1_LENGTH', + 'D_SEQ_START', + 'D_SEQ_LENGTH', + 'D_GERM_START', + 'D_GERM_LENGTH', + 'N2_LENGTH', + 'J_SEQ_START', + 'J_SEQ_LENGTH', + 'J_GERM_START', + 'J_GERM_LENGTH', + 'JUNCTION_LENGTH', + 'JUNCTION'] + + if score_fields: + ordered_fields.extend(['V_SCORE', + 'V_IDENTITY', + 'V_EVALUE', + 'V_BTOP', + 'J_SCORE', + 'J_IDENTITY', + 'J_EVALUE', + 'J_BTOP']) + + if region_fields: + ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT', + 'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT']) + + + # TODO: This is not the best approach. should pass in output fields. + # Initiate passed handle + pass_handle = None + + # Open failed file + if out_args['failed']: + fail_handle = open(fail_file, 'wt') + fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT']) + else: + fail_handle = None + fail_writer = None + + # Initialize counters and file + pass_writer = None + start_time = time() + rec_count = pass_count = fail_count = 0 + for record in db_gen: + #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) + printProgress(rec_count, total_count, 0.05, start_time) + rec_count += 1 + + # Count pass or fail + if (record.v_call == 'None' and record.j_call == 'None') or \ + record.functional is None or \ + not record.seq_vdj or \ + not record.junction: + # print(record.v_call, record.j_call, record.functional, record.junction) + fail_count += 1 + if fail_writer is not None: fail_writer.writerow(record.toDict()) + continue + else: + pass_count += 1 + + # Build sample sequence description + if record.id in id_dict: + record.id = id_dict[record.id] + + # Parse sequence description into new columns + if not no_parse: + record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) + record.id = record.annotations['ID'] + del record.annotations['ID'] + + # TODO: This is not the best approach. should pass in output fields. 
+        # If first sequence, use parsed description to create new columns and initialize writer
+        if pass_writer is None:
+            if not no_parse: ordered_fields.extend(list(record.annotations.keys()))
+            pass_handle = open(pass_file, 'wt')
+            pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields)
+
+        # Write row to tab-delim CLIP file
+        pass_writer.writerow(record.toDict())
+
+    # Print log
+    #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
+    printProgress(rec_count, total_count, 0.05, start_time)
+
+    log = OrderedDict()
+    log['OUTPUT'] = pass_file
+    log['PASS'] = pass_count
+    log['FAIL'] = fail_count
+    log['END'] = 'MakeDb'
+    printLog(log)
+
+    if pass_handle is not None: pass_handle.close()
+    if fail_handle is not None: fail_handle.close()
+
+
+# TODO: may be able to merge with parseIMGT
+def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False,
+                 region_fields=False, out_args=default_out_args):
+    """
+    Main for IgBlast aligned sample sequences
+
+    Arguments:
+    igblast_output = IgBlast output file to process
+    seq_file = fasta file input to IgBlast (from which to get sequence)
+    repo = folder with germline repertoire files
+    no_parse = if ID is to be parsed for pRESTO output with default delimiters
+    score_fields = if True add alignment score fields to output file
+    region_fields = if True add FWR and CDR region fields to output file
+    out_args = common output argument dictionary from parseCommonArgs
+
+    Returns:
+    None
+    """
+    # Print parameter info
+    log = OrderedDict()
+    log['START'] = 'MakeDb'
+    log['ALIGNER'] = 'IgBlast'
+    log['ALIGN_RESULTS'] = os.path.basename(igblast_output)
+    log['SEQ_FILE'] = os.path.basename(seq_file)
+    log['NO_PARSE'] = no_parse
+    log['SCORE_FIELDS'] = score_fields
+    log['REGION_FIELDS'] = region_fields
+    printLog(log)
+
+    # Get input sequence dictionary
+    seq_dict = getSeqforIgBlast(seq_file)
+
+    # Formalize out_dir and file-prefix
+    if not out_args['out_dir']:
+        out_dir = os.path.split(igblast_output)[0]
+    else:
+        out_dir = os.path.abspath(out_args['out_dir'])
+        if not os.path.exists(out_dir): os.mkdir(out_dir)
+    if out_args['out_name']:
+        file_prefix = out_args['out_name']
+    else:
+        file_prefix = os.path.basename(os.path.splitext(igblast_output)[0])
+    file_prefix = os.path.join(out_dir, file_prefix)
+
+    total_count = countSeqFile(seq_file)
+
+    # Create the germline repertoire and record generator, then write the database
+    repo_dict = getRepo(repo)
+    igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict,
+                               score_fields=score_fields, region_fields=region_fields)
+    writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse,
+            score_fields=score_fields, region_fields=region_fields, out_args=out_args)
+
+
+# TODO: may be able to merge with parseIgBlast
+def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False,
+              region_fields=False, out_args=default_out_args):
+    """
+    Main for IMGT aligned sample sequences
+
+    Arguments:
+    imgt_output = zipped file or unzipped folder output by IMGT
+    seq_file = FASTA file input to IMGT (from which to get seqID)
+    no_parse = if ID is to be parsed for pRESTO output with default delimiters
+    score_fields = if True add alignment score fields to output file
+    region_fields = if True add FWR and CDR region fields to output file
+    out_args = common output argument dictionary from parseCommonArgs
+
+    Returns:
+    None
+    """
+    # Print parameter info
+    log = OrderedDict()
+    log['START'] = 'MakeDb'
+    log['ALIGNER'] = 'IMGT'
+    log['ALIGN_RESULTS'] = imgt_output
+    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
+    log['NO_PARSE'] = no_parse
+    log['SCORE_FIELDS'] = score_fields
+    log['REGION_FIELDS'] = region_fields
+    printLog(log)
+
+    # Get individual IMGT result files
+    temp_dir, imgt_files = extractIMGT(imgt_output)
+
+    # Formalize out_dir and file-prefix
+    if not out_args['out_dir']:
+        out_dir = os.path.dirname(os.path.abspath(imgt_output))
+    else:
+        out_dir = os.path.abspath(out_args['out_dir'])
+        if not os.path.exists(out_dir): os.mkdir(out_dir)
+    if out_args['out_name']:
+        file_prefix = out_args['out_name']
+    else:
+        file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0]
+    file_prefix = os.path.join(out_dir, file_prefix)
+
+    total_count = countDbFile(imgt_files[0])
+
+    # Get (parsed) IDs from fasta file submitted to IMGT
+    id_dict = getIDforIMGT(seq_file) if seq_file else {}
+
+    # Create the IMGT record generator and write the database
+    imgt_dict = readIMGT(imgt_files, score_fields=score_fields,
+                         region_fields=region_fields)
+    writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse,
+            score_fields=score_fields, region_fields=region_fields, out_args=out_args)
+
+    # Delete temp directory
+    rmtree(temp_dir)
+
+
+def getArgParser():
+    """
+    Defines the ArgumentParser
+
+    Arguments:
+    None
+
+    Returns:
+    an ArgumentParser object
+    """
+    fields = dedent(
+             '''
+             output files:
+                 db-pass
+                     database of parsed alignment records.
+                 db-fail
+                     database with records failing alignment.
+
+             output fields:
+                 SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT,
+                 INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT,
+                 V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT,
+                 V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH,
+                 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH,
+                 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
+                 JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP,
+                 J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT,
+                 FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
+             ''')
+
+    # Define ArgumentParser
+    parser = ArgumentParser(description=__doc__, epilog=fields,
+                            formatter_class=CommonHelpFormatter)
+    parser.add_argument('--version', action='version',
+                        version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
+    subparsers = parser.add_subparsers(title='subcommands', dest='command',
+                                       help='Aligner used', metavar='')
+    # TODO: This is a temporary fix for Python issue 9253
+    subparsers.required = True
+
+    # Parent parser
+    parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)
+
+    # IgBlast aligner
+    parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output',
+                                           parents=[parser_parent],
+                                           formatter_class=CommonHelpFormatter)
+    parser_igblast.set_defaults(func=parseIgBlast)
+    parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+                                required=True,
+                                help='''IgBLAST output files in format 7 with query sequence
+                                     (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
+    parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
+                                help='''List of folders and/or fasta files containing
+                                     IMGT-gapped germline sequences corresponding to the
+                                     set of germlines used in the IgBLAST alignment.''')
+    parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
+                                required=True,
+                                help='List of input FASTA files containing sequences')
+    parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
+                                help='''Specify if input IDs should not be parsed to add
+                                     new columns to database.''')
+    parser_igblast.add_argument('--scores', action='store_true', dest='score_fields',
+                                help='''Specify if alignment score metrics should be
+                                     included in the output. Adds the V_SCORE, V_IDENTITY,
+                                     V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
+                                     J_EVALUE, and J_BTOP columns.''')
+    parser_igblast.add_argument('--regions', action='store_true', dest='region_fields',
+                                help='''Specify if IMGT framework and CDR regions should be
+                                     included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
+                                     FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
+                                     CDR3_IMGT columns.''')
+
+    # IMGT aligner
+    parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output',
+                                        parents=[parser_parent],
+                                        formatter_class=CommonHelpFormatter)
+    imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True)
+    imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+                                help='''Either zipped IMGT output files (.zip) or a folder
+                                     containing unzipped IMGT output files (which must
+                                     include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
+                                     and 6_Junction).''')
+    parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
+                             required=False,
+                             help='List of input FASTA files containing sequences')
+    parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
+                             help='''Specify if input IDs should not be parsed to add new
+                                  columns to database.''')
+    parser_imgt.add_argument('--scores', action='store_true', dest='score_fields',
+                             help='''Specify if alignment score metrics should be
+                                  included in the output. Adds the V_SCORE, V_IDENTITY,
+                                  J_SCORE and J_IDENTITY columns. Note, this will also add
+                                  the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP,
+                                  but they will be empty for IMGT output.''')
+    parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
+                             help='''Specify if IMGT framework and CDR regions should be
+                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
+                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
+                                  CDR3_IMGT columns.''')
+    parser_imgt.set_defaults(func=parseIMGT)
+
+    return parser
+
+
+if __name__ == "__main__":
+    """
+    Parses command line arguments and calls main
+    """
+    parser = getArgParser()
+    args = parser.parse_args()
+    args_dict = parseCommonArgs(args, in_arg='aligner_files')
+
+    # Set no ID parsing if sequence files are not provided
+    if 'seq_files' in args_dict and not args_dict['seq_files']:
+        args_dict['no_parse'] = True
+
+    # Delete arguments that are not passed on to the parse functions
+    if 'seq_files' in args_dict: del args_dict['seq_files']
+    if 'aligner_files' in args_dict: del args_dict['aligner_files']
+    if 'command' in args_dict: del args_dict['command']
+    if 'func' in args_dict: del args_dict['func']
+
+    if args.command == 'imgt':
+        for i in range(len(args.__dict__['aligner_files'])):
+            args_dict['imgt_output'] = args.__dict__['aligner_files'][i]
+            args_dict['seq_file'] = args.__dict__['seq_files'][i] \
+                                    if args.__dict__['seq_files'] else None
+            args.func(**args_dict)
+    elif args.command == 'igblast':
+        for i in range(len(args.__dict__['aligner_files'])):
+            args_dict['igblast_output'] = args.__dict__['aligner_files'][i]
+            args_dict['seq_file'] = args.__dict__['seq_files'][i]
+            args.func(**args_dict)
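For quick reference, the imgt subcommand defined above can also be run on its own, outside the shell wrappers added later in this changeset. A minimal sketch, assuming hypothetical file names results.zip and input.fasta (the FASTA that was submitted to IMGT); the subcommand and flags come from getArgParser() above:

# Hypothetical file names; -i takes the zipped IMGT output (or an unzipped
# folder), -s the original FASTA so getIDforIMGT() can restore full sequence IDs.
python MakeDb.py imgt -i results.zip -s input.fasta \
    --scores --regions --outdir out --outname sample
# writeDb() should then produce out/sample_db-pass.tab (plus out/sample_db-fail.tab
# when the common --failed option is enabled).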
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/define_clones.r Tue Aug 16 09:10:50 2016 -0400
@@ -0,0 +1,15 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+input = args[1]
+output = args[2]
+
+change.o = read.table(input, header=T, sep="\t", quote="", stringsAsFactors=F)
+
+freq = data.frame(table(change.o$CLONE))
+freq2 = data.frame(table(freq$Freq))
+
+freq2$final = as.numeric(freq2$Freq) * as.numeric(as.character(freq2$Var1))
+
+names(freq2) = c("Clone size", "Nr of clones", "Nr of sequences")
+
+write.table(x=freq2, file=output, sep="\t", quote=F, row.names=F, col.names=T)
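To make the bookkeeping in this script concrete: the first table() call counts sequences per clone, the second counts how many clones share each size, and the final column multiplies the two. A toy run, assuming hypothetical file names toy.tab and toy_summary.txt:

# Toy Change-O table with only the CLONE column: four sequences in clone 1,
# one sequence each in clones 2 and 3.
printf 'CLONE\n1\n1\n1\n1\n2\n3\n' > toy.tab
Rscript define_clones.r toy.tab toy_summary.txt
cat toy_summary.txt
# Expected output: two clones of size 1 (2 sequences), one clone of size 4 (4 sequences):
# Clone size	Nr of clones	Nr of sequences
# 1	2	2
# 4	1	4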
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/define_clones.sh Tue Aug 16 09:10:50 2016 -0400
@@ -0,0 +1,39 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+#usage: define_clones.sh bygroup $input $mode $act $model $norm $sym $link $dist $output $output2  |  define_clones.sh hclust $input $method $output $output2
+
+type=$1
+input=$2
+
+mkdir -p $PWD/outdir
+
+cp $input $PWD/input.tab #file has to have a ".tab" extension
+
+if [ "bygroup" == "$type" ] ; then
+	mode=$3
+	act=$4
+	model=$5
+	norm=$6
+	sym=$7
+	link=$8
+	dist=$9
+	output=${10}
+	output2=${11}
+
+	/data/users/david/anaconda3/bin/python $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+	#/home/galaxy/anaconda3/bin/python $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+
+	Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
+else
+	method=$3
+	output=$4
+	output2=$5
+
+	/data/users/david/anaconda3/bin/python $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
+	#/home/galaxy/anaconda3/bin/python $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
+
+	Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
+fi
+
+cp $PWD/outdir/output_clone-pass.tab $output
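As a usage sketch, the bygroup branch takes its parameters positionally. The call below mirrors the one wrapper.sh makes further down (group by gene with first-allele calls, Hamming distance model, no normalization, minimum symmetry, complete linkage, distance threshold 3.0); the input and output file names are hypothetical:

bash define_clones.sh bygroup change-o-db.txt \
    gene first ham none min complete 3.0 \
    defined_clones.txt clones_summary.txt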
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/makedb.sh Tue Aug 16 09:10:50 2016 -0400
@@ -0,0 +1,35 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+input=$1
+noparse=$2
+scores=$3
+regions=$4
+output=$5
+
+if [ "true" == "$noparse" ] ; then
+	noparse="--noparse"
+else
+	noparse=""
+fi
+
+if [ "true" == "$scores" ] ; then
+	scores="--scores"
+else
+	scores=""
+fi
+
+if [ "true" == "$regions" ] ; then
+	regions="--regions"
+else
+	regions=""
+fi
+
+mkdir -p $PWD/outdir
+
+echo "makedb: $PWD/outdir"
+
+/data/users/david/anaconda3/bin/python $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
+#/home/galaxy/anaconda3/bin/python $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
+
+mv $PWD/outdir/output_db-pass.tab $output
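A matching usage sketch, assuming a hypothetical IMGT archive name; the three boolean arguments map to the --noparse, --scores and --regions flags of MakeDb.py, and the call mirrors the one wrapper.sh makes below (all flags off):

# $1 = IMGT/HighV-QUEST archive, $2-$4 = noparse/scores/regions, $5 = output table
bash makedb.sh imgt_output.zip false false false change-o-db.txt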
--- a/mutation_analysis.r Thu Aug 11 10:35:52 2016 -0400
+++ b/mutation_analysis.r Tue Aug 16 09:10:50 2016 -0400
@@ -280,8 +280,6 @@
 
 	print("Plotting stacked transition")
 
-	print(transition2)
-
 	png(filename=paste("transitions_stacked_", name, ".png", sep=""))
 	p = ggplot(transition2, aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity") #stacked bar
 	p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + guides(fill=guide_legend(title=NULL))
--- a/wrapper.sh Thu Aug 11 10:35:52 2016 -0400
+++ b/wrapper.sh Tue Aug 16 09:10:50 2016 -0400
@@ -45,14 +45,6 @@
 cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
 cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
 
-#cat $PWD/files/*/1_* > $PWD/summary.txt
-#cat $PWD/files/*/3_* > $PWD/sequences.txt
-#cat $PWD/files/*/5_* > $PWD/aa.txt
-#cat $PWD/files/*/6_* > $PWD/junction.txt
-#cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt
-#cat $PWD/files/*/8_* > $PWD/mutationstats.txt
-#cat $PWD/files/*/10_* > $PWD/hotspots.txt
-
 if [[ ${#BLASTN_DIR} -ge 5 ]] ; then
 	echo "On server, using BLASTN_DIR env: ${BLASTN_DIR}"
 else
@@ -66,25 +58,11 @@
 if [[ "${method}" == "custom" ]] ; then
 	python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
 else
-	#ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
-	#ID_index=$((ID_index+1))
-	#sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
-	#sequence_index=$((sequence_index+1))
-
-	#echo "${ID_index}, ${sequence_index}"
-
-	#cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.tmp
-	#cat $PWD/summary.txt | tail -n+2 | awk -v id="${ID_index}" -v seq="${sequence_index}" 'BEGIN{FS="\t"} if(NF>10 && length($seq) > 0) {print ">" $id "\n" $seq} {}' > $PWD/sequences.fasta
-
-	#cat $PWD/sequences.tmp | grep -B1 -vE ">.*|^$" | grep -v "^\-\-$" > sequences.fasta #filter out empty sequences
-
 	echo "---------------- summary_to_fasta.py ----------------"
 	echo "---------------- summary_to_fasta.py ----------------<br />" >> $log
 
 	python $dir/summary_to_fasta.py --input $PWD/summary.txt --fasta $PWD/sequences.fasta
 
-	#rm $PWD/sequences.tmp
-
 	echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
 	${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
 fi
@@ -397,10 +375,6 @@
 echo "<tr><td>Baseline cg data</td><td><a href='baseline_cg.txt'>Download</a></td></tr>" >> $output
 echo "<tr><td>Baseline cm PDF</td><td><a href='baseline_cm.pdf'>Download</a></td></tr>" >> $output
 echo "<tr><td>Baseline cm data</td><td><a href='baseline_cm.txt'>Download</a></td></tr>" >> $output
-#echo "<tr><td></td><td><a href='IgAT.zip'>IgAT zip</a></td></tr>" >> $output
-#echo "<tr><td></td><td><a href='IgAT_ca.zip'>IgAT ca zip</a></td></tr>" >> $output
-#echo "<tr><td></td><td><a href='IgAT_cg.zip'>IgAT cg zip</a></td></tr>" >> $output
-#echo "<tr><td></td><td><a href='IgAT_cm.zip'>IgAT cm zip</a></td></tr>" >> $output
 echo "<tr><td>An IMGT archive with just the matched and filtered sequences</td><td><a href='new_IMGT.txz'>Download</a></td></tr>" >> $output
 echo "<tr><td>An IMGT archive with just the matched and filtered ca sequences</td><td><a href='new_IMGT_ca.txz'>Download</a></td></tr>" >> $output
 echo "<tr><td>An IMGT archive with just the matched and filtered ca1 sequences</td><td><a href='new_IMGT_ca1.txz'>Download</a></td></tr>" >> $output
@@ -411,6 +385,8 @@
 echo "<tr><td>An IMGT archive with just the matched and filtered cg3 sequences</td><td><a href='new_IMGT_cg3.txz'>Download</a></td></tr>" >> $output
 echo "<tr><td>An IMGT archive with just the matched and filtered cg4 sequences</td><td><a href='new_IMGT_cg4.txz'>Download</a></td></tr>" >> $output
 echo "<tr><td>An IMGT archive with just the matched and filtered cm sequences</td><td><a href='new_IMGT_cm.txz'>Download</a></td></tr>" >> $output
+echo "<tr><td>The Change-O DB file with defined clones</td><td><a href='change_o/change-o-db-defined_clones.txt'>Download</a></td></tr>" >> $output
+echo "<tr><td>The Change-O DB defined clones summary file</td><td><a href='change_o/change-o-defined_clones-summary.txt'>Download</a></td></tr>" >> $output
 echo "</table>" >> $output
 echo "</div>" >> $output #downloads tab end
 
@@ -465,15 +441,6 @@
 
 if [[ "$naive_output" != "None" ]]
 then
-	#echo "---------------- imgt_loader.r ----------------"
-	#echo "---------------- imgt_loader.r ----------------<br />" >> $log
-	#python $dir/imgt_loader.py --summ $PWD/summary.txt --aa $PWD/aa.txt --junction $PWD/junction.txt --output $naive_output
-	#Rscript --verbose $dir/imgt_loader.r $PWD/summary.txt $PWD/aa.txt $PWD/junction.txt $outdir/loader_output.txt 2>&1
-
-	#echo "---------------- naive_output.r ----------------"
-	#echo "---------------- naive_output.r ----------------<br />" >> $log
-	#Rscript $dir/naive_output.r $outdir/loader_output.txt $outdir/merged.txt ${naive_output_ca} ${naive_output_cg} ${naive_output_cm} $outdir/ntoverview.txt $outdir/ntsum.txt 2>&1
-
 	cp $outdir/new_IMGT_ca.txz ${naive_output_ca}
 	cp $outdir/new_IMGT_cg.txz ${naive_output_cg}
 	cp $outdir/new_IMGT_cm.txz ${naive_output_cm}
@@ -481,6 +448,24 @@
 
 echo "</table>" >> $outdir/base_overview.html
 
+echo "---------------- change-o MakeDB ----------------"
+echo "---------------- change-o MakeDB ----------------<br />" >> $log
+
+mkdir $outdir/change_o
+
+tmp="$PWD"
+
+cd $outdir/change_o
+
+bash $dir/change_o/makedb.sh $input false false false $outdir/change_o/change-o-db.txt
+
+echo "---------------- change-o DefineClones ----------------"
+echo "---------------- change-o DefineClones ----------------<br />" >> $log
+
+bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones.txt $outdir/change_o/change-o-defined_clones-summary.txt
+
+cd "$tmp"
+
 mv $log $outdir/log.html
 
 echo "<html><center><h1><a href='index.html'>Click here for the results</a></h1>Tip: Open it in a new tab (middle mouse button or right mouse button -> 'open in new tab' on the link above)<br />" > $log
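As a sanity check after a run, the two new download links in the hunk above should correspond to real files under $outdir. A hedged sketch, assuming the wrapper completed without errors:

cd $outdir
ls change_o/change-o-db.txt                      # MakeDb output
ls change_o/change-o-db-defined_clones.txt       # DefineClones clone-pass table
ls change_o/change-o-defined_clones-summary.txt  # clone size summary
# The defined-clones table should now contain a CLONE column:
head -1 change_o/change-o-db-defined_clones.txt | tr '\t' '\n' | grep -x CLONE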