Mercurial > repos > mvdbeek > tepid_merge_insertions
view merge_insertions.py @ 0:6e4b5319cb89 draft default tip
planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
author | mvdbeek |
---|---|
date | Mon, 23 Jan 2017 10:05:12 -0500 |
parents | |
children |
line wrap: on
line source
from argparse import ArgumentParser import os import tempfile import pandas as pd import pybedtools COLUMNS = ['ins_chrom', 'ins_start', 'ins_end', 'ref_chrom', 'ref_start', 'ref_end', 'agi', 'accession', 'cluster'] def overlap(start1, stop1, start2, stop2, d=50): """returns True if sets of coordinates overlap. Assumes coordinates are on same chromosome""" return start1 <= stop2+d and stop1 >= start2-d def merge(i, insertion, result): if len(result) == 0: result[i] = insertion else: if not can_merge(insertion, result): result[i] = insertion def can_merge(insertion, result): """ Merges insertions and returns True if all requirements are met """ for j, master_insertion in result.items(): if insertion['agi'] & master_insertion['agi']: if overlap(master_insertion['ins_start'], master_insertion['ins_end'], insertion['ins_start'],insertion['ins_end']): # Adjusting the insertion start (doesn't really do anything?!) if len(insertion['agi']) < len(master_insertion['agi']): ref_start = master_insertion['ref_start'] else: ref_start = insertion['ref_start'] if master_insertion['ins_chrom'] == insertion['ins_chrom'] and insertion['ref_chrom'] == master_insertion['ref_chrom'] and ref_start == master_insertion['ref_start']: result[j]['accession'] = result[j]['accession'] | (insertion['accession']) result[j]['agi'] = result[j]['agi'] | (insertion['agi']) return True return False def inner_merge(s): result = {} for i, insertion in s.items(): merge(i, insertion, result) return result.values() def reduce_and_cluster(inputfiles): """ Read in inputfiles using pandas, write additional column with sample identifier, sort and cluster using pybedtools and return dataframe. """ usecols = [0,1,2,3,4,5,6] # skip col 7, which contains the read support id tables = [pd.read_table(f, header=None) for f in inputfiles] sample_ids = [os.path.basename(f).rsplit('.')[0] for f in inputfiles] for sample_id, df in zip(sample_ids, tables): df[7] = sample_id merged_table = pd.concat(tables) tfile = tempfile.NamedTemporaryFile() merged_table.to_csv(tfile, sep="\t", header=None, index=False) tfile.flush() bedfile = pybedtools.BedTool(tfile.name).sort().cluster(d=50) df = bedfile.to_dataframe() df.columns = COLUMNS # Split comma separated agi values and make set df['agi'] = [set(v.split(',')) for v in df['agi'].values] df['accession'] = [set(str(v).split(',')) for v in df['accession'].values] return df def split_clusters(df): """ clusters as defined by bedtools allow for 50 nt distance. This means that clusters can be many kb large, so we check each individual insertion in the cluster against the other insertions. We split the clusters based on whether the overlap and TE identity criteria are fulfilled (so a different TE would lead to a split in the clusters) """ groups = df.groupby('cluster') nested_list = [inner_merge(group.transpose().to_dict()) for _, group in groups] return pd.DataFrame([i for n in nested_list for i in n])[COLUMNS] def write_output(df, output): # Turn sets back to comma-separated values df['agi'] = [",".join(agi) for agi in df['agi']] df['accession'] = [",".join(acc) for acc in df['accession']] # write out result without last column df.to_csv(output, sep="\t",header=None, index=None, columns=COLUMNS[:-1]) def main(inputfiles, output): df = reduce_and_cluster(inputfiles) df = split_clusters(df) write_output(df, output) if __name__ == "__main__": parser = ArgumentParser(description='Merge TE insertions calls') parser.add_argument('-o', '--output', help='output file', required=True) parser.add_argument('-i', '--input', help='Insertion files to merge', nargs="+", required=True) options = parser.parse_args() main(inputfiles=options.input, output=options.output)