| 36 | 1 #!/usr/bin/env python | 
|  | 2 """ | 
|  | 3 # | 
|  | 4 #------------------------------------------------------------------------------ | 
|  | 5 #                         University of Minnesota | 
|  | 6 #         Copyright 2014, Regents of the University of Minnesota | 
|  | 7 #------------------------------------------------------------------------------ | 
|  | 8 # Author: | 
|  | 9 # | 
|  | 10 #  James E Johnson | 
|  | 11 # | 
|  | 12 #------------------------------------------------------------------------------ | 
|  | 13 """ | 
|  | 14 | 
|  | 15 | 
|  | 16 """ | 
|  | 17 This tool takes the defuse results.tsv  tab-delimited file, trinity | 
|  | 18 and creates a tabular report | 
|  | 19 """ | 
|  | 20 | 
|  | 21 import sys,re,os.path | 
|  | 22 import optparse | 
|  | 23 from optparse import OptionParser | 
|  | 24 | 
|  | 25 revcompl = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A','a':'t','c':'g','g':'c','t':'a','N':'N','n':'n'}[B] for B in x][::-1]) | 
|  | 26 | 
|  | 27 def read_fasta(fp): | 
|  | 28     name, seq = None, [] | 
|  | 29     for line in fp: | 
|  | 30         line = line.rstrip() | 
|  | 31         if line.startswith(">"): | 
|  | 32             if name: yield (name, ''.join(seq)) | 
|  | 33             name, seq = line, [] | 
|  | 34         else: | 
|  | 35             seq.append(line) | 
|  | 36     if name: yield (name, ''.join(seq)) | 
|  | 37 | 
|  | 38 | 
|  | 39 def test_rcomplement(seq, target): | 
|  | 40   try: | 
|  | 41     comp = revcompl(seq) | 
|  | 42     return comp in target | 
|  | 43   except: | 
|  | 44     pass | 
|  | 45   return False | 
|  | 46 | 
|  | 47 def test_reverse(seq,target): | 
|  | 48   return options.test_reverse and seq and seq[::-1] in target | 
|  | 49 | 
|  | 50 def cmp_alphanumeric(s1,s2): | 
|  | 51   if s1 == s2: | 
|  | 52     return 0 | 
|  | 53   a1 = re.findall("\d+|[a-zA-Z]+",s1) | 
|  | 54   a2 = re.findall("\d+|[a-zA-Z]+",s2) | 
|  | 55   for i in range(min(len(a1),len(a2))): | 
|  | 56     if a1[i] == a2[i]: | 
|  | 57       continue | 
|  | 58     if a1[i].isdigit() and a2[i].isdigit(): | 
|  | 59       return int(a1[i]) - int(a2[i]) | 
|  | 60     return 1 if a1[i] >  a2[i] else -1 | 
|  | 61   return len(a1) - len(a2) | 
|  | 62 | 
|  | 63 | 
|  | 64 def parse_defuse_results(inputFile): | 
|  | 65   columns = [] | 
|  | 66   defuse_results = [] | 
|  | 67   # {cluster_id : { field : value}) | 
|  | 68   try: | 
|  | 69     for linenum,line in enumerate(inputFile): | 
|  | 70       ## print >> sys.stderr, "%d: %s\n" % (linenum,line) | 
|  | 71       fields = line.strip().split('\t') | 
|  | 72       if line.startswith('cluster_id'): | 
|  | 73         columns = fields | 
|  | 74         ## print >> sys.stderr, "columns: %s\n" % columns | 
|  | 75         continue | 
|  | 76       cluster_dict = dict() | 
|  | 77       cluster_id = fields[columns.index('cluster_id')] | 
|  | 78       cluster_dict['cluster_id'] = fields[columns.index('cluster_id')] | 
|  | 79       cluster_dict['gene_chromosome1'] = fields[columns.index('gene_chromosome1')] | 
|  | 80       cluster_dict['gene_chromosome2'] = fields[columns.index('gene_chromosome2')] | 
|  | 81       cluster_dict['genomic_strand1'] = fields[columns.index('genomic_strand1')] | 
|  | 82       cluster_dict['genomic_strand2'] = fields[columns.index('genomic_strand2')] | 
|  | 83       cluster_dict['gene1'] = fields[columns.index('gene1')] | 
|  | 84       cluster_dict['gene2'] = fields[columns.index('gene2')] | 
|  | 85       cluster_dict['gene_name1'] = fields[columns.index('gene_name1')] | 
|  | 86       cluster_dict['gene_name2'] = fields[columns.index('gene_name2')] | 
|  | 87       cluster_dict['gene_location1'] = fields[columns.index('gene_location1')] | 
|  | 88       cluster_dict['gene_location2'] = fields[columns.index('gene_location2')] | 
|  | 89       cluster_dict['expression1'] = int(fields[columns.index('expression1')]) | 
|  | 90       cluster_dict['expression2'] = int(fields[columns.index('expression2')]) | 
|  | 91       cluster_dict['genomic_break_pos1'] = int(fields[columns.index('genomic_break_pos1')]) | 
|  | 92       cluster_dict['genomic_break_pos2'] = int(fields[columns.index('genomic_break_pos2')]) | 
|  | 93       cluster_dict['breakpoint_homology'] = int(fields[columns.index('breakpoint_homology')]) | 
|  | 94       cluster_dict['orf'] = fields[columns.index('orf')] == 'Y' | 
|  | 95       cluster_dict['exonboundaries'] = fields[columns.index('exonboundaries')] == 'Y' | 
|  | 96       cluster_dict['read_through'] = fields[columns.index('read_through')] == 'Y' | 
|  | 97       cluster_dict['interchromosomal'] = fields[columns.index('interchromosomal')] == 'Y' | 
|  | 98       cluster_dict['adjacent'] = fields[columns.index('adjacent')] == 'Y' | 
|  | 99       cluster_dict['altsplice'] = fields[columns.index('altsplice')] == 'Y' | 
|  | 100       cluster_dict['deletion'] = fields[columns.index('deletion')] == 'Y' | 
|  | 101       cluster_dict['eversion'] = fields[columns.index('eversion')] == 'Y' | 
|  | 102       cluster_dict['inversion'] = fields[columns.index('inversion')] == 'Y' | 
|  | 103       cluster_dict['span_count'] = int(fields[columns.index('span_count')]) | 
|  | 104       cluster_dict['splitr_count'] = int(fields[columns.index('splitr_count')]) | 
|  | 105       cluster_dict['splice_score'] = int(fields[columns.index('splice_score')]) | 
|  | 106       cluster_dict['probability'] = float(fields[columns.index('probability')] if columns.index('probability') else 'nan') | 
|  | 107       cluster_dict['splitr_sequence'] = fields[columns.index('splitr_sequence')] | 
|  | 108       defuse_results.append(cluster_dict) | 
|  | 109   except Exception, e: | 
|  | 110     print >> sys.stderr, "failed: %s" % e | 
|  | 111     sys.exit(1) | 
|  | 112   return defuse_results | 
|  | 113 | 
|  | 114 ## deFuse params to the mapping application? | 
|  | 115 | 
|  | 116 def __main__(): | 
|  | 117   #Parse Command Line | 
|  | 118   parser = optparse.OptionParser() | 
|  | 119   # files | 
|  | 120   parser.add_option( '-i', '--input', dest='input', help='The input defuse results.tsv file (else read from stdin)' ) | 
|  | 121   parser.add_option( '-t', '--transcripts', dest='transcripts', default=None, help='Trinity transcripts' ) | 
|  | 122   parser.add_option( '-p', '--peptides', dest='peptides', default=None, help='Trinity ORFs' ) | 
|  | 123   parser.add_option( '-o', '--output', dest='output', help='The output report (else write to stdout)' ) | 
|  | 124   parser.add_option( '-a', '--transcript_alignment', dest='transcript_alignment', help='The output alignment file' ) | 
|  | 125   parser.add_option( '-A', '--orf_alignment', dest='orf_alignment', help='The output alignment file' ) | 
|  | 126   parser.add_option( '-N', '--nbases', dest='nbases', type='int', default=12, help='Number of bases on either side of the fusion to compare' ) | 
|  | 127   parser.add_option( '-L', '--min_pep_len', dest='min_pep_len', type='int', default=100, help='Minimum length of peptide to report' ) | 
|  | 128   parser.add_option( '-T', '--ticdist', dest='ticdist', type='int', default=1000000, help='Maximum intrachromosomal distance to be classified a Transcription-induced chimera (TIC)' ) | 
|  | 129   parser.add_option( '-P', '--prior_aa', dest='prior_aa', type='int', default=11, help='Number of protein AAs to show preceeding fusion point' ) | 
|  | 130   # min_orf_len | 
|  | 131   # split_na_len | 
|  | 132   # tic_len = 1000000 | 
|  | 133   # prior | 
|  | 134   # deFuse direction reversed | 
|  | 135   # in frame ? | 
|  | 136   # contain known protein elements | 
|  | 137   # what protein change | 
|  | 138   # trinity provides full transctipt, defuse doesn't show full | 
|  | 139   #parser.add_option( '-r', '--reference', dest='reference', default=None, help='The genomic reference fasta' ) | 
|  | 140   #parser.add_option( '-g', '--gtf', dest='gtf', default=None, help='The genomic reference gtf feature file') | 
|  | 141   (options, args) = parser.parse_args() | 
|  | 142 | 
|  | 143   # results.tsv input | 
|  | 144   if options.input != None: | 
|  | 145     try: | 
|  | 146       inputPath = os.path.abspath(options.input) | 
|  | 147       inputFile = open(inputPath, 'r') | 
|  | 148     except Exception, e: | 
|  | 149       print >> sys.stderr, "failed: %s" % e | 
|  | 150       exit(2) | 
|  | 151   else: | 
|  | 152     inputFile = sys.stdin | 
|  | 153   # vcf output | 
|  | 154   if options.output != None: | 
|  | 155     try: | 
|  | 156       outputPath = os.path.abspath(options.output) | 
|  | 157       outputFile = open(outputPath, 'w') | 
|  | 158     except Exception, e: | 
|  | 159       print >> sys.stderr, "failed: %s" % e | 
|  | 160       exit(3) | 
|  | 161   else: | 
|  | 162     outputFile = sys.stdout | 
|  | 163 | 
|  | 164   ## Read defuse results | 
|  | 165   fusions = parse_defuse_results(inputFile) | 
|  | 166   ## Create a field with the 12 nt before and after the fusion point. | 
|  | 167   ## Create a field with the reverse complement of the 24 nt fusion point field. | 
|  | 168   ## Add fusion type filed (INTER, INTRA, TIC) | 
|  | 169   for i,fusion in enumerate(fusions): | 
|  | 170       fusion['ordinal'] = i + 1 | 
|  | 171       split_seqs = fusion['splitr_sequence'].split('|') | 
|  | 172       fusion['split_seqs'] = split_seqs | 
|  | 173       fwd_seq = split_seqs[0][-(min(abs(options.nbases),len(split_seqs[0]))):] + split_seqs[1][:min(abs(options.nbases),len(split_seqs[1]))] | 
|  | 174       rev_seq =  revcompl(fwd_seq) | 
|  | 175       fusion['fwd_seq'] = fwd_seq | 
|  | 176       fusion['rev_seq'] = rev_seq | 
|  | 177       fusion_type = 'inter' if fusion['gene_chromosome1'] != fusion['gene_chromosome2'] else 'intra' if abs(fusion['genomic_break_pos1'] - fusion['genomic_break_pos2']) > options.ticdist else 'TIC' | 
|  | 178       fusion['fusion_type'] = fusion_type | 
|  | 179       fusion['transcripts'] = [] | 
|  | 180       fusion['Transcript'] = 'No' | 
|  | 181       fusion['Protein'] = 'No' | 
|  | 182       print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fwd_seq,rev_seq,fusion_type,fusion['gene_name1'],fusion['gene_name2']) | 
|  | 183   inputFile.close() | 
|  | 184 | 
|  | 185   ## Process Trinity data and compare to deFuse | 
|  | 186   matched_transcripts = dict() | 
|  | 187   matched_orfs = dict() | 
|  | 188   fusions_with_transcripts = set() | 
|  | 189   fusions_with_orfs = set() | 
|  | 190   n = 0 | 
|  | 191   if options.transcripts: | 
|  | 192     with open(options.transcripts) as fp: | 
|  | 193       for name, seq in read_fasta(fp): | 
|  | 194         n += 1 | 
|  | 195         for i,fusion in enumerate(fusions): | 
|  | 196           if fusion['fwd_seq'] in seq or fusion['rev_seq'] in seq: | 
|  | 197             fusions_with_transcripts.add(i) | 
|  | 198             matched_transcripts[name] = seq | 
|  | 199             fusion['transcripts'].append(name) | 
|  | 200             fusion['Transcript'] = 'Yes' | 
|  | 201     print >> sys.stdout, "fusions_with_transcripts: %d  %s\n matched_transcripts: %d" % (len(fusions_with_transcripts),fusions_with_transcripts,len(matched_transcripts)) | 
|  | 202     for i,fusion in enumerate(fusions): | 
|  | 203       print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fusion['fwd_seq'],fusion['rev_seq'],fusion['fusion_type'],fusion['gene_name1'],fusion['gene_name2'], fusion['transcripts']) | 
|  | 204     ## Process ORFs and compare to matched deFuse and Trinity data. | 
|  | 205     ## Proteins must be at least 100 aa long, starting at the first "M" and must end with an "*". | 
|  | 206     if options.peptides: | 
|  | 207       with open(options.peptides) as fp: | 
|  | 208         for name, seq in read_fasta(fp): | 
|  | 209           n += 1 | 
|  | 210           if len(seq) < options.min_pep_len: | 
|  | 211             continue | 
|  | 212           for i,fusion in enumerate(fusions): | 
|  | 213             if len(fusion['transcripts']) > 0: | 
|  | 214               for id_string in fusion['transcripts']: | 
|  | 215                 tx_id = id_string.lstrip('>').split()[0] | 
|  | 216                 if tx_id in name: | 
|  | 217                   pep_len = len(seq) | 
|  | 218                   start = seq.find('M') | 
|  | 219                   if pep_len - start < options.min_pep_len: | 
|  | 220                     continue | 
|  | 221                   fusions_with_orfs.add(i) | 
|  | 222                   matched_orfs[name] = seq | 
|  | 223                   fusion['Protein'] = 'Yes' | 
|  | 224                   # fwd or reverse | 
|  | 225                   tx_seq = matched_transcripts(tx_id) | 
|  | 226                   pos = tx_seq.find(fusion['fwd_seq']) | 
|  | 227                   if pos < 0: | 
|  | 228                     pos = tx_seq.find(fusion['rev_seq']) | 
|  | 229                   # locate fusion in transcript | 
|  | 230                   # locate fusion in ORF | 
|  | 231                   fusion['prior_pep_seq'] = '' | 
|  | 232                   fusion['novel_pep_seq'] = '' | 
|  | 233       print >> sys.stdout, "fusions_with_orfs: %d  %s\n matched_orfs: %d" % (len(fusions_with_orfs),fusions_with_orfs,len(matched_orfs)) | 
|  | 234   ## Write reports | 
|  | 235   report_fields = ['gene_name1','gene_name2','span_count','probability','gene_chromosome1','gene_location1','gene_chromosome2','gene_location2','fusion_type','Transcript','Protein'] | 
|  | 236   report_colnames = {'gene_name1':'Gene 1','gene_name2':'Gene 2','span_count':'Span cnt','probability':'Probability','gene_chromosome1':'From Chr','gene_location1':'Fusion point','gene_chromosome2':'To Chr','gene_location2':'Fusion point','fusion_type':'Type','Transcript':'Transcript?','Protein':'Protein?' } | 
|  | 237   print >> outputFile,"%s\t%s" % ('#','\t'.join([report_colnames[x] for x in report_fields])) | 
|  | 238   for i,fusion in enumerate(fusions): | 
|  | 239     print >> outputFile,"%s\t%s" % (i + 1,'\t'.join([str(fusion[x]) for x in report_fields])) | 
|  | 240     # print >> outputFile, "%d\t%s\t%s\t%d\t%f\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['gene_name1'],fusion['gene_name2'],fusion['span_count'],fusion['probability'],fusion['gene_chromosome1'],fusion['gene_location1'],fusion['gene_chromosome2'],fusion['gene_location2'],fusion['fusion_type'],fusion['Transcript'],fusion['Protein']) | 
|  | 241 | 
|  | 242 if __name__ == "__main__" : __main__() | 
|  | 243 |