Mercurial > repos > jjohnson > defuse
comparison defuse_trinity_analysis.py @ 36:4353f776dfa3
Add defuse_trinity_analysis
| author | Jim Johnson <jj@umn.edu> |
|---|---|
| date | Tue, 10 Feb 2015 19:35:52 -0600 |
| parents | |
| children | 90127ee1eae5 |
comparison
equal
deleted
inserted
replaced
| 35:a004033614d4 | 36:4353f776dfa3 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 # | |
| 4 #------------------------------------------------------------------------------ | |
| 5 # University of Minnesota | |
| 6 # Copyright 2014, Regents of the University of Minnesota | |
| 7 #------------------------------------------------------------------------------ | |
| 8 # Author: | |
| 9 # | |
| 10 # James E Johnson | |
| 11 # | |
| 12 #------------------------------------------------------------------------------ | |
| 13 """ | |
| 14 | |
| 15 | |
| 16 """ | |
| 17 This tool takes the defuse results.tsv tab-delimited file, trinity | |
| 18 and creates a tabular report | |
| 19 """ | |
| 20 | |
| 21 import sys,re,os.path | |
| 22 import optparse | |
| 23 from optparse import OptionParser | |
| 24 | |
| 25 revcompl = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A','a':'t','c':'g','g':'c','t':'a','N':'N','n':'n'}[B] for B in x][::-1]) | |
| 26 | |
| 27 def read_fasta(fp): | |
| 28 name, seq = None, [] | |
| 29 for line in fp: | |
| 30 line = line.rstrip() | |
| 31 if line.startswith(">"): | |
| 32 if name: yield (name, ''.join(seq)) | |
| 33 name, seq = line, [] | |
| 34 else: | |
| 35 seq.append(line) | |
| 36 if name: yield (name, ''.join(seq)) | |
| 37 | |
| 38 | |
| 39 def test_rcomplement(seq, target): | |
| 40 try: | |
| 41 comp = revcompl(seq) | |
| 42 return comp in target | |
| 43 except: | |
| 44 pass | |
| 45 return False | |
| 46 | |
| 47 def test_reverse(seq,target): | |
| 48 return options.test_reverse and seq and seq[::-1] in target | |
| 49 | |
| 50 def cmp_alphanumeric(s1,s2): | |
| 51 if s1 == s2: | |
| 52 return 0 | |
| 53 a1 = re.findall("\d+|[a-zA-Z]+",s1) | |
| 54 a2 = re.findall("\d+|[a-zA-Z]+",s2) | |
| 55 for i in range(min(len(a1),len(a2))): | |
| 56 if a1[i] == a2[i]: | |
| 57 continue | |
| 58 if a1[i].isdigit() and a2[i].isdigit(): | |
| 59 return int(a1[i]) - int(a2[i]) | |
| 60 return 1 if a1[i] > a2[i] else -1 | |
| 61 return len(a1) - len(a2) | |
| 62 | |
| 63 | |
| 64 def parse_defuse_results(inputFile): | |
| 65 columns = [] | |
| 66 defuse_results = [] | |
| 67 # {cluster_id : { field : value}) | |
| 68 try: | |
| 69 for linenum,line in enumerate(inputFile): | |
| 70 ## print >> sys.stderr, "%d: %s\n" % (linenum,line) | |
| 71 fields = line.strip().split('\t') | |
| 72 if line.startswith('cluster_id'): | |
| 73 columns = fields | |
| 74 ## print >> sys.stderr, "columns: %s\n" % columns | |
| 75 continue | |
| 76 cluster_dict = dict() | |
| 77 cluster_id = fields[columns.index('cluster_id')] | |
| 78 cluster_dict['cluster_id'] = fields[columns.index('cluster_id')] | |
| 79 cluster_dict['gene_chromosome1'] = fields[columns.index('gene_chromosome1')] | |
| 80 cluster_dict['gene_chromosome2'] = fields[columns.index('gene_chromosome2')] | |
| 81 cluster_dict['genomic_strand1'] = fields[columns.index('genomic_strand1')] | |
| 82 cluster_dict['genomic_strand2'] = fields[columns.index('genomic_strand2')] | |
| 83 cluster_dict['gene1'] = fields[columns.index('gene1')] | |
| 84 cluster_dict['gene2'] = fields[columns.index('gene2')] | |
| 85 cluster_dict['gene_name1'] = fields[columns.index('gene_name1')] | |
| 86 cluster_dict['gene_name2'] = fields[columns.index('gene_name2')] | |
| 87 cluster_dict['gene_location1'] = fields[columns.index('gene_location1')] | |
| 88 cluster_dict['gene_location2'] = fields[columns.index('gene_location2')] | |
| 89 cluster_dict['expression1'] = int(fields[columns.index('expression1')]) | |
| 90 cluster_dict['expression2'] = int(fields[columns.index('expression2')]) | |
| 91 cluster_dict['genomic_break_pos1'] = int(fields[columns.index('genomic_break_pos1')]) | |
| 92 cluster_dict['genomic_break_pos2'] = int(fields[columns.index('genomic_break_pos2')]) | |
| 93 cluster_dict['breakpoint_homology'] = int(fields[columns.index('breakpoint_homology')]) | |
| 94 cluster_dict['orf'] = fields[columns.index('orf')] == 'Y' | |
| 95 cluster_dict['exonboundaries'] = fields[columns.index('exonboundaries')] == 'Y' | |
| 96 cluster_dict['read_through'] = fields[columns.index('read_through')] == 'Y' | |
| 97 cluster_dict['interchromosomal'] = fields[columns.index('interchromosomal')] == 'Y' | |
| 98 cluster_dict['adjacent'] = fields[columns.index('adjacent')] == 'Y' | |
| 99 cluster_dict['altsplice'] = fields[columns.index('altsplice')] == 'Y' | |
| 100 cluster_dict['deletion'] = fields[columns.index('deletion')] == 'Y' | |
| 101 cluster_dict['eversion'] = fields[columns.index('eversion')] == 'Y' | |
| 102 cluster_dict['inversion'] = fields[columns.index('inversion')] == 'Y' | |
| 103 cluster_dict['span_count'] = int(fields[columns.index('span_count')]) | |
| 104 cluster_dict['splitr_count'] = int(fields[columns.index('splitr_count')]) | |
| 105 cluster_dict['splice_score'] = int(fields[columns.index('splice_score')]) | |
| 106 cluster_dict['probability'] = float(fields[columns.index('probability')] if columns.index('probability') else 'nan') | |
| 107 cluster_dict['splitr_sequence'] = fields[columns.index('splitr_sequence')] | |
| 108 defuse_results.append(cluster_dict) | |
| 109 except Exception, e: | |
| 110 print >> sys.stderr, "failed: %s" % e | |
| 111 sys.exit(1) | |
| 112 return defuse_results | |
| 113 | |
| 114 ## deFuse params to the mapping application? | |
| 115 | |
| 116 def __main__(): | |
| 117 #Parse Command Line | |
| 118 parser = optparse.OptionParser() | |
| 119 # files | |
| 120 parser.add_option( '-i', '--input', dest='input', help='The input defuse results.tsv file (else read from stdin)' ) | |
| 121 parser.add_option( '-t', '--transcripts', dest='transcripts', default=None, help='Trinity transcripts' ) | |
| 122 parser.add_option( '-p', '--peptides', dest='peptides', default=None, help='Trinity ORFs' ) | |
| 123 parser.add_option( '-o', '--output', dest='output', help='The output report (else write to stdout)' ) | |
| 124 parser.add_option( '-a', '--transcript_alignment', dest='transcript_alignment', help='The output alignment file' ) | |
| 125 parser.add_option( '-A', '--orf_alignment', dest='orf_alignment', help='The output alignment file' ) | |
| 126 parser.add_option( '-N', '--nbases', dest='nbases', type='int', default=12, help='Number of bases on either side of the fusion to compare' ) | |
| 127 parser.add_option( '-L', '--min_pep_len', dest='min_pep_len', type='int', default=100, help='Minimum length of peptide to report' ) | |
| 128 parser.add_option( '-T', '--ticdist', dest='ticdist', type='int', default=1000000, help='Maximum intrachromosomal distance to be classified a Transcription-induced chimera (TIC)' ) | |
| 129 parser.add_option( '-P', '--prior_aa', dest='prior_aa', type='int', default=11, help='Number of protein AAs to show preceeding fusion point' ) | |
| 130 # min_orf_len | |
| 131 # split_na_len | |
| 132 # tic_len = 1000000 | |
| 133 # prior | |
| 134 # deFuse direction reversed | |
| 135 # in frame ? | |
| 136 # contain known protein elements | |
| 137 # what protein change | |
| 138 # trinity provides full transctipt, defuse doesn't show full | |
| 139 #parser.add_option( '-r', '--reference', dest='reference', default=None, help='The genomic reference fasta' ) | |
| 140 #parser.add_option( '-g', '--gtf', dest='gtf', default=None, help='The genomic reference gtf feature file') | |
| 141 (options, args) = parser.parse_args() | |
| 142 | |
| 143 # results.tsv input | |
| 144 if options.input != None: | |
| 145 try: | |
| 146 inputPath = os.path.abspath(options.input) | |
| 147 inputFile = open(inputPath, 'r') | |
| 148 except Exception, e: | |
| 149 print >> sys.stderr, "failed: %s" % e | |
| 150 exit(2) | |
| 151 else: | |
| 152 inputFile = sys.stdin | |
| 153 # vcf output | |
| 154 if options.output != None: | |
| 155 try: | |
| 156 outputPath = os.path.abspath(options.output) | |
| 157 outputFile = open(outputPath, 'w') | |
| 158 except Exception, e: | |
| 159 print >> sys.stderr, "failed: %s" % e | |
| 160 exit(3) | |
| 161 else: | |
| 162 outputFile = sys.stdout | |
| 163 | |
| 164 ## Read defuse results | |
| 165 fusions = parse_defuse_results(inputFile) | |
| 166 ## Create a field with the 12 nt before and after the fusion point. | |
| 167 ## Create a field with the reverse complement of the 24 nt fusion point field. | |
| 168 ## Add fusion type filed (INTER, INTRA, TIC) | |
| 169 for i,fusion in enumerate(fusions): | |
| 170 fusion['ordinal'] = i + 1 | |
| 171 split_seqs = fusion['splitr_sequence'].split('|') | |
| 172 fusion['split_seqs'] = split_seqs | |
| 173 fwd_seq = split_seqs[0][-(min(abs(options.nbases),len(split_seqs[0]))):] + split_seqs[1][:min(abs(options.nbases),len(split_seqs[1]))] | |
| 174 rev_seq = revcompl(fwd_seq) | |
| 175 fusion['fwd_seq'] = fwd_seq | |
| 176 fusion['rev_seq'] = rev_seq | |
| 177 fusion_type = 'inter' if fusion['gene_chromosome1'] != fusion['gene_chromosome2'] else 'intra' if abs(fusion['genomic_break_pos1'] - fusion['genomic_break_pos2']) > options.ticdist else 'TIC' | |
| 178 fusion['fusion_type'] = fusion_type | |
| 179 fusion['transcripts'] = [] | |
| 180 fusion['Transcript'] = 'No' | |
| 181 fusion['Protein'] = 'No' | |
| 182 print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fwd_seq,rev_seq,fusion_type,fusion['gene_name1'],fusion['gene_name2']) | |
| 183 inputFile.close() | |
| 184 | |
| 185 ## Process Trinity data and compare to deFuse | |
| 186 matched_transcripts = dict() | |
| 187 matched_orfs = dict() | |
| 188 fusions_with_transcripts = set() | |
| 189 fusions_with_orfs = set() | |
| 190 n = 0 | |
| 191 if options.transcripts: | |
| 192 with open(options.transcripts) as fp: | |
| 193 for name, seq in read_fasta(fp): | |
| 194 n += 1 | |
| 195 for i,fusion in enumerate(fusions): | |
| 196 if fusion['fwd_seq'] in seq or fusion['rev_seq'] in seq: | |
| 197 fusions_with_transcripts.add(i) | |
| 198 matched_transcripts[name] = seq | |
| 199 fusion['transcripts'].append(name) | |
| 200 fusion['Transcript'] = 'Yes' | |
| 201 print >> sys.stdout, "fusions_with_transcripts: %d %s\n matched_transcripts: %d" % (len(fusions_with_transcripts),fusions_with_transcripts,len(matched_transcripts)) | |
| 202 for i,fusion in enumerate(fusions): | |
| 203 print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fusion['fwd_seq'],fusion['rev_seq'],fusion['fusion_type'],fusion['gene_name1'],fusion['gene_name2'], fusion['transcripts']) | |
| 204 ## Process ORFs and compare to matched deFuse and Trinity data. | |
| 205 ## Proteins must be at least 100 aa long, starting at the first "M" and must end with an "*". | |
| 206 if options.peptides: | |
| 207 with open(options.peptides) as fp: | |
| 208 for name, seq in read_fasta(fp): | |
| 209 n += 1 | |
| 210 if len(seq) < options.min_pep_len: | |
| 211 continue | |
| 212 for i,fusion in enumerate(fusions): | |
| 213 if len(fusion['transcripts']) > 0: | |
| 214 for id_string in fusion['transcripts']: | |
| 215 tx_id = id_string.lstrip('>').split()[0] | |
| 216 if tx_id in name: | |
| 217 pep_len = len(seq) | |
| 218 start = seq.find('M') | |
| 219 if pep_len - start < options.min_pep_len: | |
| 220 continue | |
| 221 fusions_with_orfs.add(i) | |
| 222 matched_orfs[name] = seq | |
| 223 fusion['Protein'] = 'Yes' | |
| 224 # fwd or reverse | |
| 225 tx_seq = matched_transcripts(tx_id) | |
| 226 pos = tx_seq.find(fusion['fwd_seq']) | |
| 227 if pos < 0: | |
| 228 pos = tx_seq.find(fusion['rev_seq']) | |
| 229 # locate fusion in transcript | |
| 230 # locate fusion in ORF | |
| 231 fusion['prior_pep_seq'] = '' | |
| 232 fusion['novel_pep_seq'] = '' | |
| 233 print >> sys.stdout, "fusions_with_orfs: %d %s\n matched_orfs: %d" % (len(fusions_with_orfs),fusions_with_orfs,len(matched_orfs)) | |
| 234 ## Write reports | |
| 235 report_fields = ['gene_name1','gene_name2','span_count','probability','gene_chromosome1','gene_location1','gene_chromosome2','gene_location2','fusion_type','Transcript','Protein'] | |
| 236 report_colnames = {'gene_name1':'Gene 1','gene_name2':'Gene 2','span_count':'Span cnt','probability':'Probability','gene_chromosome1':'From Chr','gene_location1':'Fusion point','gene_chromosome2':'To Chr','gene_location2':'Fusion point','fusion_type':'Type','Transcript':'Transcript?','Protein':'Protein?' } | |
| 237 print >> outputFile,"%s\t%s" % ('#','\t'.join([report_colnames[x] for x in report_fields])) | |
| 238 for i,fusion in enumerate(fusions): | |
| 239 print >> outputFile,"%s\t%s" % (i + 1,'\t'.join([str(fusion[x]) for x in report_fields])) | |
| 240 # print >> outputFile, "%d\t%s\t%s\t%d\t%f\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['gene_name1'],fusion['gene_name2'],fusion['span_count'],fusion['probability'],fusion['gene_chromosome1'],fusion['gene_location1'],fusion['gene_chromosome2'],fusion['gene_location2'],fusion['fusion_type'],fusion['Transcript'],fusion['Protein']) | |
| 241 | |
| 242 if __name__ == "__main__" : __main__() | |
| 243 |
