comparison BlastParser_and_hits.py @ 2:e0985bad7b92 draft

planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author drosofff
date Fri, 19 Jun 2015 12:53:52 -0400
parents 3959a271cf3f
children 22641bb68b91
comparison
equal deleted inserted replaced
1:958e769c1c86 2:e0985bad7b92
10 the_parser = argparse.ArgumentParser() 10 the_parser = argparse.ArgumentParser()
11 the_parser.add_argument('--blast', action="store", type=str, help="Path to the blast output (tabular format, 12 column)") 11 the_parser.add_argument('--blast', action="store", type=str, help="Path to the blast output (tabular format, 12 column)")
12 the_parser.add_argument('--sequences', action="store", type=str, help="Path to the fasta file with blasted sequences") 12 the_parser.add_argument('--sequences', action="store", type=str, help="Path to the fasta file with blasted sequences")
13 the_parser.add_argument('--fastaOutput', action="store", type=str, help="fasta output file of blast hits") 13 the_parser.add_argument('--fastaOutput', action="store", type=str, help="fasta output file of blast hits")
14 the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis") 14 the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis")
15 the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") 15 the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences")
16 the_parser.add_argument('--mode', action="store", choices=["verbose", "short"], type=str, help="reporting (verbose) or not reporting (short) oases contigs")
16 args = the_parser.parse_args() 17 args = the_parser.parse_args()
17 if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ): 18 if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ):
18 the_parser.error('argument(s) missing, call the -h option of the script') 19 the_parser.error('argument(s) missing, call the -h option of the script')
19 if not args.flanking: 20 if not args.flanking:
20 args.flanking = 0 21 args.flanking = 0
26 return None 27 return None
27 if len(lst) %2 == 1: 28 if len(lst) %2 == 1:
28 return lst[((len(lst)+1)/2)-1] 29 return lst[((len(lst)+1)/2)-1]
29 if len(lst) %2 == 0: 30 if len(lst) %2 == 0:
30 return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0 31 return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0
32
33 def mean(lst):
34 if len(lst) < 1:
35 return 0
36 return sum(lst) / float(len(lst))
31 37
32 def getfasta (fastafile): 38 def getfasta (fastafile):
33 fastadic = {} 39 fastadic = {}
34 for line in open (fastafile): 40 for line in open (fastafile):
35 if line[0] == ">": 41 if line[0] == ">":
100 SubjectCoverageList += range (min([hit[6], hit[7]]), max([hit[6], hit[7]]) + 1) # subject coverage by a hit is in hit[6:8] 106 SubjectCoverageList += range (min([hit[6], hit[7]]), max([hit[6], hit[7]]) + 1) # subject coverage by a hit is in hit[6:8]
101 bitScores.append(hit[9]) 107 bitScores.append(hit[9])
102 subjectLength = hit [10] # always the same value for a given subject. Stupid but simple 108 subjectLength = hit [10] # always the same value for a given subject. Stupid but simple
103 TotalSubjectCoverage = len ( set (SubjectCoverageList) ) 109 TotalSubjectCoverage = len ( set (SubjectCoverageList) )
104 RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength) 110 RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength)
105 return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), median(bitScores) 111 return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), mean(bitScores)
106 112
107 def GetHitSequence (fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue): 113 def GetHitSequence (fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue):
108 if rightCoordinate > leftCoordinate: 114 if rightCoordinate > leftCoordinate:
109 polarity = "direct" 115 polarity = "direct"
110 else: 116 else:
113 if leftCoordinate - FlankingValue > 0: 119 if leftCoordinate - FlankingValue > 0:
114 leftCoordinate -= FlankingValue 120 leftCoordinate -= FlankingValue
115 else: 121 else:
116 leftCoordinate = 1 122 leftCoordinate = 1
117 return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity) 123 return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity)
124
125 def outputParsing (F, Fasta, results, Xblastdict, fastadict, mode="verbose"):
126 F= open(F, "w")
127 Fasta=open(Fasta, "w")
128 if mode == "verbose":
129 print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n"
130 for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True):
131 print >> F, "#\n# %s" % subject
132 print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"])
133 print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"])
134 print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"])
135 print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"])
136 print >> F, "# Mean Bit Score: %s" % (results[subject]["meanBitScores"])
137 for header in results[subject]["HitDic"]:
138 print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) )
139 print >> Fasta, "" # final carriage return for the sequence
140 for transcript in Xblastdict[subject]:
141 transcriptSize = float(len(fastadict[transcript]))
142 for hit in Xblastdict[subject][transcript]:
143 percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100)
144 Eval, BitScore = hit[8], hit[9]
145 info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore]
146 info = [str(i) for i in info]
147 info = "\t".join(info)
148 print >> F, info
149 else:
150 print >>F, "# subject\tsubject length\tTotal Subject Coverage\tRelative Subject Coverage\tMaximum Bit Score\tMean Bit Score"
151 for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True):
152 line = []
153 line.append(subject)
154 line.append(results[subject]["subjectLength"])
155 line.append(results[subject]["TotalCoverage"])
156 line.append(results[subject]["RelativeSubjectCoverage"])
157 line.append(results[subject]["maxBitScores"])
158 line.append(results[subject]["meanBitScores"])
159 line = [str(i) for i in line]
160 print >> F, "\t".join(line)
161 for header in results[subject]["HitDic"]:
162 print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) )
163 print >> Fasta, "" # final carriage return for the sequence
164 F.close()
165 Fasta.close()
166
167
118 168
119 def __main__ (): 169 def __main__ ():
120 args = Parser() 170 args = Parser()
121 fastadict = getfasta (args.sequences) 171 fastadict = getfasta (args.sequences)
122 Xblastdict = getblast (args.blast) 172 Xblastdict = getblast (args.blast)
123 results = defaultdict(dict) 173 results = defaultdict(dict)
124 F = open(args.tabularOutput, "w")
125 Fasta = open(args.fastaOutput, "w")
126 for subject in Xblastdict: 174 for subject in Xblastdict:
127 results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["medianBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking) 175 results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["meanBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking)
128 ## data output 176 outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict, args.mode)
129 print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n"
130 for subject in sorted (results, key=lambda x: results[x]["TotalCoverage"], reverse=True):
131 print >> F, "#\n# %s" % subject
132 print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"])
133 print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"])
134 print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"])
135 print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"])
136 print >> F, "# Median Bit Score: %s" % (results[subject]["medianBitScores"])
137 for header in results[subject]["HitDic"]:
138 print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) )
139 for transcript in Xblastdict[subject]:
140 transcriptSize = float(len(fastadict[transcript]))
141 for hit in Xblastdict[subject][transcript]:
142 percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100)
143 Eval, BitScore = hit[8], hit[9]
144 info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore]
145 info = [str(i) for i in info]
146 info = "\t".join(info)
147 print >> F, info
148 print >> Fasta, ""
149 F.close()
150 Fasta.close()
151
152 if __name__=="__main__": __main__() 177 if __name__=="__main__": __main__()