comparison BlastParser_and_hits.py @ 0:3959a271cf3f draft

planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author drosofff
date Tue, 09 Jun 2015 04:15:34 -0400
parents
children e0985bad7b92
comparison
equal deleted inserted replaced
-1:000000000000 0:3959a271cf3f
1 #!/usr/bin/python
2 # blastn blastx parser revised debugged: 3-4-2015. Commit issue.
3 # drosofff@gmail.com
4
5 import sys
6 import argparse
7 from collections import defaultdict
8
9 def Parser():
10 the_parser = argparse.ArgumentParser()
11 the_parser.add_argument('--blast', action="store", type=str, help="Path to the blast output (tabular format, 12 column)")
12 the_parser.add_argument('--sequences', action="store", type=str, help="Path to the fasta file with blasted sequences")
13 the_parser.add_argument('--fastaOutput', action="store", type=str, help="fasta output file of blast hits")
14 the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis")
15 the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences")
16 args = the_parser.parse_args()
17 if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ):
18 the_parser.error('argument(s) missing, call the -h option of the script')
19 if not args.flanking:
20 args.flanking = 0
21 return args
22
23 def median(lst):
24 lst = sorted(lst)
25 if len(lst) < 1:
26 return None
27 if len(lst) %2 == 1:
28 return lst[((len(lst)+1)/2)-1]
29 if len(lst) %2 == 0:
30 return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0
31
32 def getfasta (fastafile):
33 fastadic = {}
34 for line in open (fastafile):
35 if line[0] == ">":
36 header = line[1:-1]
37 fastadic[header] = ""
38 else:
39 fastadic[header] += line
40 for header in fastadic:
41 fastadic[header] = "".join(fastadic[header].split("\n"))
42 return fastadic
43
44 def insert_newlines(string, every=60):
45 lines = []
46 for i in xrange(0, len(string), every):
47 lines.append(string[i:i+every])
48 return '\n'.join(lines)
49
50 def getblast (blastfile):
51 '''blastinfo [0] Percentage of identical matches
52 blastinfo [1] Alignment length
53 blastinfo [2] Number of mismatches
54 blastinfo [3] Number of gap openings
55 blastinfo [4] Start of alignment in query
56 blastinfo [5] End of alignment in query
57 blastinfo [6] Start of alignment in subject (database hit)
58 blastinfo [7] End of alignment in subject (database hit)
59 blastinfo [8] Expectation value (E-value)
60 blastinfo [9] Bit score
61 blastinfo [10] Subject length (NEED TO BE SPECIFIED WHEN RUNNING BLAST) '''
62 blastdic = defaultdict (dict)
63 for line in open (blastfile):
64 fields = line[:-1].split("\t")
65 transcript = fields[0]
66 subject = fields[1]
67 blastinfo = [float(fields[2]) ] # blastinfo[0]
68 blastinfo = blastinfo + [int(i) for i in fields[3:10] ] # blastinfo[1:8] insets 1 to 7
69 blastinfo.append(fields[10]) # blastinfo[8] E-value remains as a string type
70 blastinfo.append(float(fields[11])) # blastinfo[9] Bit score
71 blastinfo.append(int(fields[12])) # blastinfo[10] Subject length MUST BE RETRIEVED THROUGH A 13 COLUMN BLAST OUTPUT
72 try:
73 blastdic[subject][transcript].append(blastinfo)
74 except:
75 blastdic[subject][transcript] = [ blastinfo ]
76 return blastdic
77
78 def getseq (fastadict, transcript, up, down, orientation="direct"):
79 def reverse (seq):
80 revdict = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
81 revseq = [revdict[i] for i in seq[::-1]]
82 return "".join(revseq)
83 pickseq = fastadict[transcript][up-1:down]
84 if orientation == "direct":
85 return pickseq
86 else:
87 return reverse(pickseq)
88
89 def subjectCoverage (fastadict, blastdict, subject, QueriesFlankingNucleotides=0):
90 SubjectCoverageList = []
91 HitDic = {}
92 bitScores = []
93 for transcript in blastdict[subject]:
94 prefix = "%s--%s_" % (subject, transcript)
95 hitNumber = 0
96 for hit in blastdict[subject][transcript]:
97 hitNumber += 1
98 suffix = "hit%s_IdMatch=%s,AligLength=%s,E-val=%s" % (hitNumber, hit[0], hit[1], hit[8])
99 HitDic[prefix+suffix] = GetHitSequence (fastadict, transcript, hit[4], hit[5], QueriesFlankingNucleotides) #query coverage by a hit is in hit[4:6]
100 SubjectCoverageList += range (min([hit[6], hit[7]]), max([hit[6], hit[7]]) + 1) # subject coverage by a hit is in hit[6:8]
101 bitScores.append(hit[9])
102 subjectLength = hit [10] # always the same value for a given subject. Stupid but simple
103 TotalSubjectCoverage = len ( set (SubjectCoverageList) )
104 RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength)
105 return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), median(bitScores)
106
107 def GetHitSequence (fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue):
108 if rightCoordinate > leftCoordinate:
109 polarity = "direct"
110 else:
111 polarity = "reverse"
112 leftCoordinate, rightCoordinate = rightCoordinate, leftCoordinate
113 if leftCoordinate - FlankingValue > 0:
114 leftCoordinate -= FlankingValue
115 else:
116 leftCoordinate = 1
117 return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity)
118
119 def __main__ ():
120 args = Parser()
121 fastadict = getfasta (args.sequences)
122 Xblastdict = getblast (args.blast)
123 results = defaultdict(dict)
124 F = open(args.tabularOutput, "w")
125 Fasta = open(args.fastaOutput, "w")
126 for subject in Xblastdict:
127 results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["medianBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking)
128 ## data output
129 print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n"
130 for subject in sorted (results, key=lambda x: results[x]["TotalCoverage"], reverse=True):
131 print >> F, "#\n# %s" % subject
132 print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"])
133 print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"])
134 print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"])
135 print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"])
136 print >> F, "# Median Bit Score: %s" % (results[subject]["medianBitScores"])
137 for header in results[subject]["HitDic"]:
138 print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) )
139 for transcript in Xblastdict[subject]:
140 transcriptSize = float(len(fastadict[transcript]))
141 for hit in Xblastdict[subject][transcript]:
142 percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100)
143 Eval, BitScore = hit[8], hit[9]
144 info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore]
145 info = [str(i) for i in info]
146 info = "\t".join(info)
147 print >> F, info
148 print >> Fasta, ""
149 F.close()
150 Fasta.close()
151
152 if __name__=="__main__": __main__()