annotate BlastParser_and_hits.py @ 9:86f424753b2d draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
author drosofff
date Fri, 15 Jan 2016 12:29:10 -0500
parents efb051ac0da9
children cf78a10bcfcf
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
1 #!/usr/bin/python
9
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
2 # blastn tblastn blastx parser revised 14-1-2016.
0
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
3 # drosofff@gmail.com
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
4
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
5 import sys
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
6 import argparse
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
7 from collections import defaultdict
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
8
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
9 def Parser():
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
10 the_parser = argparse.ArgumentParser()
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
11 the_parser.add_argument('--blast', action="store", type=str, help="Path to the blast output (tabular format, 12 column)")
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
12 the_parser.add_argument('--sequences', action="store", type=str, help="Path to the fasta file with blasted sequences")
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
13 the_parser.add_argument('--fastaOutput', action="store", type=str, help="fasta output file of blast hits")
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
14 the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis")
2
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
15 the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences")
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
16 the_parser.add_argument('--mode', action="store", choices=["verbose", "short"], type=str, help="reporting (verbose) or not reporting (short) oases contigs")
4
22641bb68b91 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 2
diff changeset
17 the_parser.add_argument('--filter_relativeCov', action="store", type=float, default=0, help="filter out relative coverages below the specified ratio (float number)")
8
efb051ac0da9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit 3d9ddd0f6f3c3b97a3bebf52646731ad6771e178
drosofff
parents: 7
diff changeset
18 the_parser.add_argument('--filter_maxScore', action="store", type=float, default=0, help="filter out best BitScores below the specified float number")
efb051ac0da9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit 3d9ddd0f6f3c3b97a3bebf52646731ad6771e178
drosofff
parents: 7
diff changeset
19 the_parser.add_argument('--filter_meanScore', action="store", type=float, default=0, help="filter out mean BitScores below the specified float number")
9
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
20 the_parser.add_argument('--filter_term_in', action="store", type=str, default="", help="select the specified term in the subject list")
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
21 the_parser.add_argument('--filter_term_out', action="store", type=str, default="", help="exclude the specified term from the subject list")
6
3f7cfa1cf90c planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit e511de70e387d5033ab91f37c8ddb5665fa61a87
drosofff
parents: 4
diff changeset
22 the_parser.add_argument('--al_sequences', action="store", type=str, help="sequences that have been blast aligned")
3f7cfa1cf90c planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit e511de70e387d5033ab91f37c8ddb5665fa61a87
drosofff
parents: 4
diff changeset
23 the_parser.add_argument('--un_sequences', action="store", type=str, help="sequences that have not been blast aligned")
0
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
24 args = the_parser.parse_args()
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
25 if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ):
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
26 the_parser.error('argument(s) missing, call the -h option of the script')
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
27 if not args.flanking:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
28 args.flanking = 0
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
29 return args
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
30
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
31 def median(lst):
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
32 lst = sorted(lst)
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
33 if len(lst) < 1:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
34 return None
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
35 if len(lst) %2 == 1:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
36 return lst[((len(lst)+1)/2)-1]
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
37 if len(lst) %2 == 0:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
38 return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
39
2
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
40 def mean(lst):
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
41 if len(lst) < 1:
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
42 return 0
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
43 return sum(lst) / float(len(lst))
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
44
0
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
45 def getfasta (fastafile):
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
46 fastadic = {}
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
47 for line in open (fastafile):
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
48 if line[0] == ">":
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
49 header = line[1:-1]
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
50 fastadic[header] = ""
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
51 else:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
52 fastadic[header] += line
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
53 for header in fastadic:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
54 fastadic[header] = "".join(fastadic[header].split("\n"))
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
55 return fastadic
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
56
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
57 def insert_newlines(string, every=60):
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
58 lines = []
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
59 for i in xrange(0, len(string), every):
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
60 lines.append(string[i:i+every])
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
61 return '\n'.join(lines)
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
62
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
63 def getblast (blastfile):
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
64 '''blastinfo [0] Percentage of identical matches
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
65 blastinfo [1] Alignment length
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
66 blastinfo [2] Number of mismatches
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
67 blastinfo [3] Number of gap openings
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
68 blastinfo [4] Start of alignment in query
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
69 blastinfo [5] End of alignment in query
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
70 blastinfo [6] Start of alignment in subject (database hit)
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
71 blastinfo [7] End of alignment in subject (database hit)
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
72 blastinfo [8] Expectation value (E-value)
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
73 blastinfo [9] Bit score
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
74 blastinfo [10] Subject length (NEED TO BE SPECIFIED WHEN RUNNING BLAST) '''
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
75 blastdic = defaultdict (dict)
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
76 for line in open (blastfile):
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
77 fields = line[:-1].split("\t")
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
78 transcript = fields[0]
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
79 subject = fields[1]
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
80 blastinfo = [float(fields[2]) ] # blastinfo[0]
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
81 blastinfo = blastinfo + [int(i) for i in fields[3:10] ] # blastinfo[1:8] insets 1 to 7
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
82 blastinfo.append(fields[10]) # blastinfo[8] E-value remains as a string type
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
83 blastinfo.append(float(fields[11])) # blastinfo[9] Bit score
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
84 blastinfo.append(int(fields[12])) # blastinfo[10] Subject length MUST BE RETRIEVED THROUGH A 13 COLUMN BLAST OUTPUT
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
85 try:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
86 blastdic[subject][transcript].append(blastinfo)
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
87 except:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
88 blastdic[subject][transcript] = [ blastinfo ]
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
89 return blastdic
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
90
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
91 def getseq (fastadict, transcript, up, down, orientation="direct"):
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
92 def reverse (seq):
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
93 revdict = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
94 revseq = [revdict[i] for i in seq[::-1]]
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
95 return "".join(revseq)
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
96 pickseq = fastadict[transcript][up-1:down]
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
97 if orientation == "direct":
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
98 return pickseq
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
99 else:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
100 return reverse(pickseq)
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
101
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
102 def subjectCoverage (fastadict, blastdict, subject, QueriesFlankingNucleotides=0):
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
103 SubjectCoverageList = []
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
104 HitDic = {}
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
105 bitScores = []
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
106 for transcript in blastdict[subject]:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
107 prefix = "%s--%s_" % (subject, transcript)
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
108 hitNumber = 0
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
109 for hit in blastdict[subject][transcript]:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
110 hitNumber += 1
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
111 suffix = "hit%s_IdMatch=%s,AligLength=%s,E-val=%s" % (hitNumber, hit[0], hit[1], hit[8])
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
112 HitDic[prefix+suffix] = GetHitSequence (fastadict, transcript, hit[4], hit[5], QueriesFlankingNucleotides) #query coverage by a hit is in hit[4:6]
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
113 SubjectCoverageList += range (min([hit[6], hit[7]]), max([hit[6], hit[7]]) + 1) # subject coverage by a hit is in hit[6:8]
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
114 bitScores.append(hit[9])
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
115 subjectLength = hit [10] # always the same value for a given subject. Stupid but simple
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
116 TotalSubjectCoverage = len ( set (SubjectCoverageList) )
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
117 RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength)
2
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
118 return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), mean(bitScores)
0
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
119
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
120 def GetHitSequence (fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue):
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
121 if rightCoordinate > leftCoordinate:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
122 polarity = "direct"
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
123 else:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
124 polarity = "reverse"
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
125 leftCoordinate, rightCoordinate = rightCoordinate, leftCoordinate
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
126 if leftCoordinate - FlankingValue > 0:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
127 leftCoordinate -= FlankingValue
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
128 else:
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
129 leftCoordinate = 1
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
130 return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity)
2
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
131
9
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
132 def outputParsing (F, Fasta, results, Xblastdict, fastadict, filter_relativeCov=0, filter_maxScore=0, filter_meanScore=0, filter_term_in="", filter_term_out="", mode="verbose"):
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
133 def filter_results (results, filter_relativeCov=0, filter_maxScore=0, filter_meanScore=0, filter_term_in="", filter_term_out=""):
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
134 print "###", filter_term_in
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
135 for subject in results.keys():
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
136 if results[subject]["RelativeSubjectCoverage"]<filter_relativeCov:
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
137 del results[subject]
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
138 continue
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
139 if results[subject]["maxBitScores"]<filter_maxScore:
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
140 del results[subject]
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
141 continue
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
142 if results[subject]["meanBitScores"]<filter_meanScore:
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
143 del results[subject]
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
144 continue
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
145 if filter_term_in in subject:
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
146 pass
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
147 else:
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
148 del results[subject]
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
149 continue
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
150 if filter_term_out and filter_term_out in subject:
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
151 del results[subject]
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
152 continue
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
153 return results
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
154
2
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
155 F= open(F, "w")
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
156 Fasta=open(Fasta, "w")
7
72ef366ef55e planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 319fbe5bf4fe32ab9329a411a0d0bd70b83efd9a
drosofff
parents: 6
diff changeset
157 blasted_transcripts = []
9
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
158 filter_results (results, filter_relativeCov, filter_maxScore, filter_meanScore, filter_term_in, filter_term_out)
7
72ef366ef55e planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 319fbe5bf4fe32ab9329a411a0d0bd70b83efd9a
drosofff
parents: 6
diff changeset
159 for subject in results:
72ef366ef55e planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 319fbe5bf4fe32ab9329a411a0d0bd70b83efd9a
drosofff
parents: 6
diff changeset
160 for transcript in Xblastdict[subject]:
72ef366ef55e planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 319fbe5bf4fe32ab9329a411a0d0bd70b83efd9a
drosofff
parents: 6
diff changeset
161 blasted_transcripts.append(transcript)
72ef366ef55e planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 319fbe5bf4fe32ab9329a411a0d0bd70b83efd9a
drosofff
parents: 6
diff changeset
162 blasted_transcripts = list( set( blasted_transcripts))
2
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
163 if mode == "verbose":
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
164 print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n"
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
165 for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True):
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
166 print >> F, "#\n# %s" % subject
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
167 print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"])
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
168 print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"])
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
169 print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"])
8
efb051ac0da9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit 3d9ddd0f6f3c3b97a3bebf52646731ad6771e178
drosofff
parents: 7
diff changeset
170 print >> F, "# Best Bit Score: %s" % (results[subject]["maxBitScores"])
2
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
171 print >> F, "# Mean Bit Score: %s" % (results[subject]["meanBitScores"])
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
172 for header in results[subject]["HitDic"]:
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
173 print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) )
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
174 print >> Fasta, "" # final carriage return for the sequence
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
175 for transcript in Xblastdict[subject]:
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
176 transcriptSize = float(len(fastadict[transcript]))
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
177 for hit in Xblastdict[subject][transcript]:
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
178 percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100)
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
179 Eval, BitScore = hit[8], hit[9]
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
180 info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore]
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
181 info = [str(i) for i in info]
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
182 info = "\t".join(info)
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
183 print >> F, info
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
184 else:
8
efb051ac0da9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit 3d9ddd0f6f3c3b97a3bebf52646731ad6771e178
drosofff
parents: 7
diff changeset
185 print >>F, "# subject\tsubject length\tTotal Subject Coverage\tRelative Subject Coverage\tBest Bit Score\tMean Bit Score"
2
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
186 for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True):
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
187 line = []
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
188 line.append(subject)
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
189 line.append(results[subject]["subjectLength"])
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
190 line.append(results[subject]["TotalCoverage"])
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
191 line.append(results[subject]["RelativeSubjectCoverage"])
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
192 line.append(results[subject]["maxBitScores"])
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
193 line.append(results[subject]["meanBitScores"])
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
194 line = [str(i) for i in line]
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
195 print >> F, "\t".join(line)
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
196 for header in results[subject]["HitDic"]:
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
197 print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) )
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
198 print >> Fasta, "" # final carriage return for the sequence
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
199 F.close()
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
200 Fasta.close()
7
72ef366ef55e planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 319fbe5bf4fe32ab9329a411a0d0bd70b83efd9a
drosofff
parents: 6
diff changeset
201 return blasted_transcripts
2
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
202
7
72ef366ef55e planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 319fbe5bf4fe32ab9329a411a0d0bd70b83efd9a
drosofff
parents: 6
diff changeset
203 def dispatch_sequences (fastadict, blasted_transcripts, matched_sequences, unmatched_sequences):
6
3f7cfa1cf90c planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit e511de70e387d5033ab91f37c8ddb5665fa61a87
drosofff
parents: 4
diff changeset
204 '''to output the sequences that matched and did not matched in the blast'''
3f7cfa1cf90c planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit e511de70e387d5033ab91f37c8ddb5665fa61a87
drosofff
parents: 4
diff changeset
205 F_matched = open (matched_sequences, "w")
3f7cfa1cf90c planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit e511de70e387d5033ab91f37c8ddb5665fa61a87
drosofff
parents: 4
diff changeset
206 F_unmatched = open (unmatched_sequences, "w")
3f7cfa1cf90c planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit e511de70e387d5033ab91f37c8ddb5665fa61a87
drosofff
parents: 4
diff changeset
207 for transcript in fastadict:
7
72ef366ef55e planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 319fbe5bf4fe32ab9329a411a0d0bd70b83efd9a
drosofff
parents: 6
diff changeset
208 if transcript in blasted_transcripts: # le list of blasted_transcripts is generated by the outputParsing function
6
3f7cfa1cf90c planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit e511de70e387d5033ab91f37c8ddb5665fa61a87
drosofff
parents: 4
diff changeset
209 print >> F_matched, ">%s\n%s" % (transcript, insert_newlines(fastadict[transcript]) )
3f7cfa1cf90c planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit e511de70e387d5033ab91f37c8ddb5665fa61a87
drosofff
parents: 4
diff changeset
210 else:
3f7cfa1cf90c planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit e511de70e387d5033ab91f37c8ddb5665fa61a87
drosofff
parents: 4
diff changeset
211 print >> F_unmatched, ">%s\n%s" % (transcript, insert_newlines(fastadict[transcript]) )
3f7cfa1cf90c planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit e511de70e387d5033ab91f37c8ddb5665fa61a87
drosofff
parents: 4
diff changeset
212 F_matched.close()
3f7cfa1cf90c planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit e511de70e387d5033ab91f37c8ddb5665fa61a87
drosofff
parents: 4
diff changeset
213 F_unmatched.close()
3f7cfa1cf90c planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit e511de70e387d5033ab91f37c8ddb5665fa61a87
drosofff
parents: 4
diff changeset
214 return
0
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
215
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
216 def __main__ ():
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
217 args = Parser()
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
218 fastadict = getfasta (args.sequences)
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
219 Xblastdict = getblast (args.blast)
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
220 results = defaultdict(dict)
3959a271cf3f planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
221 for subject in Xblastdict:
2
e0985bad7b92 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents: 0
diff changeset
222 results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["meanBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking)
7
72ef366ef55e planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 319fbe5bf4fe32ab9329a411a0d0bd70b83efd9a
drosofff
parents: 6
diff changeset
223 blasted_transcripts = outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict,
72ef366ef55e planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 319fbe5bf4fe32ab9329a411a0d0bd70b83efd9a
drosofff
parents: 6
diff changeset
224 filter_relativeCov=args.filter_relativeCov, filter_maxScore=args.filter_maxScore,
9
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
225 filter_meanScore=args.filter_meanScore, filter_term_in=args.filter_term_in,
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
226 filter_term_out=args.filter_term_out, mode=args.mode)
7
72ef366ef55e planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 319fbe5bf4fe32ab9329a411a0d0bd70b83efd9a
drosofff
parents: 6
diff changeset
227 dispatch_sequences (fastadict, blasted_transcripts, args.al_sequences, args.un_sequences)
72ef366ef55e planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 319fbe5bf4fe32ab9329a411a0d0bd70b83efd9a
drosofff
parents: 6
diff changeset
228
9
86f424753b2d planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_blastparser_and_hits commit e842488e979d8a00b9646061573355cb427bc89c
drosofff
parents: 8
diff changeset
229 if __name__=="__main__": __main__()