msp_blastparser_and_hits: BlastParser_and

comparison BlastParser_and_hits.py @ 2:e0985bad7b92 draft

planemo upload for repository https://bitbucket.org/drosofff/gedtools/

author	drosofff
date	Fri, 19 Jun 2015 12:53:52 -0400
parents	3959a271cf3f
children	22641bb68b91

comparison

equal deleted inserted replaced

-:958e769c1c86
+:e0985bad7b92
 the_parser = argparse.ArgumentParser()
 the_parser.add_argument('--blast', action="store", type=str, help="Path to the blast output (tabular format, 12 column)")
 the_parser.add_argument('--sequences', action="store", type=str, help="Path to the fasta file with blasted sequences")
 the_parser.add_argument('--fastaOutput', action="store", type=str, help="fasta output file of blast hits")
 the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis")
 the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences")
+the_parser.add_argument('--mode', action="store", choices=["verbose", "short"], type=str, help="reporting (verbose) or not reporting (short) oases contigs")
 args = the_parser.parse_args()
 if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ):
 the_parser.error('argument(s) missing, call the -h option of the script')
 if not args.flanking:
 args.flanking = 0
 return None
 if len(lst) %2 == 1:
 return lst[((len(lst)+1)/2)-1]
 if len(lst) %2 == 0:
 return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0
def mean(lst):
    """Return the arithmetic mean of the values in ``lst`` as a float.

    An empty sequence yields the integer 0 instead of raising
    ZeroDivisionError.
    """
    count = len(lst)
    if count < 1:
        return 0
    # float() forces true division under Python 2 as well.
    return sum(lst) / float(count)
 def getfasta (fastafile):
 fastadic = {}
 for line in open (fastafile):
 if line[0] == ">":
 SubjectCoverageList += range (min([hit[6], hit[7]]), max([hit[6], hit[7]]) + 1) # subject coverage by a hit is in hit[6:8]
 bitScores.append(hit[9])
 subjectLength = hit [10] # always the same value for a given subject. Stupid but simple
 TotalSubjectCoverage = len ( set (SubjectCoverageList) )
 RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength)
-return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), median(bitScores)
+return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), mean(bitScores)
 def GetHitSequence (fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue):
 if rightCoordinate > leftCoordinate:
 polarity = "direct"
 else:
 if leftCoordinate - FlankingValue > 0:
 leftCoordinate -= FlankingValue
 else:
 leftCoordinate = 1
 return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity)
# outputParsing: write the tabular blast report and the FASTA file of hit sequences.
#
# Parameters, as used by the body below:
#   F          -- path of the tabular report; opened for writing and rebound to the file object.
#   Fasta      -- path of the FASTA output; opened for writing and rebound to the file object.
#   results    -- dict keyed by subject; each value is read through the keys "HitDic",
#                 "subjectLength", "TotalCoverage", "RelativeSubjectCoverage",
#                 "maxBitScores" and "meanBitScores".
#   Xblastdict -- nested mapping subject -> transcript -> list of hits; hits are indexed
#                 positionally (0, 1, 4..9 are read here).
#   fastadict  -- mapping transcript -> sequence; only its length is used.
#   mode       -- "verbose" selects the detailed report; any other value takes the
#                 else branch (one summary row per subject).
#
# Uses the Python 2 "print >> file" statement throughout.
# NOTE(review): indentation was lost in this rendering, so the nesting level of the
# two 'print >> Fasta, ""' statements (inside the header loop or once per subject)
# cannot be determined from this view -- confirm against the real file.
+def outputParsing (F, Fasta, results, Xblastdict, fastadict, mode="verbose"):
# The path arguments are replaced by the opened file handles from here on.
+F= open(F, "w")
+Fasta=open(Fasta, "w")
+if mode == "verbose":
# Column header; the literal trailing "\n" plus print's own newline leaves a blank line after it.
+print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n"
# Subjects are reported from highest to lowest mean bit score.
+for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True):
# Per-subject summary, written as '#'-prefixed lines.
+print >> F, "#\n# %s" % subject
# NOTE(review): "Suject" looks like a typo for "Subject"; left untouched because it is emitted output.
+print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"])
+print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"])
+print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"])
+print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"])
+print >> F, "# Mean Bit Score: %s" % (results[subject]["meanBitScores"])
# One FASTA record per entry of HitDic; insert_newlines is defined elsewhere in the file.
+for header in results[subject]["HitDic"]:
+print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) )
+print >> Fasta, "" # final carriage return for the sequence
# One tab-separated row per hit of every transcript matching this subject.
+for transcript in Xblastdict[subject]:
# Converted to float so the coverage ratio below is not an integer division.
+transcriptSize = float(len(fastadict[transcript]))
+for hit in Xblastdict[subject][transcript]:
# queryCov is abs(hit[5] - hit[4]) as a percentage of the transcript length, one decimal.
+percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100)
+Eval, BitScore = hit[8], hit[9]
+info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore]
+info = [str(i) for i in info]
+info = "\t".join(info)
+print >> F, info
# Any mode other than "verbose": one summary row per subject, no per-hit rows.
+else:
+print >>F, "# subject\tsubject length\tTotal Subject Coverage\tRelative Subject Coverage\tMaximum Bit Score\tMean Bit Score"
+for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True):
+line = []
+line.append(subject)
+line.append(results[subject]["subjectLength"])
+line.append(results[subject]["TotalCoverage"])
+line.append(results[subject]["RelativeSubjectCoverage"])
+line.append(results[subject]["maxBitScores"])
+line.append(results[subject]["meanBitScores"])
+line = [str(i) for i in line]
+print >> F, "\t".join(line)
# The FASTA output is written in this mode too, with the same record format as above.
+for header in results[subject]["HitDic"]:
+print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) )
+print >> Fasta, "" # final carriage return for the sequence
# Both handles are closed explicitly; there is no try/finally, so an exception above leaves them open.
+F.close()
+Fasta.close()
 def __main__ ():
 args = Parser()
 fastadict = getfasta (args.sequences)
 Xblastdict = getblast (args.blast)
 results = defaultdict(dict)
-F = open(args.tabularOutput, "w")
-Fasta = open(args.fastaOutput, "w")
 for subject in Xblastdict:
-results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["medianBitScores"]  = subjectCoverage(fastadict, Xblastdict, subject, args.flanking)
+results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["meanBitScores"]  = subjectCoverage(fastadict, Xblastdict, subject, args.flanking)
-## data output
+outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict, args.mode)
-print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n"
-for subject in sorted (results, key=lambda x: results[x]["TotalCoverage"], reverse=True):
-print >> F, "#\n# %s" % subject
-print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"])
-print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"])
-print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"])
-print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"])
-print >> F, "# Median Bit Score: %s" % (results[subject]["medianBitScores"])
-for header in results[subject]["HitDic"]:
-print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) )
-for transcript in Xblastdict[subject]:
-transcriptSize = float(len(fastadict[transcript]))
-for hit in Xblastdict[subject][transcript]:
-percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100)
-Eval, BitScore = hit[8], hit[9]
-info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore]
-info = [str(i) for i in info]
-info = "\t".join(info)
-print >> F, info
-print >> Fasta, ""
-F.close()
-Fasta.close()
 if __name__=="__main__": __main__()

Mercurial > repos > drosofff > msp_blastparser_and_hits

comparison BlastParser_and_hits.py @ 2:e0985bad7b92 draft