Mercurial > repos > drosofff > msp_blastparser_and_hits

--- a/BlastParser_and_hits.py	Tue Jun 09 04:32:44 2015 -0400
+++ b/BlastParser_and_hits.py	Fri Jun 19 12:53:52 2015 -0400
@@ -12,7 +12,8 @@
     the_parser.add_argument('--sequences', action="store", type=str, help="Path to the fasta file with blasted sequences")
     the_parser.add_argument('--fastaOutput', action="store", type=str, help="fasta output file of blast hits")
     the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis")
-    the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences")
+    the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences")
+    the_parser.add_argument('--mode', action="store", choices=["verbose", "short"], type=str, help="reporting (verbose) or not reporting (short) oases contigs")
     args = the_parser.parse_args()
     if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ):
         the_parser.error('argument(s) missing, call the -h option of the script')
@@ -29,6 +30,11 @@
     if len(lst) %2 == 0:
             return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0

+def mean(lst):
+    if len(lst) < 1:
+        return 0
+    return sum(lst) / float(len(lst))
+
 def getfasta (fastafile):
     fastadic = {}
     for line in open (fastafile):
@@ -102,7 +108,7 @@
             subjectLength = hit [10] # always the same value for a given subject. Stupid but simple
     TotalSubjectCoverage = len ( set (SubjectCoverageList) )
     RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength)
-    return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), median(bitScores)
+    return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), mean(bitScores)

 def GetHitSequence (fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue):
     if rightCoordinate > leftCoordinate:
@@ -115,38 +121,57 @@
     else:
         leftCoordinate = 1
     return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity)
+
+def outputParsing (F, Fasta, results, Xblastdict, fastadict, mode="verbose"):
+    F= open(F, "w")
+    Fasta=open(Fasta, "w")
+    if mode == "verbose":
+        print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n"
+        for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True):
+            print >> F, "#\n# %s" % subject
+            print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"])
+            print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"])
+            print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"])
+            print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"])
+            print >> F, "# Mean Bit Score: %s" % (results[subject]["meanBitScores"])
+            for header in results[subject]["HitDic"]:
+                print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) )
+            print >> Fasta, "" # final carriage return for the sequence
+            for transcript in Xblastdict[subject]:
+                transcriptSize = float(len(fastadict[transcript]))
+                for hit in Xblastdict[subject][transcript]:
+                    percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100)
+                    Eval, BitScore = hit[8], hit[9]
+                    info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore]
+                    info = [str(i) for i in info]
+                    info = "\t".join(info)
+                    print >> F, info
+    else:
+        print >>F, "# subject\tsubject length\tTotal Subject Coverage\tRelative Subject Coverage\tMaximum Bit Score\tMean Bit Score"
+        for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True):
+            line = []
+            line.append(subject)
+            line.append(results[subject]["subjectLength"])
+            line.append(results[subject]["TotalCoverage"])
+            line.append(results[subject]["RelativeSubjectCoverage"])
+            line.append(results[subject]["maxBitScores"])
+            line.append(results[subject]["meanBitScores"])
+            line = [str(i) for i in line]
+            print >> F, "\t".join(line)
+            for header in results[subject]["HitDic"]:
+                print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) )
+            print >> Fasta, "" # final carriage return for the sequence
+    F.close()
+    Fasta.close()
+
+

 def __main__ ():
     args = Parser()
     fastadict = getfasta (args.sequences)
     Xblastdict = getblast (args.blast)
     results = defaultdict(dict)
-    F = open(args.tabularOutput, "w")
-    Fasta = open(args.fastaOutput, "w")
     for subject in Xblastdict:
-        results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["medianBitScores"]  = subjectCoverage(fastadict, Xblastdict, subject, args.flanking)
-    ## data output
-    print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n"
-    for subject in sorted (results, key=lambda x: results[x]["TotalCoverage"], reverse=True):
-        print >> F, "#\n# %s" % subject
-        print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"])
-        print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"])
-        print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"])
-        print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"])
-        print >> F, "# Median Bit Score: %s" % (results[subject]["medianBitScores"])
-        for header in results[subject]["HitDic"]:
-            print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) )
-        for transcript in Xblastdict[subject]:
-            transcriptSize = float(len(fastadict[transcript]))
-            for hit in Xblastdict[subject][transcript]:
-                percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100)
-                Eval, BitScore = hit[8], hit[9]
-                info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore]
-                info = [str(i) for i in info]
-                info = "\t".join(info)
-                print >> F, info
-        print >> Fasta, ""
-    F.close()
-    Fasta.close()
-
+        results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["meanBitScores"]  = subjectCoverage(fastadict, Xblastdict, subject, args.flanking)
+    outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict, args.mode)
 if __name__=="__main__": __main__()
--- a/BlastParser_and_hits.xml	Tue Jun 09 04:32:44 2015 -0400
+++ b/BlastParser_and_hits.xml	Fri Jun 19 12:53:52 2015 -0400
@@ -1,4 +1,4 @@
-<tool id="BlastParser_and_hits" name="Parse blast output and compile hits" version="2.0.0">
+<tool id="BlastParser_and_hits" name="Parse blast output and compile hits" version="2.1.0">
 <description>for virus discovery</description>
 <requirements></requirements>
 <command interpreter="python">
@@ -8,11 +8,16 @@
 	--tabularOutput $tabularOutput
 	--fastaOutput $fastaOutput
 	--flanking $flanking
+	--mode $mode
 </command>
 <inputs>
 	<param name="sequences" type="data" format="fasta"  label="fasta sequences that have been blasted" />
 	<param name="blast" type="data" format="tabular" label="The blast output you wish to parse" />
 	<param name="flanking" type="text" size="5" value= "5" label="Number of flanking nucleotides to add to hits for CAP3 assembly"/>
+	<param name="mode" type="select" label="Verbose or short reporting mode" help="display or not the oases contigs">
+	    <option value="verbose" default="true">verbose</option>
+	    <option value="short">do not report oases contigs</option>
+	</param>
 </inputs>
 <outputs>
 	<data name="tabularOutput" format="tabular" label="blast analysis, by subjects"/>
@@ -24,6 +29,7 @@
         <param ftype="fasta" name="sequences" value="input.fa" />
         <param ftype="tabular" name="blast" value="blast.tab" />
         <param name="flanking" value="5" />
+        <param name="mode" value="verbose" />
         <output name="tabularOutput" ftype="tabular" file="output.tab" />
         <output name="fastaOutput" ftype="fasta" file="output.fa" />
     </test>
@@ -36,7 +42,4 @@
 Parse blast outputs for viruses genome assembly. Outputs analysis and hit sequences for further assembly

 </help>
-<test>
-</test>
-
 </tool>