Mercurial > repos > drosofff > msp_blastparser_and_hits
changeset 2:e0985bad7b92 draft
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author | drosofff |
---|---|
date | Fri, 19 Jun 2015 12:53:52 -0400 |
parents | 958e769c1c86 |
children | fa936e163bbd |
files | BlastParser_and_hits.py BlastParser_and_hits.xml |
diffstat | 2 files changed, 61 insertions(+), 33 deletions(-) [+] |
line wrap: on
line diff
--- a/BlastParser_and_hits.py Tue Jun 09 04:32:44 2015 -0400 +++ b/BlastParser_and_hits.py Fri Jun 19 12:53:52 2015 -0400 @@ -12,7 +12,8 @@ the_parser.add_argument('--sequences', action="store", type=str, help="Path to the fasta file with blasted sequences") the_parser.add_argument('--fastaOutput', action="store", type=str, help="fasta output file of blast hits") the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis") - the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") + the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") + the_parser.add_argument('--mode', action="store", choices=["verbose", "short"], type=str, help="reporting (verbose) or not reporting (short) oases contigs") args = the_parser.parse_args() if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ): the_parser.error('argument(s) missing, call the -h option of the script') @@ -29,6 +30,11 @@ if len(lst) %2 == 0: return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0 +def mean(lst): + if len(lst) < 1: + return 0 + return sum(lst) / float(len(lst)) + def getfasta (fastafile): fastadic = {} for line in open (fastafile): @@ -102,7 +108,7 @@ subjectLength = hit [10] # always the same value for a given subject. Stupid but simple TotalSubjectCoverage = len ( set (SubjectCoverageList) ) RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength) - return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), median(bitScores) + return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), mean(bitScores) def GetHitSequence (fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue): if rightCoordinate > leftCoordinate: @@ -115,38 +121,57 @@ else: leftCoordinate = 1 return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity) + +def outputParsing (F, Fasta, results, Xblastdict, fastadict, mode="verbose"): + F= open(F, "w") + Fasta=open(Fasta, "w") + if mode == "verbose": + print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n" + for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True): + print >> F, "#\n# %s" % subject + print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"]) + print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"]) + print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"]) + print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"]) + print >> F, "# Mean Bit Score: %s" % (results[subject]["meanBitScores"]) + for header in results[subject]["HitDic"]: + print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) + print >> Fasta, "" # final carriage return for the sequence + for transcript in Xblastdict[subject]: + transcriptSize = float(len(fastadict[transcript])) + for hit in Xblastdict[subject][transcript]: + percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100) + Eval, BitScore = hit[8], hit[9] + info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore] + info = [str(i) for i in info] + info = "\t".join(info) + print >> F, info + else: + print >>F, "# subject\tsubject length\tTotal Subject Coverage\tRelative Subject Coverage\tMaximum Bit Score\tMean Bit Score" + for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True): + line = [] + line.append(subject) + line.append(results[subject]["subjectLength"]) + line.append(results[subject]["TotalCoverage"]) + line.append(results[subject]["RelativeSubjectCoverage"]) + line.append(results[subject]["maxBitScores"]) + line.append(results[subject]["meanBitScores"]) + line = [str(i) for i in line] + print >> F, "\t".join(line) + for header in results[subject]["HitDic"]: + print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) + print >> Fasta, "" # final carriage return for the sequence + F.close() + Fasta.close() + + def __main__ (): args = Parser() fastadict = getfasta (args.sequences) Xblastdict = getblast (args.blast) results = defaultdict(dict) - F = open(args.tabularOutput, "w") - Fasta = open(args.fastaOutput, "w") for subject in Xblastdict: - results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["medianBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking) - ## data output - print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n" - for subject in sorted (results, key=lambda x: results[x]["TotalCoverage"], reverse=True): - print >> F, "#\n# %s" % subject - print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"]) - print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"]) - print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"]) - print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"]) - print >> F, "# Median Bit Score: %s" % (results[subject]["medianBitScores"]) - for header in results[subject]["HitDic"]: - print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) - for transcript in Xblastdict[subject]: - transcriptSize = float(len(fastadict[transcript])) - for hit in Xblastdict[subject][transcript]: - percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100) - Eval, BitScore = hit[8], hit[9] - info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore] - info = [str(i) for i in info] - info = "\t".join(info) - print >> F, info - print >> Fasta, "" - F.close() - Fasta.close() - + results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["meanBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking) + outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict, args.mode) if __name__=="__main__": __main__()
--- a/BlastParser_and_hits.xml Tue Jun 09 04:32:44 2015 -0400 +++ b/BlastParser_and_hits.xml Fri Jun 19 12:53:52 2015 -0400 @@ -1,4 +1,4 @@ -<tool id="BlastParser_and_hits" name="Parse blast output and compile hits" version="2.0.0"> +<tool id="BlastParser_and_hits" name="Parse blast output and compile hits" version="2.1.0"> <description>for virus discovery</description> <requirements></requirements> <command interpreter="python"> @@ -8,11 +8,16 @@ --tabularOutput $tabularOutput --fastaOutput $fastaOutput --flanking $flanking + --mode $mode </command> <inputs> <param name="sequences" type="data" format="fasta" label="fasta sequences that have been blasted" /> <param name="blast" type="data" format="tabular" label="The blast output you wish to parse" /> <param name="flanking" type="text" size="5" value= "5" label="Number of flanking nucleotides to add to hits for CAP3 assembly"/> + <param name="mode" type="select" label="Verbose or short reporting mode" help="display or not the oases contigs"> + <option value="verbose" default="true">verbose</option> + <option value="short">do not report oases contigs</option> + </param> </inputs> <outputs> <data name="tabularOutput" format="tabular" label="blast analysis, by subjects"/> @@ -24,6 +29,7 @@ <param ftype="fasta" name="sequences" value="input.fa" /> <param ftype="tabular" name="blast" value="blast.tab" /> <param name="flanking" value="5" /> + <param name="mode" value="verbose" /> <output name="tabularOutput" ftype="tabular" file="output.tab" /> <output name="fastaOutput" ftype="fasta" file="output.fa" /> </test> @@ -36,7 +42,4 @@ Parse blast outputs for viruses genome assembly. Outputs analysis and hit sequences for further assembly </help> -<test> -</test> - </tool>