# HG changeset patch # User drosofff # Date 1434732832 14400 # Node ID e0985bad7b927956e3d0372bb3abf487a3ab6098 # Parent 958e769c1c86ab55930713f823c5df0744a336f4 planemo upload for repository https://bitbucket.org/drosofff/gedtools/ diff -r 958e769c1c86 -r e0985bad7b92 BlastParser_and_hits.py --- a/BlastParser_and_hits.py Tue Jun 09 04:32:44 2015 -0400 +++ b/BlastParser_and_hits.py Fri Jun 19 12:53:52 2015 -0400 @@ -12,7 +12,8 @@ the_parser.add_argument('--sequences', action="store", type=str, help="Path to the fasta file with blasted sequences") the_parser.add_argument('--fastaOutput', action="store", type=str, help="fasta output file of blast hits") the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis") - the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") + the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") + the_parser.add_argument('--mode', action="store", choices=["verbose", "short"], type=str, help="reporting (verbose) or not reporting (short) oases contigs") args = the_parser.parse_args() if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ): the_parser.error('argument(s) missing, call the -h option of the script') @@ -29,6 +30,11 @@ if len(lst) %2 == 0: return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0 +def mean(lst): + if len(lst) < 1: + return 0 + return sum(lst) / float(len(lst)) + def getfasta (fastafile): fastadic = {} for line in open (fastafile): @@ -102,7 +108,7 @@ subjectLength = hit [10] # always the same value for a given subject. Stupid but simple TotalSubjectCoverage = len ( set (SubjectCoverageList) ) RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength) - return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), median(bitScores) + return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), mean(bitScores) def GetHitSequence (fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue): if rightCoordinate > leftCoordinate: @@ -115,38 +121,57 @@ else: leftCoordinate = 1 return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity) + +def outputParsing (F, Fasta, results, Xblastdict, fastadict, mode="verbose"): + F= open(F, "w") + Fasta=open(Fasta, "w") + if mode == "verbose": + print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n" + for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True): + print >> F, "#\n# %s" % subject + print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"]) + print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"]) + print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"]) + print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"]) + print >> F, "# Mean Bit Score: %s" % (results[subject]["meanBitScores"]) + for header in results[subject]["HitDic"]: + print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) + print >> Fasta, "" # final carriage return for the sequence + for transcript in Xblastdict[subject]: + transcriptSize = float(len(fastadict[transcript])) + for hit in Xblastdict[subject][transcript]: + percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100) + Eval, BitScore = hit[8], hit[9] + info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore] + info = [str(i) for i in info] + info = "\t".join(info) + print >> F, info + else: + print >>F, "# subject\tsubject length\tTotal Subject Coverage\tRelative Subject Coverage\tMaximum Bit Score\tMean Bit Score" + for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True): + line = [] + line.append(subject) + line.append(results[subject]["subjectLength"]) + line.append(results[subject]["TotalCoverage"]) + line.append(results[subject]["RelativeSubjectCoverage"]) + line.append(results[subject]["maxBitScores"]) + line.append(results[subject]["meanBitScores"]) + line = [str(i) for i in line] + print >> F, "\t".join(line) + for header in results[subject]["HitDic"]: + print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) + print >> Fasta, "" # final carriage return for the sequence + F.close() + Fasta.close() + + def __main__ (): args = Parser() fastadict = getfasta (args.sequences) Xblastdict = getblast (args.blast) results = defaultdict(dict) - F = open(args.tabularOutput, "w") - Fasta = open(args.fastaOutput, "w") for subject in Xblastdict: - results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["medianBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking) - ## data output - print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n" - for subject in sorted (results, key=lambda x: results[x]["TotalCoverage"], reverse=True): - print >> F, "#\n# %s" % subject - print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"]) - print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"]) - print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"]) - print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"]) - print >> F, "# Median Bit Score: %s" % (results[subject]["medianBitScores"]) - for header in results[subject]["HitDic"]: - print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) - for transcript in Xblastdict[subject]: - transcriptSize = float(len(fastadict[transcript])) - for hit in Xblastdict[subject][transcript]: - percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100) - Eval, BitScore = hit[8], hit[9] - info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore] - info = [str(i) for i in info] - info = "\t".join(info) - print >> F, info - print >> Fasta, "" - F.close() - Fasta.close() - + results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["meanBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking) + outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict, args.mode) if __name__=="__main__": __main__() diff -r 958e769c1c86 -r e0985bad7b92 BlastParser_and_hits.xml --- a/BlastParser_and_hits.xml Tue Jun 09 04:32:44 2015 -0400 +++ b/BlastParser_and_hits.xml Fri Jun 19 12:53:52 2015 -0400 @@ -1,4 +1,4 @@ - + for virus discovery @@ -8,11 +8,16 @@ --tabularOutput $tabularOutput --fastaOutput $fastaOutput --flanking $flanking + --mode $mode + + + + @@ -24,6 +29,7 @@ + @@ -36,7 +42,4 @@ Parse blast outputs for viruses genome assembly. Outputs analysis and hit sequences for further assembly - - -