Mercurial > repos > drosofff > msp_blastparser_and_hits
comparison BlastParser_and_hits.py @ 2:e0985bad7b92 draft
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
| author | drosofff | 
|---|---|
| date | Fri, 19 Jun 2015 12:53:52 -0400 | 
| parents | 3959a271cf3f | 
| children | 22641bb68b91 | 
   comparison
  equal
  deleted
  inserted
  replaced
| 1:958e769c1c86 | 2:e0985bad7b92 | 
|---|---|
| 10 the_parser = argparse.ArgumentParser() | 10 the_parser = argparse.ArgumentParser() | 
| 11 the_parser.add_argument('--blast', action="store", type=str, help="Path to the blast output (tabular format, 12 column)") | 11 the_parser.add_argument('--blast', action="store", type=str, help="Path to the blast output (tabular format, 12 column)") | 
| 12 the_parser.add_argument('--sequences', action="store", type=str, help="Path to the fasta file with blasted sequences") | 12 the_parser.add_argument('--sequences', action="store", type=str, help="Path to the fasta file with blasted sequences") | 
| 13 the_parser.add_argument('--fastaOutput', action="store", type=str, help="fasta output file of blast hits") | 13 the_parser.add_argument('--fastaOutput', action="store", type=str, help="fasta output file of blast hits") | 
| 14 the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis") | 14 the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis") | 
| 15 the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") | 15 the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") | 
| 16 the_parser.add_argument('--mode', action="store", choices=["verbose", "short"], type=str, help="reporting (verbose) or not reporting (short) oases contigs") | |
| 16 args = the_parser.parse_args() | 17 args = the_parser.parse_args() | 
| 17 if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ): | 18 if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ): | 
| 18 the_parser.error('argument(s) missing, call the -h option of the script') | 19 the_parser.error('argument(s) missing, call the -h option of the script') | 
| 19 if not args.flanking: | 20 if not args.flanking: | 
| 20 args.flanking = 0 | 21 args.flanking = 0 | 
| 26 return None | 27 return None | 
| 27 if len(lst) %2 == 1: | 28 if len(lst) %2 == 1: | 
| 28 return lst[((len(lst)+1)/2)-1] | 29 return lst[((len(lst)+1)/2)-1] | 
| 29 if len(lst) %2 == 0: | 30 if len(lst) %2 == 0: | 
| 30 return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0 | 31 return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0 | 
| 32 | |
| 33 def mean(lst): | |
| 34 if len(lst) < 1: | |
| 35 return 0 | |
| 36 return sum(lst) / float(len(lst)) | |
| 31 | 37 | 
| 32 def getfasta (fastafile): | 38 def getfasta (fastafile): | 
| 33 fastadic = {} | 39 fastadic = {} | 
| 34 for line in open (fastafile): | 40 for line in open (fastafile): | 
| 35 if line[0] == ">": | 41 if line[0] == ">": | 
| 100 SubjectCoverageList += range (min([hit[6], hit[7]]), max([hit[6], hit[7]]) + 1) # subject coverage by a hit is in hit[6:8] | 106 SubjectCoverageList += range (min([hit[6], hit[7]]), max([hit[6], hit[7]]) + 1) # subject coverage by a hit is in hit[6:8] | 
| 101 bitScores.append(hit[9]) | 107 bitScores.append(hit[9]) | 
| 102 subjectLength = hit [10] # always the same value for a given subject. Stupid but simple | 108 subjectLength = hit [10] # always the same value for a given subject. Stupid but simple | 
| 103 TotalSubjectCoverage = len ( set (SubjectCoverageList) ) | 109 TotalSubjectCoverage = len ( set (SubjectCoverageList) ) | 
| 104 RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength) | 110 RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength) | 
| 105 return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), median(bitScores) | 111 return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), mean(bitScores) | 
| 106 | 112 | 
| 107 def GetHitSequence (fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue): | 113 def GetHitSequence (fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue): | 
| 108 if rightCoordinate > leftCoordinate: | 114 if rightCoordinate > leftCoordinate: | 
| 109 polarity = "direct" | 115 polarity = "direct" | 
| 110 else: | 116 else: | 
| 113 if leftCoordinate - FlankingValue > 0: | 119 if leftCoordinate - FlankingValue > 0: | 
| 114 leftCoordinate -= FlankingValue | 120 leftCoordinate -= FlankingValue | 
| 115 else: | 121 else: | 
| 116 leftCoordinate = 1 | 122 leftCoordinate = 1 | 
| 117 return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity) | 123 return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity) | 
| 124 | |
| 125 def outputParsing (F, Fasta, results, Xblastdict, fastadict, mode="verbose"): | |
| 126 F= open(F, "w") | |
| 127 Fasta=open(Fasta, "w") | |
| 128 if mode == "verbose": | |
| 129 print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n" | |
| 130 for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True): | |
| 131 print >> F, "#\n# %s" % subject | |
| 132 print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"]) | |
| 133 print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"]) | |
| 134 print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"]) | |
| 135 print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"]) | |
| 136 print >> F, "# Mean Bit Score: %s" % (results[subject]["meanBitScores"]) | |
| 137 for header in results[subject]["HitDic"]: | |
| 138 print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) | |
| 139 print >> Fasta, "" # final carriage return for the sequence | |
| 140 for transcript in Xblastdict[subject]: | |
| 141 transcriptSize = float(len(fastadict[transcript])) | |
| 142 for hit in Xblastdict[subject][transcript]: | |
| 143 percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100) | |
| 144 Eval, BitScore = hit[8], hit[9] | |
| 145 info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore] | |
| 146 info = [str(i) for i in info] | |
| 147 info = "\t".join(info) | |
| 148 print >> F, info | |
| 149 else: | |
| 150 print >>F, "# subject\tsubject length\tTotal Subject Coverage\tRelative Subject Coverage\tMaximum Bit Score\tMean Bit Score" | |
| 151 for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True): | |
| 152 line = [] | |
| 153 line.append(subject) | |
| 154 line.append(results[subject]["subjectLength"]) | |
| 155 line.append(results[subject]["TotalCoverage"]) | |
| 156 line.append(results[subject]["RelativeSubjectCoverage"]) | |
| 157 line.append(results[subject]["maxBitScores"]) | |
| 158 line.append(results[subject]["meanBitScores"]) | |
| 159 line = [str(i) for i in line] | |
| 160 print >> F, "\t".join(line) | |
| 161 for header in results[subject]["HitDic"]: | |
| 162 print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) | |
| 163 print >> Fasta, "" # final carriage return for the sequence | |
| 164 F.close() | |
| 165 Fasta.close() | |
| 166 | |
| 167 | |
| 118 | 168 | 
| 119 def __main__ (): | 169 def __main__ (): | 
| 120 args = Parser() | 170 args = Parser() | 
| 121 fastadict = getfasta (args.sequences) | 171 fastadict = getfasta (args.sequences) | 
| 122 Xblastdict = getblast (args.blast) | 172 Xblastdict = getblast (args.blast) | 
| 123 results = defaultdict(dict) | 173 results = defaultdict(dict) | 
| 124 F = open(args.tabularOutput, "w") | |
| 125 Fasta = open(args.fastaOutput, "w") | |
| 126 for subject in Xblastdict: | 174 for subject in Xblastdict: | 
| 127 results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["medianBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking) | 175 results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["meanBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking) | 
| 128 ## data output | 176 outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict, args.mode) | 
| 129 print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n" | |
| 130 for subject in sorted (results, key=lambda x: results[x]["TotalCoverage"], reverse=True): | |
| 131 print >> F, "#\n# %s" % subject | |
| 132 print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"]) | |
| 133 print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"]) | |
| 134 print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"]) | |
| 135 print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"]) | |
| 136 print >> F, "# Median Bit Score: %s" % (results[subject]["medianBitScores"]) | |
| 137 for header in results[subject]["HitDic"]: | |
| 138 print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) | |
| 139 for transcript in Xblastdict[subject]: | |
| 140 transcriptSize = float(len(fastadict[transcript])) | |
| 141 for hit in Xblastdict[subject][transcript]: | |
| 142 percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100) | |
| 143 Eval, BitScore = hit[8], hit[9] | |
| 144 info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore] | |
| 145 info = [str(i) for i in info] | |
| 146 info = "\t".join(info) | |
| 147 print >> F, info | |
| 148 print >> Fasta, "" | |
| 149 F.close() | |
| 150 Fasta.close() | |
| 151 | |
| 152 if __name__=="__main__": __main__() | 177 if __name__=="__main__": __main__() | 
