Mercurial > repos > drosofff > msp_blastparser_and_hits
comparison BlastParser_and_hits.py @ 0:3959a271cf3f draft
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
| author | drosofff |
|---|---|
| date | Tue, 09 Jun 2015 04:15:34 -0400 |
| parents | |
| children | e0985bad7b92 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:3959a271cf3f |
|---|---|
| 1 #!/usr/bin/python | |
| 2 # blastn blastx parser revised debugged: 3-4-2015. Commit issue. | |
| 3 # drosofff@gmail.com | |
| 4 | |
| 5 import sys | |
| 6 import argparse | |
| 7 from collections import defaultdict | |
| 8 | |
| 9 def Parser(): | |
| 10 the_parser = argparse.ArgumentParser() | |
| 11 the_parser.add_argument('--blast', action="store", type=str, help="Path to the blast output (tabular format, 12 column)") | |
| 12 the_parser.add_argument('--sequences', action="store", type=str, help="Path to the fasta file with blasted sequences") | |
| 13 the_parser.add_argument('--fastaOutput', action="store", type=str, help="fasta output file of blast hits") | |
| 14 the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis") | |
| 15 the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") | |
| 16 args = the_parser.parse_args() | |
| 17 if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ): | |
| 18 the_parser.error('argument(s) missing, call the -h option of the script') | |
| 19 if not args.flanking: | |
| 20 args.flanking = 0 | |
| 21 return args | |
| 22 | |
| 23 def median(lst): | |
| 24 lst = sorted(lst) | |
| 25 if len(lst) < 1: | |
| 26 return None | |
| 27 if len(lst) %2 == 1: | |
| 28 return lst[((len(lst)+1)/2)-1] | |
| 29 if len(lst) %2 == 0: | |
| 30 return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0 | |
| 31 | |
| 32 def getfasta (fastafile): | |
| 33 fastadic = {} | |
| 34 for line in open (fastafile): | |
| 35 if line[0] == ">": | |
| 36 header = line[1:-1] | |
| 37 fastadic[header] = "" | |
| 38 else: | |
| 39 fastadic[header] += line | |
| 40 for header in fastadic: | |
| 41 fastadic[header] = "".join(fastadic[header].split("\n")) | |
| 42 return fastadic | |
| 43 | |
| 44 def insert_newlines(string, every=60): | |
| 45 lines = [] | |
| 46 for i in xrange(0, len(string), every): | |
| 47 lines.append(string[i:i+every]) | |
| 48 return '\n'.join(lines) | |
| 49 | |
| 50 def getblast (blastfile): | |
| 51 '''blastinfo [0] Percentage of identical matches | |
| 52 blastinfo [1] Alignment length | |
| 53 blastinfo [2] Number of mismatches | |
| 54 blastinfo [3] Number of gap openings | |
| 55 blastinfo [4] Start of alignment in query | |
| 56 blastinfo [5] End of alignment in query | |
| 57 blastinfo [6] Start of alignment in subject (database hit) | |
| 58 blastinfo [7] End of alignment in subject (database hit) | |
| 59 blastinfo [8] Expectation value (E-value) | |
| 60 blastinfo [9] Bit score | |
| 61 blastinfo [10] Subject length (NEED TO BE SPECIFIED WHEN RUNNING BLAST) ''' | |
| 62 blastdic = defaultdict (dict) | |
| 63 for line in open (blastfile): | |
| 64 fields = line[:-1].split("\t") | |
| 65 transcript = fields[0] | |
| 66 subject = fields[1] | |
| 67 blastinfo = [float(fields[2]) ] # blastinfo[0] | |
| 68 blastinfo = blastinfo + [int(i) for i in fields[3:10] ] # blastinfo[1:8] insets 1 to 7 | |
| 69 blastinfo.append(fields[10]) # blastinfo[8] E-value remains as a string type | |
| 70 blastinfo.append(float(fields[11])) # blastinfo[9] Bit score | |
| 71 blastinfo.append(int(fields[12])) # blastinfo[10] Subject length MUST BE RETRIEVED THROUGH A 13 COLUMN BLAST OUTPUT | |
| 72 try: | |
| 73 blastdic[subject][transcript].append(blastinfo) | |
| 74 except: | |
| 75 blastdic[subject][transcript] = [ blastinfo ] | |
| 76 return blastdic | |
| 77 | |
| 78 def getseq (fastadict, transcript, up, down, orientation="direct"): | |
| 79 def reverse (seq): | |
| 80 revdict = {"A":"T","T":"A","G":"C","C":"G","N":"N"} | |
| 81 revseq = [revdict[i] for i in seq[::-1]] | |
| 82 return "".join(revseq) | |
| 83 pickseq = fastadict[transcript][up-1:down] | |
| 84 if orientation == "direct": | |
| 85 return pickseq | |
| 86 else: | |
| 87 return reverse(pickseq) | |
| 88 | |
| 89 def subjectCoverage (fastadict, blastdict, subject, QueriesFlankingNucleotides=0): | |
| 90 SubjectCoverageList = [] | |
| 91 HitDic = {} | |
| 92 bitScores = [] | |
| 93 for transcript in blastdict[subject]: | |
| 94 prefix = "%s--%s_" % (subject, transcript) | |
| 95 hitNumber = 0 | |
| 96 for hit in blastdict[subject][transcript]: | |
| 97 hitNumber += 1 | |
| 98 suffix = "hit%s_IdMatch=%s,AligLength=%s,E-val=%s" % (hitNumber, hit[0], hit[1], hit[8]) | |
| 99 HitDic[prefix+suffix] = GetHitSequence (fastadict, transcript, hit[4], hit[5], QueriesFlankingNucleotides) #query coverage by a hit is in hit[4:6] | |
| 100 SubjectCoverageList += range (min([hit[6], hit[7]]), max([hit[6], hit[7]]) + 1) # subject coverage by a hit is in hit[6:8] | |
| 101 bitScores.append(hit[9]) | |
| 102 subjectLength = hit [10] # always the same value for a given subject. Stupid but simple | |
| 103 TotalSubjectCoverage = len ( set (SubjectCoverageList) ) | |
| 104 RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength) | |
| 105 return HitDic, subjectLength, TotalSubjectCoverage, RelativeSubjectCoverage, max(bitScores), median(bitScores) | |
| 106 | |
| 107 def GetHitSequence (fastadict, FastaHeader, leftCoordinate, rightCoordinate, FlankingValue): | |
| 108 if rightCoordinate > leftCoordinate: | |
| 109 polarity = "direct" | |
| 110 else: | |
| 111 polarity = "reverse" | |
| 112 leftCoordinate, rightCoordinate = rightCoordinate, leftCoordinate | |
| 113 if leftCoordinate - FlankingValue > 0: | |
| 114 leftCoordinate -= FlankingValue | |
| 115 else: | |
| 116 leftCoordinate = 1 | |
| 117 return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity) | |
| 118 | |
| 119 def __main__ (): | |
| 120 args = Parser() | |
| 121 fastadict = getfasta (args.sequences) | |
| 122 Xblastdict = getblast (args.blast) | |
| 123 results = defaultdict(dict) | |
| 124 F = open(args.tabularOutput, "w") | |
| 125 Fasta = open(args.fastaOutput, "w") | |
| 126 for subject in Xblastdict: | |
| 127 results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["medianBitScores"] = subjectCoverage(fastadict, Xblastdict, subject, args.flanking) | |
| 128 ## data output | |
| 129 print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n" | |
| 130 for subject in sorted (results, key=lambda x: results[x]["TotalCoverage"], reverse=True): | |
| 131 print >> F, "#\n# %s" % subject | |
| 132 print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"]) | |
| 133 print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"]) | |
| 134 print >> F, "# Relative Subject Coverage: %s" % (results[subject]["RelativeSubjectCoverage"]) | |
| 135 print >> F, "# Maximum Bit Score: %s" % (results[subject]["maxBitScores"]) | |
| 136 print >> F, "# Median Bit Score: %s" % (results[subject]["medianBitScores"]) | |
| 137 for header in results[subject]["HitDic"]: | |
| 138 print >> Fasta, ">%s\n%s" % (header, insert_newlines(results[subject]["HitDic"][header]) ) | |
| 139 for transcript in Xblastdict[subject]: | |
| 140 transcriptSize = float(len(fastadict[transcript])) | |
| 141 for hit in Xblastdict[subject][transcript]: | |
| 142 percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov = hit[0], hit[1], hit[6], hit[7], "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100) | |
| 143 Eval, BitScore = hit[8], hit[9] | |
| 144 info = [transcript] + [percentIdentity, alignLenght, subjectStart, subjectEnd, queryCov, Eval, BitScore] | |
| 145 info = [str(i) for i in info] | |
| 146 info = "\t".join(info) | |
| 147 print >> F, info | |
| 148 print >> Fasta, "" | |
| 149 F.close() | |
| 150 Fasta.close() | |
| 151 | |
| 152 if __name__=="__main__": __main__() |
