Mercurial > repos > drosofff > blast_to_scaffold
comparison blast_to_scaffold.py @ 0:a97704553a32 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/blast_to_scaffold commit 5ecc0c7c2e7a6c1dfff04a881e55aa4b6d6e60a9
| author | drosofff | 
|---|---|
| date | Fri, 15 Jan 2016 12:35:10 -0500 | 
| parents | |
| children | 991bf4769500 | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:a97704553a32 | 
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import sys | |
| 3 import argparse | |
| 4 | |
| 5 def insert_newlines(string, every=60): | |
| 6 lines = [] | |
| 7 for i in xrange(0, len(string), every): | |
| 8 lines.append(string[i:i+every]) | |
| 9 return '\n'.join(lines) | |
| 10 | |
| 11 def getseq (fastadict, transcript, up, down, orientation="direct"): | |
| 12 def reverse (seq): | |
| 13 revdict = {"A":"T","T":"A","G":"C","C":"G","N":"N"} | |
| 14 revseq = [revdict[i] for i in seq[::-1]] | |
| 15 return "".join(revseq) | |
| 16 pickseq = fastadict[transcript][up-1:down] | |
| 17 if orientation == "direct": | |
| 18 return pickseq | |
| 19 else: | |
| 20 return reverse(pickseq) | |
| 21 | |
| 22 def Parser(): | |
| 23 the_parser = argparse.ArgumentParser( | |
| 24 description="Generate DNA scaffold from blastn or tblastx alignment of Contigs") | |
| 25 the_parser.add_argument( | |
| 26 '--sequences', action="store", type=str, help="input sequence file in fasta format") | |
| 27 the_parser.add_argument( | |
| 28 '--guideSequence', action="store", type=str, help="the reference sequence to guide the scaffold assembly in fasta format") | |
| 29 the_parser.add_argument( | |
| 30 '--blast-tab', dest="blast_tab", action="store", type=str, help="13-columns tabular blastn or tblastx output") | |
| 31 the_parser.add_argument( | |
| 32 '--output', action="store", type=str, help="output file path, fasta format") | |
| 33 args = the_parser.parse_args() | |
| 34 return args | |
| 35 | |
| 36 def blatnInfo (file): | |
| 37 blastlist = [] | |
| 38 with open(file, "r") as f: | |
| 39 for line in f: | |
| 40 minilist = [] | |
| 41 fields = line.rstrip().split() | |
| 42 minilist.append(fields[0]) | |
| 43 minilist.extend(fields[6:10]) | |
| 44 blastlist.append(minilist) | |
| 45 blastlist.sort(key=lambda x: x[3], reverse=True) | |
| 46 return blastlist | |
| 47 | |
| 48 def myContigs (file): | |
| 49 Contigs = {} | |
| 50 with open(file, "r") as f: | |
| 51 for line in f: | |
| 52 if line[0] == ">": | |
| 53 header = line[1:-1] | |
| 54 Contigs[header] = "" | |
| 55 else: | |
| 56 Contigs[header] += line[:-1] | |
| 57 return Contigs | |
| 58 | |
| 59 def myGuide (file): | |
| 60 Guide = {} | |
| 61 coordinate = 0 | |
| 62 with open(file, "r") as f: | |
| 63 for line in f: | |
| 64 if line[0] == ">": | |
| 65 continue | |
| 66 else: | |
| 67 for nucleotide in line[:-1]: | |
| 68 coordinate += 1 | |
| 69 Guide[coordinate] = nucleotide.lower() | |
| 70 return Guide | |
| 71 | |
| 72 def updateGuide (blastlist, GuideDict, ContigsDict): | |
| 73 ''' | |
| 74 the blastlist object is a list of list with | |
| 75 element [0] : name of the blasted Contig | |
| 76 element [1] : queryStart of the alignment to the reference | |
| 77 element [2] = queryStop of the alignment to the reference | |
| 78 element [3] : subjectStart of the alignment to the reference | |
| 79 element [4] = subjectStop of the alignment to the reference | |
| 80 ''' | |
| 81 for fields in blastlist: | |
| 82 seqHeader = fields[0] | |
| 83 queryStart = int(fields[1]) | |
| 84 queryStop = int(fields[2]) | |
| 85 subjectStart = int(fields[3]) | |
| 86 subjectStop = int(fields[4]) | |
| 87 if subjectStart > subjectStop: | |
| 88 subjectStart, subjectStop = subjectStop, subjectStart | |
| 89 orientation = "reverse" | |
| 90 else: | |
| 91 orientation = "direct" | |
| 92 sequence = getseq (ContigsDict, seqHeader, queryStart, queryStop, orientation) | |
| 93 for i in range(subjectStart, subjectStop+1): | |
| 94 try: | |
| 95 del GuideDict[i] | |
| 96 except KeyError: | |
| 97 continue | |
| 98 for i, nucleotide in enumerate(sequence): | |
| 99 GuideDict[i+subjectStart] = nucleotide | |
| 100 | |
| 101 def finalAssembly (GuideDict, outputfile): | |
| 102 finalSeqList = [] | |
| 103 for keys in sorted(GuideDict): | |
| 104 finalSeqList.append(GuideDict[keys]) | |
| 105 finalSequence = insert_newlines("".join(finalSeqList) ) | |
| 106 Out = open (outputfile, "w") | |
| 107 print >> Out, ">Scaffold" | |
| 108 print >> Out, finalSequence | |
| 109 Out.close() | |
| 110 | |
| 111 def __main__(): | |
| 112 args = Parser() | |
| 113 ContigsDict = myContigs (args.sequences) | |
| 114 GuideDict = myGuide (args.guideSequence) | |
| 115 blastlist = blatnInfo(args.blast_tab) | |
| 116 updateGuide(blastlist, GuideDict, ContigsDict) | |
| 117 finalAssembly(GuideDict, args.output) | |
| 118 | |
| 119 if __name__ == "__main__": | |
| 120 __main__() | 
