Mercurial > repos > matthias > longorf
comparison getLongestORF.py @ 0:e09750baa9ac draft default tip
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 8e118a4d24047e2c62912b962e854f789d6ff559-dirty
| author | matthias |
|---|---|
| date | Wed, 20 Jun 2018 10:55:21 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e09750baa9ac |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 """ | |
| 4 usage: getLongestORF.py input output.fas output.tab | |
| 5 | |
| 6 | |
| 7 input.fas: a amino acid fasta file of all open reading frames (ORF) listed by transcript (output of GalaxyTool "getorf") | |
| 8 output.fas: fasta file with all longest ORFs per transcript | |
| 9 output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs | |
| 10 | |
| 11 example: | |
| 12 | |
| 13 >253936-254394(+)_1 [28 - 63] | |
| 14 LTNYCQMVHNIL | |
| 15 >253936-254394(+)_2 [18 - 77] | |
| 16 HKLIDKLLPNGAQYFVKSTQ | |
| 17 >253936-254394(+)_3 [32 - 148] | |
| 18 QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV | |
| 19 >253936-254394(+)_4 [117 - 152] | |
| 20 LGIIVPSLLLCN | |
| 21 >248351-252461(+)_1 [14 - 85] | |
| 22 VLARKYPRCLSPSKKSPCQLRQRS | |
| 23 >248351-252461(+)_2 [21 - 161] | |
| 24 PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP | |
| 25 >248351-252461(+)_3 [89 - 202] | |
| 26 VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG | |
| 27 >248351-252461(+)_4 [206 - 259] | |
| 28 SRTTSGRQSVLSEKLWRR | |
| 29 >248351-252461(+)_5 [263 - 313] | |
| 30 CLSPLWVPCCSRHSCHG | |
| 31 """ | |
| 32 | |
| 33 import sys,re | |
| 34 | |
| 35 def findlongestOrf(transcriptDict,old_seqID): | |
| 36 #write for previous seqID | |
| 37 prevTranscript = transcriptDict[old_seqID] | |
| 38 i_max = 0 | |
| 39 #find longest orf in transcript | |
| 40 for i in range(0,len(prevTranscript)): | |
| 41 if(prevTranscript[i][2] >= prevTranscript[i_max][2]): | |
| 42 i_max = i | |
| 43 for i in range(0,len(prevTranscript)): | |
| 44 prevStart = prevTranscript[i][0] | |
| 45 prevEnd = prevTranscript[i][1] | |
| 46 prevLength = prevTranscript[i][2] | |
| 47 output = str(old_seqID) + "\t" + str(prevStart) + "\t" + str(prevEnd) + "\t" + str(prevLength) | |
| 48 if (end - start > 0): | |
| 49 output+="\tForward" | |
| 50 else: | |
| 51 output+="\tReverse" | |
| 52 if(i == i_max): | |
| 53 output += "\ty\n" | |
| 54 else: | |
| 55 output += "\tn\n" | |
| 56 OUTPUT_ORF_SUMMARY.write(output) | |
| 57 transcriptDict.pop(old_seqID, None) | |
| 58 return None | |
| 59 | |
| 60 INPUT = open(sys.argv[1],"r") | |
| 61 OUTPUT_FASTA = open(sys.argv[2],"w") | |
| 62 OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w") | |
| 63 | |
| 64 seqID = "" | |
| 65 old_seqID = "" | |
| 66 lengthDict = {} | |
| 67 seqDict = {} | |
| 68 headerDict = {} | |
| 69 transcriptDict = {} | |
| 70 skip = False | |
| 71 | |
| 72 OUTPUT_ORF_SUMMARY.write("seqID\tstart\tend\tlength\torientation\tlongest\n") | |
| 73 | |
| 74 for line in INPUT: | |
| 75 line = line.strip() | |
| 76 # print line | |
| 77 if(re.match(">",line)): #header | |
| 78 seqID = "_".join(line.split(">")[1].split("_")[:-1]) | |
| 79 #seqID = line.split(">")[1].split("_")[0] | |
| 80 start = int (re.search('\ \[(\d+)\ -', line).group(1)) | |
| 81 end = int (re.search('-\ (\d+)\]',line).group(1)) | |
| 82 length = abs(end - start) | |
| 83 if(seqID not in transcriptDict and old_seqID != ""): #new transcript | |
| 84 findlongestOrf(transcriptDict,old_seqID) | |
| 85 if seqID not in transcriptDict: | |
| 86 transcriptDict[seqID] = [] | |
| 87 transcriptDict[seqID].append([start,end,length]) | |
| 88 if(seqID not in lengthDict and old_seqID != ""): #new transcript | |
| 89 #write FASTA | |
| 90 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n") | |
| 91 #delete old dict entry | |
| 92 headerDict.pop(old_seqID, None) | |
| 93 seqDict.pop(old_seqID, None) | |
| 94 lengthDict.pop(old_seqID, None) | |
| 95 #if several longest sequences exist with the same length, the dictionary saves the last occuring. | |
| 96 if(seqID not in lengthDict or length >= lengthDict[seqID]): | |
| 97 headerDict[seqID] = line | |
| 98 lengthDict[seqID] = length | |
| 99 seqDict[seqID] = "" | |
| 100 skip = False | |
| 101 else: | |
| 102 skip = True | |
| 103 next | |
| 104 old_seqID = seqID | |
| 105 elif(skip): | |
| 106 next | |
| 107 else: | |
| 108 seqDict[seqID] += line | |
| 109 | |
| 110 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]) | |
| 111 findlongestOrf(transcriptDict,old_seqID) | |
| 112 INPUT.close() | |
| 113 OUTPUT_FASTA.close() | |
| 114 OUTPUT_ORF_SUMMARY.close() |
