comparison pep_pointer.py @ 3:a26f551d819b draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pep_pointer commit e3996c3bda75b16d19997d1e2f67267dd0ea2dff
author galaxyp
date Fri, 06 Apr 2018 18:12:50 -0400
parents aac535d694d4
children 3a3aff93ee9e
comparison
equal deleted inserted replaced
2:ac3494caea96 3:a26f551d819b
1 1
2 # 2 #
3 # Author: Praveen Kumar 3 # Author: Praveen Kumar
4 # Updated: Nov 8th, 2017 4 # Updated: April 6th, 2018
5 # 5 #
6 # 6 #
7 # 7 #
8 8
9 import re 9 import re
36 end = a[4].strip() 36 end = a[4].strip()
37 elif int(a[4].strip()) < int(a[3].strip()): 37 elif int(a[4].strip()) < int(a[3].strip()):
38 start = a[4].strip() 38 start = a[4].strip()
39 end = a[3].strip() 39 end = a[3].strip()
40 else: 40 else:
41 print "Something fishy in start end coordinates" 41 print "Please check the start end coordinates in the GTF file"
42 else: 42 else:
43 print "Something fishy in reading" 43 print "Please check the strand information in the GTF file. It should be '+' or '-'."
44 if not gtf.has_key(strand): 44 if not gtf.has_key(strand):
45 gtf[strand] = {} 45 gtf[strand] = {}
46 if not gtf[strand].has_key(type): 46 if not gtf[strand].has_key(type):
47 gtf[strand][type] = [] 47 gtf[strand][type] = []
48 b = re.search("gene_id \"(.+?)\";", a[8].strip()) 48 b = re.search("gene_id \"(.+?)\";", a[8].strip())
146 if strand == "+": 146 if strand == "+":
147 st = "positive" 147 st = "positive"
148 elif strand == "-": 148 elif strand == "-":
149 st = "negative" 149 st = "negative"
150 else: 150 else:
151 print "Something fishy in writing . . ." 151 print "Please check the strand information in the GTF file. It should be '+' or '-'."
152 152
153 for type in gtf[strand].keys(): 153 for type in gtf[strand].keys():
154 data = gtf[strand][type] 154 data = gtf[strand][type]
155 c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data) 155 c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data)
156 156
160 # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r") 160 # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r")
161 data = infh.readlines() 161 data = infh.readlines()
162 # output file 162 # output file
163 outfh = open(inputFile[3], 'w') 163 outfh = open(inputFile[3], 'w')
164 # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w") 164 # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w")
165 165
166 for each in data: 166 for each in data:
167 a = each.split("\t") 167 a = each.strip().split("\t")
168 chr = a[0].strip() 168 chr = a[0].strip()
169 pep_start = a[1].strip() 169 pep_start = str(int(a[1].strip())+1)
170 pep_end = a[2].strip() 170 pep_end = a[2].strip()
171 strand = a[5].strip() 171 strand = a[5].strip()
172 c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") 172 each = "\t".join(a[:6])
173 rows = c.fetchall() 173 if (len(a) == 12 and int(a[9]) == 1) or (len(a) == 6):
174 if len(rows) > 0: 174 c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
175 outfh.write(each.strip() + "\tCDS\n")
176 else:
177 c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
178 rows = c.fetchall() 175 rows = c.fetchall()
179 if len(rows) > 0: 176 if len(rows) > 0:
180 outfh.write(each.strip() + "\tfive_prime_utr\n") 177 outfh.write(each.strip() + "\tCDS\n")
181 else: 178 else:
182 c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") 179 c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
183 rows = c.fetchall() 180 rows = c.fetchall()
184 if len(rows) > 0: 181 if len(rows) > 0:
185 outfh.write(each.strip() + "\tthree_prime_utr\n") 182 outfh.write(each.strip() + "\tfive_prime_utr\n")
186 else: 183 else:
187 c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") 184 c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
188 rows = c.fetchall() 185 rows = c.fetchall()
189 if len(rows) > 0: 186 if len(rows) > 0:
190 outfh.write(each.strip() + "\texon\n") 187 outfh.write(each.strip() + "\tthree_prime_utr\n")
191 else: 188 else:
192 c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") 189 c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
193 rows = c.fetchall() 190 rows = c.fetchall()
194 if len(rows) > 0: 191 if len(rows) > 0:
195 outfh.write(each.strip() + "\tintron\n") 192 outfh.write(each.strip() + "\texon\n")
196 else: 193 else:
197 c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") 194 c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
198 rows = c.fetchall() 195 rows = c.fetchall()
199 if len(rows) > 0: 196 if len(rows) > 0:
200 outfh.write(each.strip() + "\tgene\n") 197 outfh.write(each.strip() + "\tintron\n")
201 else: 198 else:
202 c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") 199 c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
203 rows = c.fetchall() 200 rows = c.fetchall()
204 if len(rows) > 0: 201 if len(rows) > 0:
205 outfh.write(each.strip() + "\tintergene\n") 202 outfh.write(each.strip() + "\tgene\n")
206 else: 203 else:
207 outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n") 204 c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
205 rows = c.fetchall()
206 if len(rows) > 0:
207 outfh.write(each.strip() + "\tintergene\n")
208 else:
209 outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n")
210 elif (len(a) == 12 and int(a[9]) == 2):
211 outfh.write(each.strip() + "\tSpliceJunction\n")
212 else:
213 outfh.write(each.strip() + "\tPlease check\n")
208 214
209 conn.close() 215 conn.close()
210 outfh.close() 216 outfh.close()
211 else: 217 else:
212 print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>" 218 print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>"