Mercurial > repos > galaxyp > pep_pointer
comparison pep_pointer.py @ 3:a26f551d819b draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pep_pointer commit e3996c3bda75b16d19997d1e2f67267dd0ea2dff
author | galaxyp |
---|---|
date | Fri, 06 Apr 2018 18:12:50 -0400 |
parents | aac535d694d4 |
children | 3a3aff93ee9e |
comparison
equal
deleted
inserted
replaced
2:ac3494caea96 | 3:a26f551d819b |
---|---|
1 | 1 |
2 # | 2 # |
3 # Author: Praveen Kumar | 3 # Author: Praveen Kumar |
4 # Updated: Nov 8th, 2017 | 4 # Updated: April 6th, 2018 |
5 # | 5 # |
6 # | 6 # |
7 # | 7 # |
8 | 8 |
9 import re | 9 import re |
36 end = a[4].strip() | 36 end = a[4].strip() |
37 elif int(a[4].strip()) < int(a[3].strip()): | 37 elif int(a[4].strip()) < int(a[3].strip()): |
38 start = a[4].strip() | 38 start = a[4].strip() |
39 end = a[3].strip() | 39 end = a[3].strip() |
40 else: | 40 else: |
41 print "Something fishy in start end coordinates" | 41 print "Please check the start end coordinates in the GTF file" |
42 else: | 42 else: |
43 print "Something fishy in reading" | 43 print "Please check the strand information in the GTF file. It should be '+' or '-'." |
44 if not gtf.has_key(strand): | 44 if not gtf.has_key(strand): |
45 gtf[strand] = {} | 45 gtf[strand] = {} |
46 if not gtf[strand].has_key(type): | 46 if not gtf[strand].has_key(type): |
47 gtf[strand][type] = [] | 47 gtf[strand][type] = [] |
48 b = re.search("gene_id \"(.+?)\";", a[8].strip()) | 48 b = re.search("gene_id \"(.+?)\";", a[8].strip()) |
146 if strand == "+": | 146 if strand == "+": |
147 st = "positive" | 147 st = "positive" |
148 elif strand == "-": | 148 elif strand == "-": |
149 st = "negative" | 149 st = "negative" |
150 else: | 150 else: |
151 print "Something fishy in writing . . ." | 151 print "Please check the strand information in the GTF file. It should be '+' or '-'." |
152 | 152 |
153 for type in gtf[strand].keys(): | 153 for type in gtf[strand].keys(): |
154 data = gtf[strand][type] | 154 data = gtf[strand][type] |
155 c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data) | 155 c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data) |
156 | 156 |
160 # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r") | 160 # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r") |
161 data = infh.readlines() | 161 data = infh.readlines() |
162 # output file | 162 # output file |
163 outfh = open(inputFile[3], 'w') | 163 outfh = open(inputFile[3], 'w') |
164 # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w") | 164 # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w") |
165 | 165 |
166 for each in data: | 166 for each in data: |
167 a = each.split("\t") | 167 a = each.strip().split("\t") |
168 chr = a[0].strip() | 168 chr = a[0].strip() |
169 pep_start = a[1].strip() | 169 pep_start = str(int(a[1].strip())+1) |
170 pep_end = a[2].strip() | 170 pep_end = a[2].strip() |
171 strand = a[5].strip() | 171 strand = a[5].strip() |
172 c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 172 each = "\t".join(a[:6]) |
173 rows = c.fetchall() | 173 if (len(a) == 12 and int(a[9]) == 1) or (len(a) == 6): |
174 if len(rows) > 0: | 174 c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") |
175 outfh.write(each.strip() + "\tCDS\n") | |
176 else: | |
177 c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | |
178 rows = c.fetchall() | 175 rows = c.fetchall() |
179 if len(rows) > 0: | 176 if len(rows) > 0: |
180 outfh.write(each.strip() + "\tfive_prime_utr\n") | 177 outfh.write(each.strip() + "\tCDS\n") |
181 else: | 178 else: |
182 c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 179 c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") |
183 rows = c.fetchall() | 180 rows = c.fetchall() |
184 if len(rows) > 0: | 181 if len(rows) > 0: |
185 outfh.write(each.strip() + "\tthree_prime_utr\n") | 182 outfh.write(each.strip() + "\tfive_prime_utr\n") |
186 else: | 183 else: |
187 c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 184 c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") |
188 rows = c.fetchall() | 185 rows = c.fetchall() |
189 if len(rows) > 0: | 186 if len(rows) > 0: |
190 outfh.write(each.strip() + "\texon\n") | 187 outfh.write(each.strip() + "\tthree_prime_utr\n") |
191 else: | 188 else: |
192 c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 189 c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") |
193 rows = c.fetchall() | 190 rows = c.fetchall() |
194 if len(rows) > 0: | 191 if len(rows) > 0: |
195 outfh.write(each.strip() + "\tintron\n") | 192 outfh.write(each.strip() + "\texon\n") |
196 else: | 193 else: |
197 c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 194 c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") |
198 rows = c.fetchall() | 195 rows = c.fetchall() |
199 if len(rows) > 0: | 196 if len(rows) > 0: |
200 outfh.write(each.strip() + "\tgene\n") | 197 outfh.write(each.strip() + "\tintron\n") |
201 else: | 198 else: |
202 c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 199 c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") |
203 rows = c.fetchall() | 200 rows = c.fetchall() |
204 if len(rows) > 0: | 201 if len(rows) > 0: |
205 outfh.write(each.strip() + "\tintergene\n") | 202 outfh.write(each.strip() + "\tgene\n") |
206 else: | 203 else: |
207 outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n") | 204 c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") |
205 rows = c.fetchall() | |
206 if len(rows) > 0: | |
207 outfh.write(each.strip() + "\tintergene\n") | |
208 else: | |
209 outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n") | |
210 elif (len(a) == 12 and int(a[9]) == 2): | |
211 outfh.write(each.strip() + "\tSpliceJunction\n") | |
212 else: | |
213 outfh.write(each.strip() + "\tPlease check\n") | |
208 | 214 |
209 conn.close() | 215 conn.close() |
210 outfh.close() | 216 outfh.close() |
211 else: | 217 else: |
212 print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>" | 218 print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>" |