Mercurial > repos > galaxyp > retrieve_ensembl_bed
comparison retrieve_ensembl_bed.py @ 0:887e111c0919 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 3fd7be931712e7fa5b281bc8c48104c8583ef7f0
author | galaxyp |
---|---|
date | Sun, 14 Jan 2018 14:11:53 -0500 |
parents | |
children | c3d600729b6f |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:887e111c0919 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 # | |
4 #------------------------------------------------------------------------------ | |
5 # University of Minnesota | |
6 # Copyright 2017, Regents of the University of Minnesota | |
7 #------------------------------------------------------------------------------ | |
8 # Author: | |
9 # | |
10 # James E Johnson | |
11 # | |
12 #------------------------------------------------------------------------------ | |
13 """ | |
14 | |
15 import argparse | |
16 import re | |
17 import sys | |
18 | |
19 from bedutil import bed_from_line | |
20 | |
21 from ensembl_rest import get_toplevel, get_transcripts_bed, max_region | |
22 | |
23 | |
def __main__():
    """Retrieve Ensembl transcripts for a species and write them as BED.

    Command-line entry point.  Parses the options, asks ``get_toplevel``
    for the species' coordinate systems and, when a ``'chromosome'`` entry
    is present, calls ``get_transcripts_bed`` for each selected region (or
    for every chromosome when no ``--regions`` were given), writing one
    BED line per returned transcript to the ``output`` path ("-" selects
    stdout).

    With ``--extended_bed`` the lines returned by ``get_transcripts_bed``
    are written with ``',\\t'`` replaced by a tab; otherwise each line is
    passed through ``bed_from_line`` and its ``str()`` is written.  Unless
    ``--ucsc_chrom_names`` is given, the first tab-delimited field of each
    line is replaced by the reference name used in the query.

    Diagnostics (``--toplevel``, ``--verbose``, ``--debug``) go to stderr.
    Lines that cannot be converted or written are reported on stderr and
    skipped; they are not counted as retrieved transcripts.
    """
    parser = argparse.ArgumentParser(
        description='Retrieve Ensembl cDNAs in BED format')
    parser.add_argument(
        '-s', '--species', default='human',
        help='Ensembl Species to retrieve')
    parser.add_argument(
        '-R', '--regions', action='append', default=[],
        help='Restrict Ensembl retrieval to regions e.g.:'
             + ' X,2:20000-25000,3:100-500+')
    parser.add_argument(
        '-B', '--biotypes', action='append', default=[],
        help='Restrict Ensembl biotypes to retrieve')
    parser.add_argument(
        '-X', '--extended_bed', action='store_true', default=False,
        help='Include the extended columns returned from Ensembl')
    parser.add_argument(
        '-U', '--ucsc_chrom_names', action='store_true', default=False,
        help='Use the UCSC names for Chromosomes')
    parser.add_argument(
        '-t', '--toplevel', action='store_true',
        help='Print Ensembl toplevel for species')
    parser.add_argument(
        'output',
        help='Output BED filepath, or for stdout: "-"')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
    parser.add_argument('-d', '--debug', action='store_true', help='Debug')
    args = parser.parse_args()

    species = args.species
    # "biotype=a;biotype=b": handed to get_transcripts_bed as ``params``.
    biotypes = ';'.join(['biotype=%s' % bt.strip()
                         for biotype in args.biotypes
                         for bt in biotype.split(',') if bt.strip()])

    selected_regions = _parse_regions(args.regions)
    if args.regions and args.debug:
        _log("selected_regions: %s" % selected_regions)

    out_wtr = open(args.output, 'w') if args.output != '-' else sys.stdout

    def retrieve_region(species, ref, start, stop, strand):
        """Fetch ``ref:start-stop`` in chunks and write the BED lines.

        The interval is split at multiples of ``max_region`` (presumably
        the largest span one request may cover - confirm in ensembl_rest).
        Returns the number of BED lines successfully written.
        """
        transcript_count = 0
        # list(): range() is not appendable on Python 3.
        regions = list(range(start, stop, max_region))
        if not regions or regions[-1] < stop:
            regions.append(stop)
        # regions[0] is the initial start; each later entry ends a chunk.
        for end in regions[1:]:
            bedlines = get_transcripts_bed(species, ref, start, end,
                                           strand=strand, params=biotypes)
            if args.debug:
                _log("%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"
                     % (species, ref, start, end, len(bedlines)))
            for bedline in bedlines:
                if args.debug:
                    _log("%s\n" % (bedline))
                if not args.ucsc_chrom_names:
                    # Put the queried reference name in the first column.
                    bedline = re.sub('^[^\t]+', ref, bedline)
                try:
                    out_wtr.write(bedline.replace(',\t', '\t')
                                  if args.extended_bed
                                  else str(bed_from_line(bedline)))
                    out_wtr.write("\n")
                    out_wtr.flush()
                    transcript_count += 1
                except Exception as e:
                    # Best effort: report the bad line and keep going.
                    _log("BED error (%s) : %s\n" % (e, bedline))
            start = end + 1
        return transcript_count

    try:
        coord_systems = get_toplevel(species)
        if 'chromosome' not in coord_systems:
            return
        chromosomes = coord_systems['chromosome']
        ref_lengths = dict()
        for ref in sorted(chromosomes):
            length = chromosomes[ref]
            ref_lengths[ref] = length
            if args.toplevel:
                _log("%s\t%s\tlength: %d" % (species, ref, length))

        if selected_regions:
            for ref in sorted(selected_regions):
                if ref not in ref_lengths:
                    if args.debug or args.verbose:
                        _log("%s\t%s\tnot in toplevel chromosomes, skipped"
                             % (species, ref))
                    continue
                for (_start, _stop, _strand) in selected_regions[ref]:
                    # Missing bounds default to the whole chromosome.
                    start = int(_start) if _start else 0
                    stop = int(_stop) if _stop else ref_lengths[ref]
                    if not _strand:
                        strand = ''
                    elif _strand == '+':
                        strand = ':1'
                    else:
                        strand = ':-1'
                    transcript_count = retrieve_region(species, ref,
                                                       start, stop, strand)
                    if args.debug or args.verbose:
                        _log("%s\t%s:%d-%d%s\tlength: %d\ttrancripts:%d"
                             % (species, ref, start, stop, strand,
                                stop - start, transcript_count))
        else:
            for ref in sorted(ref_lengths):
                length = ref_lengths[ref]
                if args.debug:
                    _log("Retrieving transcripts: %s\t%s\tlength: %d"
                         % (species, ref, length))
                transcript_count = retrieve_region(species, ref, 0,
                                                   length, '')
                if args.debug or args.verbose:
                    _log("%s\t%s\tlength: %d\ttrancripts:%d"
                         % (species, ref, length, transcript_count))
    finally:
        # Never close the interpreter's stdout; only files we opened.
        if out_wtr is not sys.stdout:
            out_wtr.close()


# chrom[:start[-end[strand]]], e.g. "X", "2:20000-25000", "3:100-500+"
_REGION_PAT = re.compile(r'^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?')


def _parse_regions(entries):
    """Group the ``--regions`` values by chromosome name.

    Each entry is a comma-separated list of region specs.  Returns a dict
    mapping chromosome name to a list of ``[start, end, strand]`` items,
    where each item is the matched text or ``None``/``''`` when that part
    was not given.  Specs that do not match the pattern are ignored.
    """
    selected = dict()
    for entry in entries or []:
        if not entry:
            continue
        for reg in (x.strip() for x in entry.split(',')):
            if not reg:
                continue
            m = _REGION_PAT.match(reg)
            if m:
                (chrom, start, end, strand) = m.groups()
                selected.setdefault(chrom, []).append([start, end, strand])
    return selected


def _log(message):
    """Write one diagnostic line to stderr (valid on Python 2 and 3)."""
    sys.stderr.write("%s\n" % message)


if __name__ == "__main__":
    __main__()