comparison retrieve_ensembl_bed.py @ 0:887e111c0919 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 3fd7be931712e7fa5b281bc8c48104c8583ef7f0
author galaxyp
date Sun, 14 Jan 2018 14:11:53 -0500
parents
children c3d600729b6f
comparison
equal deleted inserted replaced
-1:000000000000 0:887e111c0919
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2017, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14
15 import argparse
16 import re
17 import sys
18
19 from bedutil import bed_from_line
20
21 from ensembl_rest import get_toplevel, get_transcripts_bed, max_region
22
23
def _parse_regions(region_args):
    """Parse ``--regions`` option values into a per-chromosome selection.

    Each value is a comma-separated list of entries shaped like ``X``,
    ``2:20000-25000`` or ``3:100-500+``.  Entries that do not match the
    region pattern are skipped.

    Args:
        region_args: list of raw ``--regions`` option strings.

    Returns:
        dict mapping a chromosome name to a list of ``[start, end, strand]``
        items.  The items hold the matched text, not integers; a part the
        entry leaves out is ``None`` (or ``''`` for an empty start, as in
        ``2:``).
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    region_pat = re.compile(r'^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?')
    selected = dict()
    for entry in region_args:
        if not entry:
            continue
        for reg in (x.strip() for x in entry.split(',')):
            if not reg:
                continue
            m = region_pat.match(reg)
            if not m:
                continue
            (chrom, start, end, strand) = m.groups()
            selected.setdefault(chrom, []).append([start, end, strand])
    return selected


def __main__():
    """Command-line entry point: write Ensembl transcripts as BED lines.

    Parses the command line, asks ``get_toplevel`` for the coordinate
    systems of the species and, when a ``'chromosome'`` system is present,
    retrieves transcripts with ``get_transcripts_bed`` either for the
    regions given with ``--regions`` or for every chromosome in full.
    BED lines go to the ``output`` path (``-`` means stdout); diagnostics
    requested with ``--toplevel``, ``--verbose`` or ``--debug`` go to stderr.
    """
    parser = argparse.ArgumentParser(
        description='Retrieve Ensembl cDNAs in BED format')
    parser.add_argument(
        '-s', '--species', default='human',
        help='Ensembl Species to retrieve')
    parser.add_argument(
        '-R', '--regions', action='append', default=[],
        help='Restrict Ensembl retrieval to regions e.g.:'
             + ' X,2:20000-25000,3:100-500+')
    parser.add_argument(
        '-B', '--biotypes', action='append', default=[],
        help='Restrict Ensembl biotypes to retrieve')
    parser.add_argument(
        '-X', '--extended_bed', action='store_true', default=False,
        help='Include the extended columns returned from Ensembl')
    parser.add_argument(
        '-U', '--ucsc_chrom_names', action='store_true', default=False,
        help='Use the UCSC names for Chromosomes')
    parser.add_argument(
        '-t', '--toplevel', action='store_true',
        help='Print Ensembl toplevel for species')
    parser.add_argument(
        'output',
        help='Output BED filepath, or for stdout: "-"')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
    parser.add_argument('-d', '--debug', action='store_true', help='Debug')
    args = parser.parse_args()
    species = args.species

    # Query-string fragment such as "biotype=a;biotype=b", built from every
    # comma-separated value of every --biotypes option.
    biotypes = ';'.join(['biotype=%s' % bt.strip()
                         for biotype in args.biotypes
                         for bt in biotype.split(',') if bt.strip()])

    selected_regions = _parse_regions(args.regions)  # chrom: [[start, end, strand]]
    if args.regions and args.debug:
        sys.stderr.write("selected_regions: %s\n" % (selected_regions,))

    def retrieve_region(species, ref, start, stop, strand):
        """Write the transcripts found in ``ref:start-stop`` as BED lines.

        The span is requested in consecutive slices whose boundaries are
        ``max_region`` apart, each slice starting one past the previous end.

        Returns:
            The number of BED lines written to the output.
        """
        transcript_count = 0
        # list() keeps the boundaries appendable on Python 3 as well.
        regions = list(range(start, stop, max_region))
        if not regions or regions[-1] < stop:
            regions.append(stop)
        for end in regions[1:]:
            bedlines = get_transcripts_bed(species, ref, start, end,
                                           strand=strand, params=biotypes)
            if args.debug:
                sys.stderr.write(
                    "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d\n"
                    % (species, ref, start, end, len(bedlines)))
            for bedline in bedlines:
                if args.debug:
                    sys.stderr.write("%s\n\n" % (bedline,))
                if not args.ucsc_chrom_names:
                    # Put the requested reference name in the first column.
                    bedline = re.sub('^[^\t]+', ref, bedline)
                try:
                    out_wtr.write(bedline.replace(',\t', '\t')
                                  if args.extended_bed
                                  else str(bed_from_line(bedline)))
                    out_wtr.write("\n")
                    out_wtr.flush()
                    # Count only lines that were actually written.
                    transcript_count += 1
                except Exception as e:
                    # Best effort: report the bad line and keep going.
                    sys.stderr.write(
                        "BED error (%s) : %s\n\n" % (e, bedline))
            start = end + 1
        return transcript_count

    out_wtr = open(args.output, 'w') if args.output != '-' else sys.stdout
    try:
        coord_systems = get_toplevel(species)
        if 'chromosome' in coord_systems:
            ref_lengths = dict()
            for ref in sorted(coord_systems['chromosome']):
                length = coord_systems['chromosome'][ref]
                ref_lengths[ref] = length
                if args.toplevel:
                    sys.stderr.write(
                        "%s\t%s\tlength: %d\n" % (species, ref, length))
            if selected_regions:
                # Running total across all selected regions.
                transcript_count = 0
                for ref in sorted(selected_regions):
                    if ref not in ref_lengths:
                        continue
                    for (_start, _stop, _strand) in selected_regions[ref]:
                        start = int(_start) if _start else 0
                        stop = int(_stop) if _stop else ref_lengths[ref]
                        if not _strand:
                            strand = ''
                        elif _strand == '+':
                            strand = ':1'
                        else:
                            strand = ':-1'
                        transcript_count += retrieve_region(species, ref,
                                                            start, stop,
                                                            strand)
                        if args.debug or args.verbose:
                            length = stop - start
                            sys.stderr.write(
                                "%s\t%s:%d-%d%s\tlength: %d\ttrancripts:%d\n"
                                % (species, ref, start, stop, strand,
                                   length, transcript_count))
            else:
                strand = ''
                start = 0
                for ref in sorted(ref_lengths):
                    length = ref_lengths[ref]
                    if args.debug:
                        sys.stderr.write(
                            "Retrieving transcripts: %s\t%s\tlength: %d\n"
                            % (species, ref, length))
                    transcript_count = retrieve_region(species, ref, start,
                                                       length, strand)
                    if args.debug or args.verbose:
                        sys.stderr.write(
                            "%s\t%s\tlength: %d\ttrancripts:%d\n"
                            % (species, ref, length, transcript_count))
    finally:
        # Close a file we opened ourselves; leave the process stdout alone.
        if out_wtr is not sys.stdout:
            out_wtr.close()
152
153
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    __main__()