comparison retrieve_ensembl_bed.py @ 1:c3d600729b6f draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 88cf1e923a8c9e5bc6953ad412d15a7c70f054d1
author galaxyp
date Mon, 22 Jan 2018 13:13:26 -0500
parents 887e111c0919
children e385fe93df68
comparison
equal deleted inserted replaced
0:887e111c0919 1:c3d600729b6f
9 # 9 #
10 # James E Johnson 10 # James E Johnson
11 # 11 #
12 #------------------------------------------------------------------------------ 12 #------------------------------------------------------------------------------
13 """ 13 """
14
15 from __future__ import print_function
14 16
15 import argparse 17 import argparse
16 import re 18 import re
17 import sys 19 import sys
18 20
47 'output', 49 'output',
48 help='Output BED filepath, or for stdout: "-"') 50 help='Output BED filepath, or for stdout: "-"')
49 parser.add_argument('-v', '--verbose', action='store_true', help='Verbose') 51 parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
50 parser.add_argument('-d', '--debug', action='store_true', help='Debug') 52 parser.add_argument('-d', '--debug', action='store_true', help='Debug')
51 args = parser.parse_args() 53 args = parser.parse_args()
52 # print >> sys.stderr, "args: %s" % args
53 species = args.species 54 species = args.species
54 out_wtr = open(args.output, 'w') if args.output != '-' else sys.stdout 55 out_wtr = open(args.output, 'w') if args.output != '-' else sys.stdout
55 biotypes = ';'.join(['biotype=%s' % bt.strip() 56 biotypes = ';'.join(['biotype=%s' % bt.strip()
56 for biotype in args.biotypes 57 for biotype in args.biotypes
57 for bt in biotype.split(',') if bt.strip()]) 58 for bt in biotype.split(',') if bt.strip()])
70 if chrom: 71 if chrom:
71 if chrom not in selected_regions: 72 if chrom not in selected_regions:
72 selected_regions[chrom] = [] 73 selected_regions[chrom] = []
73 selected_regions[chrom].append([start, end, strand]) 74 selected_regions[chrom].append([start, end, strand])
74 if args.debug: 75 if args.debug:
75 print >> sys.stderr, "selected_regions: %s" % selected_regions 76 print("selected_regions: %s" % selected_regions, file=sys.stderr)
76 77
77 def retrieve_region(species, ref, start, stop, strand): 78 def retrieve_region(species, ref, start, stop, strand):
78 transcript_count = 0 79 transcript_count = 0
79 regions = range(start, stop, max_region) 80 regions = list(range(start, stop, max_region))
80 if not regions or regions[-1] < stop: 81 if not regions or regions[-1] < stop:
81 regions.append(stop) 82 regions.append(stop)
82 for end in regions[1:]: 83 for end in regions[1:]:
83 bedlines = get_transcripts_bed(species, ref, start, end, 84 bedlines = get_transcripts_bed(species, ref, start, end,
84 strand=strand, params=biotypes) 85 strand=strand, params=biotypes)
85 if args.debug: 86 if args.debug:
86 print >> sys.stderr,\ 87 print("%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d" %
87 "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"\ 88 (species, ref, start, end, len(bedlines)),
88 % (species, ref, start, end, len(bedlines)) 89 file=sys.stderr)
89 # start, end, seq 90 # start, end, seq
90 for i, bedline in enumerate(bedlines): 91 for i, bedline in enumerate(bedlines):
91 if args.debug: 92 if args.debug:
92 print >> sys.stderr, "%s\n" % (bedline) 93 print("%s\n" % (bedline), file=sys.stderr)
93 if not args.ucsc_chrom_names: 94 if not args.ucsc_chrom_names:
94 bedline = re.sub('^[^\t]+', ref, bedline) 95 bedline = re.sub('^[^\t]+', ref, bedline)
95 try: 96 try:
96 if out_wtr: 97 if out_wtr:
97 out_wtr.write(bedline.replace(',\t', '\t') 98 out_wtr.write(bedline.replace(',\t', '\t')
98 if args.extended_bed 99 if args.extended_bed
99 else str(bed_from_line(bedline))) 100 else str(bed_from_line(bedline)))
100 out_wtr.write("\n") 101 out_wtr.write("\n")
101 out_wtr.flush() 102 out_wtr.flush()
102 except Exception as e: 103 except Exception as e:
103 print >> sys.stderr,\ 104 print("BED error (%s) : %s\n" % (e, bedline),
104 "BED error (%s) : %s\n" % (e, bedline) 105 file=sys.stderr)
105 start = end + 1 106 start = end + 1
106 return transcript_count 107 return transcript_count
107 108
108 coord_systems = get_toplevel(species) 109 coord_systems = get_toplevel(species)
109 if 'chromosome' in coord_systems: 110 if 'chromosome' in coord_systems:
110 ref_lengths = dict() 111 ref_lengths = dict()
111 for ref in sorted(coord_systems['chromosome'].keys()): 112 for ref in sorted(coord_systems['chromosome'].keys()):
112 length = coord_systems['chromosome'][ref] 113 length = coord_systems['chromosome'][ref]
113 ref_lengths[ref] = length 114 ref_lengths[ref] = length
114 if args.toplevel: 115 if args.toplevel:
115 print >> sys.stderr,\ 116 print("%s\t%s\tlength: %d" % (species, ref, length),
116 "%s\t%s\tlength: %d" % (species, ref, length) 117 file=sys.stderr)
117 if selected_regions: 118 if selected_regions:
118 transcript_count = 0 119 transcript_count = 0
119 for ref in sorted(selected_regions.keys()): 120 for ref in sorted(selected_regions.keys()):
120 if ref in ref_lengths: 121 if ref in ref_lengths:
121 for reg in selected_regions[ref]: 122 for reg in selected_regions[ref]:
127 transcript_count += retrieve_region(species, ref, 128 transcript_count += retrieve_region(species, ref,
128 start, stop, 129 start, stop,
129 strand) 130 strand)
130 if args.debug or args.verbose: 131 if args.debug or args.verbose:
131 length = stop - start 132 length = stop - start
132 print >> sys.stderr,\ 133 print("%s\t%s:%d-%d%s\tlength: %d\ttrancripts:%d" %
133 "%s\t%s:%d-%d%s\tlength: %d\ttrancripts:%d"\ 134 (species, ref, start, stop, strand,
134 % (species, ref, start, stop, strand, 135 length, transcript_count),
135 length, transcript_count) 136 file=sys.stderr)
136 else: 137 else:
137 strand = '' 138 strand = ''
138 start = 0 139 start = 0
139 for ref in sorted(ref_lengths.keys()): 140 for ref in sorted(ref_lengths.keys()):
140 length = ref_lengths[ref] 141 length = ref_lengths[ref]
141 transcript_count = 0 142 transcript_count = 0
142 if args.debug: 143 if args.debug:
143 print >> sys.stderr,\ 144 print("Retrieving transcripts: %s\t%s\tlength: %d" %
144 "Retrieving transcripts: %s\t%s\tlength: %d"\ 145 (species, ref, length), file=sys.stderr)
145 % (species, ref, length)
146 transcript_count += retrieve_region(species, ref, start, 146 transcript_count += retrieve_region(species, ref, start,
147 length, strand) 147 length, strand)
148 if args.debug or args.verbose: 148 if args.debug or args.verbose:
149 print >> sys.stderr,\ 149 print("%s\t%s\tlength: %d\ttrancripts:%d" %
150 "%s\t%s\tlength: %d\ttrancripts:%d"\ 150 (species, ref, length, transcript_count),
151 % (species, ref, length, transcript_count) 151 file=sys.stderr)
152 152
153 153
154 if __name__ == "__main__": 154 if __name__ == "__main__":
155 __main__() 155 __main__()