Mercurial > repos > galaxyp > retrieve_ensembl_bed
comparison retrieve_ensembl_bed.py @ 1:c3d600729b6f draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 88cf1e923a8c9e5bc6953ad412d15a7c70f054d1
| author | galaxyp |
|---|---|
| date | Mon, 22 Jan 2018 13:13:26 -0500 |
| parents | 887e111c0919 |
| children | e385fe93df68 |
comparison (legend: equal, deleted, inserted, replaced)
| 0:887e111c0919 | 1:c3d600729b6f |
|---|---|
| 9 # | 9 # |
| 10 # James E Johnson | 10 # James E Johnson |
| 11 # | 11 # |
| 12 #------------------------------------------------------------------------------ | 12 #------------------------------------------------------------------------------ |
| 13 """ | 13 """ |
| 14 | |
| 15 from __future__ import print_function | |
| 14 | 16 |
| 15 import argparse | 17 import argparse |
| 16 import re | 18 import re |
| 17 import sys | 19 import sys |
| 18 | 20 |
| 47 'output', | 49 'output', |
| 48 help='Output BED filepath, or for stdout: "-"') | 50 help='Output BED filepath, or for stdout: "-"') |
| 49 parser.add_argument('-v', '--verbose', action='store_true', help='Verbose') | 51 parser.add_argument('-v', '--verbose', action='store_true', help='Verbose') |
| 50 parser.add_argument('-d', '--debug', action='store_true', help='Debug') | 52 parser.add_argument('-d', '--debug', action='store_true', help='Debug') |
| 51 args = parser.parse_args() | 53 args = parser.parse_args() |
| 52 # print >> sys.stderr, "args: %s" % args | |
| 53 species = args.species | 54 species = args.species |
| 54 out_wtr = open(args.output, 'w') if args.output != '-' else sys.stdout | 55 out_wtr = open(args.output, 'w') if args.output != '-' else sys.stdout |
| 55 biotypes = ';'.join(['biotype=%s' % bt.strip() | 56 biotypes = ';'.join(['biotype=%s' % bt.strip() |
| 56 for biotype in args.biotypes | 57 for biotype in args.biotypes |
| 57 for bt in biotype.split(',') if bt.strip()]) | 58 for bt in biotype.split(',') if bt.strip()]) |
| 70 if chrom: | 71 if chrom: |
| 71 if chrom not in selected_regions: | 72 if chrom not in selected_regions: |
| 72 selected_regions[chrom] = [] | 73 selected_regions[chrom] = [] |
| 73 selected_regions[chrom].append([start, end, strand]) | 74 selected_regions[chrom].append([start, end, strand]) |
| 74 if args.debug: | 75 if args.debug: |
| 75 print >> sys.stderr, "selected_regions: %s" % selected_regions | 76 print("selected_regions: %s" % selected_regions, file=sys.stderr) |
| 76 | 77 |
| 77 def retrieve_region(species, ref, start, stop, strand): | 78 def retrieve_region(species, ref, start, stop, strand): |
| 78 transcript_count = 0 | 79 transcript_count = 0 |
| 79 regions = range(start, stop, max_region) | 80 regions = list(range(start, stop, max_region)) |
| 80 if not regions or regions[-1] < stop: | 81 if not regions or regions[-1] < stop: |
| 81 regions.append(stop) | 82 regions.append(stop) |
| 82 for end in regions[1:]: | 83 for end in regions[1:]: |
| 83 bedlines = get_transcripts_bed(species, ref, start, end, | 84 bedlines = get_transcripts_bed(species, ref, start, end, |
| 84 strand=strand, params=biotypes) | 85 strand=strand, params=biotypes) |
| 85 if args.debug: | 86 if args.debug: |
| 86 print >> sys.stderr,\ | 87 print("%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d" % |
| 87 "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"\ | 88 (species, ref, start, end, len(bedlines)), |
| 88 % (species, ref, start, end, len(bedlines)) | 89 file=sys.stderr) |
| 89 # start, end, seq | 90 # start, end, seq |
| 90 for i, bedline in enumerate(bedlines): | 91 for i, bedline in enumerate(bedlines): |
| 91 if args.debug: | 92 if args.debug: |
| 92 print >> sys.stderr, "%s\n" % (bedline) | 93 print("%s\n" % (bedline), file=sys.stderr) |
| 93 if not args.ucsc_chrom_names: | 94 if not args.ucsc_chrom_names: |
| 94 bedline = re.sub('^[^\t]+', ref, bedline) | 95 bedline = re.sub('^[^\t]+', ref, bedline) |
| 95 try: | 96 try: |
| 96 if out_wtr: | 97 if out_wtr: |
| 97 out_wtr.write(bedline.replace(',\t', '\t') | 98 out_wtr.write(bedline.replace(',\t', '\t') |
| 98 if args.extended_bed | 99 if args.extended_bed |
| 99 else str(bed_from_line(bedline))) | 100 else str(bed_from_line(bedline))) |
| 100 out_wtr.write("\n") | 101 out_wtr.write("\n") |
| 101 out_wtr.flush() | 102 out_wtr.flush() |
| 102 except Exception as e: | 103 except Exception as e: |
| 103 print >> sys.stderr,\ | 104 print("BED error (%s) : %s\n" % (e, bedline), |
| 104 "BED error (%s) : %s\n" % (e, bedline) | 105 file=sys.stderr) |
| 105 start = end + 1 | 106 start = end + 1 |
| 106 return transcript_count | 107 return transcript_count |
| 107 | 108 |
| 108 coord_systems = get_toplevel(species) | 109 coord_systems = get_toplevel(species) |
| 109 if 'chromosome' in coord_systems: | 110 if 'chromosome' in coord_systems: |
| 110 ref_lengths = dict() | 111 ref_lengths = dict() |
| 111 for ref in sorted(coord_systems['chromosome'].keys()): | 112 for ref in sorted(coord_systems['chromosome'].keys()): |
| 112 length = coord_systems['chromosome'][ref] | 113 length = coord_systems['chromosome'][ref] |
| 113 ref_lengths[ref] = length | 114 ref_lengths[ref] = length |
| 114 if args.toplevel: | 115 if args.toplevel: |
| 115 print >> sys.stderr,\ | 116 print("%s\t%s\tlength: %d" % (species, ref, length), |
| 116 "%s\t%s\tlength: %d" % (species, ref, length) | 117 file=sys.stderr) |
| 117 if selected_regions: | 118 if selected_regions: |
| 118 transcript_count = 0 | 119 transcript_count = 0 |
| 119 for ref in sorted(selected_regions.keys()): | 120 for ref in sorted(selected_regions.keys()): |
| 120 if ref in ref_lengths: | 121 if ref in ref_lengths: |
| 121 for reg in selected_regions[ref]: | 122 for reg in selected_regions[ref]: |
| 127 transcript_count += retrieve_region(species, ref, | 128 transcript_count += retrieve_region(species, ref, |
| 128 start, stop, | 129 start, stop, |
| 129 strand) | 130 strand) |
| 130 if args.debug or args.verbose: | 131 if args.debug or args.verbose: |
| 131 length = stop - start | 132 length = stop - start |
| 132 print >> sys.stderr,\ | 133 print("%s\t%s:%d-%d%s\tlength: %d\ttrancripts:%d" % |
| 133 "%s\t%s:%d-%d%s\tlength: %d\ttrancripts:%d"\ | 134 (species, ref, start, stop, strand, |
| 134 % (species, ref, start, stop, strand, | 135 length, transcript_count), |
| 135 length, transcript_count) | 136 file=sys.stderr) |
| 136 else: | 137 else: |
| 137 strand = '' | 138 strand = '' |
| 138 start = 0 | 139 start = 0 |
| 139 for ref in sorted(ref_lengths.keys()): | 140 for ref in sorted(ref_lengths.keys()): |
| 140 length = ref_lengths[ref] | 141 length = ref_lengths[ref] |
| 141 transcript_count = 0 | 142 transcript_count = 0 |
| 142 if args.debug: | 143 if args.debug: |
| 143 print >> sys.stderr,\ | 144 print("Retrieving transcripts: %s\t%s\tlength: %d" % |
| 144 "Retrieving transcripts: %s\t%s\tlength: %d"\ | 145 (species, ref, length), file=sys.stderr) |
| 145 % (species, ref, length) | |
| 146 transcript_count += retrieve_region(species, ref, start, | 146 transcript_count += retrieve_region(species, ref, start, |
| 147 length, strand) | 147 length, strand) |
| 148 if args.debug or args.verbose: | 148 if args.debug or args.verbose: |
| 149 print >> sys.stderr,\ | 149 print("%s\t%s\tlength: %d\ttrancripts:%d" % |
| 150 "%s\t%s\tlength: %d\ttrancripts:%d"\ | 150 (species, ref, length, transcript_count), |
| 151 % (species, ref, length, transcript_count) | 151 file=sys.stderr) |
| 152 | 152 |
| 153 | 153 |
| 154 if __name__ == "__main__": | 154 if __name__ == "__main__": |
| 155 __main__() | 155 __main__() |
