Mercurial > repos > galaxyp > retrieve_ensembl_bed
comparison retrieve_ensembl_bed.py @ 2:e385fe93df68 draft default tip
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 6babd357845126292cb202aaea0f70ff68819525"
| author | galaxyp | 
|---|---|
| date | Mon, 07 Oct 2019 16:14:19 -0400 | 
| parents | c3d600729b6f | 
| children | |

Comparison legend: equal, deleted, inserted, replaced

| 1:c3d600729b6f | 2:e385fe93df68 | 
|---|---|
| 31 help='Ensembl Species to retrieve') | 31 help='Ensembl Species to retrieve') | 
| 32 parser.add_argument( | 32 parser.add_argument( | 
| 33 '-R', '--regions', action='append', default=[], | 33 '-R', '--regions', action='append', default=[], | 
| 34 help='Restrict Ensembl retrieval to regions e.g.:' | 34 help='Restrict Ensembl retrieval to regions e.g.:' | 
| 35 + ' X,2:20000-25000,3:100-500+') | 35 + ' X,2:20000-25000,3:100-500+') | 
| 36 parser.add_argument( | |
| 37 '-i', '--interval_file', default=None, | |
| 38 help='Regions from a bed, gff, or interval file') | |
| 39 parser.add_argument( | |
| 40 '-f', '--interval_format', choices=['bed','gff','interval'], default='interval', | |
| 41 help='Interval format has TAB-separated columns: Seq, Start, End, Strand') | |
| 36 parser.add_argument( | 42 parser.add_argument( | 
| 37 '-B', '--biotypes', action='append', default=[], | 43 '-B', '--biotypes', action='append', default=[], | 
| 38 help='Restrict Ensembl biotypes to retrieve') | 44 help='Restrict Ensembl biotypes to retrieve') | 
| 39 parser.add_argument( | 45 parser.add_argument( | 
| 40 '-X', '--extended_bed', action='store_true', default=False, | 46 '-X', '--extended_bed', action='store_true', default=False, | 
| 72 if chrom not in selected_regions: | 78 if chrom not in selected_regions: | 
| 73 selected_regions[chrom] = [] | 79 selected_regions[chrom] = [] | 
| 74 selected_regions[chrom].append([start, end, strand]) | 80 selected_regions[chrom].append([start, end, strand]) | 
| 75 if args.debug: | 81 if args.debug: | 
| 76 print("selected_regions: %s" % selected_regions, file=sys.stderr) | 82 print("selected_regions: %s" % selected_regions, file=sys.stderr) | 
| 83 | |
| 84 if args.interval_file: | |
| 85 pat = r'^(?:chr)?([^\t]+)(?:\t(\d+)(?:\t(\d+)(?:\t([+-])?)?)?)?.*' | |
| 86 if args.interval_format == 'bed': | |
| 87 pat = r'^(?:chr)?([^\t]+)\t(\d+)\t(\d+)(?:(?:\t[^\t]+\t[^\t]+\t)([+-]))?.*' | |
| 88 elif args.interval_format == 'gff': | |
| 89 pat = r'^(?:chr)?([^\t]+)\t(\d+)\t(\d+)(?:(?:\t[^\t]+\t[^\t]+\t)([+-]))?.*' | |
| 90 with open(args.interval_file,'r') as fh: | |
| 91 for i, line in enumerate(fh): | |
| 92 if line.startswith('#'): | |
| 93 continue | |
| 94 m = re.match(pat, line.rstrip()) | |
| 95 if m: | |
| 96 (chrom, start, end, strand) = m.groups() | |
| 97 if chrom: | |
| 98 if chrom not in selected_regions: | |
| 99 selected_regions[chrom] = [] | |
| 100 selected_regions[chrom].append([start, end, strand]) | |
| 101 if args.debug: | |
| 102 print("selected_regions: %s" % selected_regions, file=sys.stderr) | |
| 103 | |
| 77 | 104 | 
| 78 def retrieve_region(species, ref, start, stop, strand): | 105 def retrieve_region(species, ref, start, stop, strand): | 
| 79 transcript_count = 0 | 106 transcript_count = 0 | 
| 80 regions = list(range(start, stop, max_region)) | 107 regions = list(range(start, stop, max_region)) | 
| 81 if not regions or regions[-1] < stop: | 108 if not regions or regions[-1] < stop: | 
