| 0 | 1 #!/usr/bin/env python | 
|  | 2 import argparse | 
|  | 3 import os | 
|  | 4 | 
| 9 | 5 import extract_genomic_dna_utils as egdu | 
| 0 | 6 import bx.seq.nib | 
|  | 7 import bx.seq.twobit | 
|  | 8 from bx.intervals.io import Header, Comment | 
|  | 9 | 
|  | 10 | 
|  | 11 parser = argparse.ArgumentParser() | 
| 3 | 12 parser.add_argument('--input_format', dest='input_format', help="Input dataset format") | 
|  | 13 parser.add_argument('--input', dest='input', help="Input dataset") | 
|  | 14 parser.add_argument('--genome', dest='genome', help="Input dataset genome build") | 
|  | 15 parser.add_argument('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff") | 
|  | 16 parser.add_argument('--columns', dest='columns', help="Columns to use in input file") | 
|  | 17 parser.add_argument('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file") | 
|  | 18 parser.add_argument('--reference_genome', dest='reference_genome', help="Reference genome file") | 
|  | 19 parser.add_argument('--output_format', dest='output_format', help="Output format") | 
| 17 | 20 parser.add_argument('--fasta_header_type', dest='fasta_header_type', default=None, help="Fasta header format") | 
|  | 21 parser.add_argument('--fasta_header_delimiter', dest='fasta_header_delimiter', default=None, help="Fasta header field delimiter") | 
| 3 | 22 parser.add_argument('--output', dest='output', help="Output dataset") | 
| 0 | 23 args = parser.parse_args() | 
|  | 24 | 
|  | 25 input_is_gff = args.input_format == 'gff' | 
| 1 | 26 interpret_features = input_is_gff and args.interpret_features == "yes" | 
| 4 | 27 if len(args.columns.split(',')) == 5: | 
| 0 | 28     # Bed file. | 
| 9 | 29     chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg(args.columns) | 
| 0 | 30 else: | 
|  | 31     # Gff file. | 
| 9 | 32     chrom_col, start_col, end_col, strand_col = egdu.parse_cols_arg(args.columns) | 
| 0 | 33     name_col = False | 
|  | 34 | 
|  | 35 if args.reference_genome_source == "history": | 
| 9 | 36     seq_path = egdu.convert_to_twobit(args.reference_genome) | 
| 0 | 37 else: | 
|  | 38     seq_path = args.reference_genome | 
|  | 39 seq_dir = os.path.split(seq_path)[0] | 
|  | 40 | 
|  | 41 includes_strand_col = strand_col >= 0 | 
|  | 42 strand = None | 
|  | 43 nibs = {} | 
|  | 44 skipped_lines = 0 | 
|  | 45 first_invalid_line = 0 | 
|  | 46 invalid_lines = [] | 
|  | 47 warnings = [] | 
|  | 48 warning = '' | 
|  | 49 twobitfile = None | 
|  | 50 line_count = 1 | 
|  | 51 file_iterator = open(args.input) | 
|  | 52 if interpret_features: | 
| 9 | 53     file_iterator = egdu.GFFReaderWrapper(file_iterator, fix_strand=False) | 
| 0 | 54 out = open(args.output, 'wt') | 
|  | 55 | 
|  | 56 for feature in file_iterator: | 
|  | 57     # Ignore comments, headers. | 
|  | 58     if isinstance(feature, (Header, Comment)): | 
|  | 59         line_count += 1 | 
|  | 60         continue | 
|  | 61     name = "" | 
|  | 62     if interpret_features: | 
|  | 63         # Processing features. | 
| 9 | 64         egdu.convert_gff_coords_to_bed(feature) | 
| 0 | 65         chrom = feature.chrom | 
|  | 66         start = feature.start | 
|  | 67         end = feature.end | 
|  | 68         strand = feature.strand | 
|  | 69     else: | 
|  | 70         # Processing lines, either interval or GFF format. | 
|  | 71         line = feature.rstrip('\r\n') | 
|  | 72         if line and not line.startswith("#"): | 
|  | 73             fields = line.split('\t') | 
|  | 74             try: | 
|  | 75                 chrom = fields[chrom_col] | 
|  | 76                 start = int(fields[start_col]) | 
|  | 77                 end = int(fields[end_col]) | 
|  | 78                 if name_col: | 
|  | 79                     name = fields[name_col] | 
|  | 80                 if input_is_gff: | 
| 9 | 81                     start, end = egdu.convert_gff_coords_to_bed([start, end]) | 
| 0 | 82                 if includes_strand_col: | 
|  | 83                     strand = fields[strand_col] | 
|  | 84             except: | 
|  | 85                 warning = "Invalid chrom, start or end column values. " | 
|  | 86                 warnings.append(warning) | 
|  | 87                 if not invalid_lines: | 
| 9 | 88                     invalid_lines = egdu.get_lines(feature) | 
| 0 | 89                     first_invalid_line = line_count | 
|  | 90                 skipped_lines += len(invalid_lines) | 
|  | 91                 continue | 
|  | 92             if start > end: | 
|  | 93                 warning = "Invalid interval, start '%d' > end '%d'.  " % (start, end) | 
|  | 94                 warnings.append(warning) | 
|  | 95                 if not invalid_lines: | 
| 9 | 96                     invalid_lines = egdu.get_lines(feature) | 
| 0 | 97                     first_invalid_line = line_count | 
|  | 98                 skipped_lines += len(invalid_lines) | 
|  | 99                 continue | 
|  | 100             if strand not in ['+', '-']: | 
|  | 101                 strand = '+' | 
|  | 102             sequence = '' | 
|  | 103         else: | 
|  | 104             continue | 
|  | 105     # Open sequence file and get sequence for feature/interval. | 
|  | 106     if os.path.exists("%s/%s.nib" % (seq_dir, chrom)): | 
|  | 107         if chrom in nibs: | 
|  | 108             nib = nibs[chrom] | 
|  | 109         else: | 
| 9 | 110             nibs[chrom] = nib = bx.seq.nib.NibFile(open("%s/%s.nib" % (seq_path, chrom))) | 
| 0 | 111         try: | 
|  | 112             sequence = nib.get(start, end - start) | 
|  | 113         except Exception, e: | 
| 1 | 114             warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome) | 
| 0 | 115             warnings.append(warning) | 
|  | 116             if not invalid_lines: | 
| 9 | 117                 invalid_lines = egdu.get_lines(feature) | 
| 0 | 118                 first_invalid_line = line_count | 
|  | 119             skipped_lines += len(invalid_lines) | 
|  | 120             continue | 
| 5 | 121     elif os.path.isfile(seq_path): | 
| 0 | 122         if not(twobitfile): | 
| 9 | 123             twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path)) | 
| 0 | 124         try: | 
|  | 125             if interpret_features: | 
|  | 126                 # Create sequence from intervals within a feature. | 
|  | 127                 sequence = '' | 
|  | 128                 for interval in feature.intervals: | 
|  | 129                     sequence += twobitfile[interval.chrom][interval.start:interval.end] | 
|  | 130             else: | 
|  | 131                 sequence = twobitfile[chrom][start:end] | 
|  | 132         except: | 
|  | 133             warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (start, end - start, chrom) | 
|  | 134             warnings.append(warning) | 
|  | 135             if not invalid_lines: | 
| 9 | 136                 invalid_lines = egdu.get_lines(feature) | 
| 0 | 137                 first_invalid_line = line_count | 
|  | 138             skipped_lines += len(invalid_lines) | 
|  | 139             continue | 
|  | 140     else: | 
| 1 | 141         warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome) | 
| 0 | 142         warnings.append(warning) | 
|  | 143         if not invalid_lines: | 
| 9 | 144             invalid_lines = egdu.get_lines(feature) | 
| 0 | 145             first_invalid_line = line_count | 
|  | 146         skipped_lines += len(invalid_lines) | 
|  | 147         continue | 
|  | 148     if sequence == '': | 
| 1 | 149         warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome) | 
| 0 | 150         warnings.append(warning) | 
|  | 151         if not invalid_lines: | 
| 9 | 152             invalid_lines = egdu.get_lines(feature) | 
| 0 | 153             first_invalid_line = line_count | 
|  | 154         skipped_lines += len(invalid_lines) | 
|  | 155         continue | 
|  | 156     if includes_strand_col and strand == "-": | 
| 9 | 157         sequence = egdu.reverse_complement(sequence) | 
| 0 | 158     if args.output_format == "fasta": | 
|  | 159         l = len(sequence) | 
|  | 160         c = 0 | 
|  | 161         if input_is_gff: | 
| 9 | 162             start, end = egdu.convert_bed_coords_to_gff([start, end]) | 
| 17 | 163         if args.fasta_header_type == "bedtools_getfasta_default": | 
|  | 164             out.write(">%s\n" % egdu.get_bedtools_getfasta_default_header(str(chrom), | 
|  | 165                                                                           str(start), | 
|  | 166                                                                           str(end), | 
|  | 167                                                                           strand, | 
|  | 168                                                                           includes_strand_col)) | 
| 0 | 169         else: | 
| 17 | 170             # args.fasta_header_type == "char_delimited": | 
|  | 171             fields = [args.genome, str(chrom), str(start), str(end), strand] | 
|  | 172             field_delimiter = egdu.get_fasta_header_delimiter(args.fasta_header_delimiter) | 
|  | 173             meta_data = field_delimiter.join(fields) | 
|  | 174             if name.strip(): | 
|  | 175                 out.write(">%s %s\n" % (meta_data, name)) | 
|  | 176             else: | 
|  | 177                 out.write(">%s\n" % meta_data) | 
| 0 | 178         while c < l: | 
|  | 179             b = min(c + 50, l) | 
|  | 180             out.write("%s\n" % str(sequence[c:b])) | 
|  | 181             c = b | 
|  | 182     else: | 
|  | 183         # output_format == "interval". | 
|  | 184         if interpret_features: | 
|  | 185             meta_data = "\t".join([feature.chrom, | 
|  | 186                                    "galaxy_extract_genomic_dna", | 
|  | 187                                    "interval", | 
|  | 188                                    str(feature.start), | 
|  | 189                                    str(feature.end), | 
|  | 190                                    feature.score, | 
|  | 191                                    feature.strand, | 
|  | 192                                    ".", | 
| 9 | 193                                    egdu.gff_attributes_to_str(feature.attributes, "GTF")]) | 
| 0 | 194         else: | 
| 17 | 195             # Here fields was set up around line 73. | 
| 0 | 196             meta_data = "\t".join(fields) | 
|  | 197         if input_is_gff: | 
|  | 198             format_str = "%s seq \"%s\";\n" | 
|  | 199         else: | 
|  | 200             format_str = "%s\t%s\n" | 
|  | 201         out.write(format_str % (meta_data, str(sequence))) | 
|  | 202     # Update line count. | 
| 9 | 203     if isinstance(feature, egdu.GFFFeature): | 
| 0 | 204         line_count += len(feature.intervals) | 
|  | 205     else: | 
|  | 206         line_count += 1 | 
|  | 207 out.close() | 
|  | 208 | 
|  | 209 if warnings: | 
|  | 210     warn_msg = "%d warnings, 1st is: " % len(warnings) | 
|  | 211     warn_msg += warnings[0] | 
|  | 212     print warn_msg | 
|  | 213 if skipped_lines: | 
|  | 214     # Error message includes up to the first 10 skipped lines. | 
|  | 215     print 'Skipped %d invalid lines, 1st is #%d, "%s"' % (skipped_lines, first_invalid_line, '\n'.join(invalid_lines[:10])) | 
|  | 216 | 
|  | 217 if args.reference_genome_source == "history": | 
|  | 218     os.remove(seq_path) |