| 
0
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 import argparse
 | 
| 
 | 
     3 import os
 | 
| 
 | 
     4 import subprocess
 | 
| 
 | 
     5 import sys
 | 
| 
 | 
     6 import tempfile
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 import bx.seq.nib
 | 
| 
 | 
     9 import bx.seq.twobit
 | 
| 
 | 
    10 from bx.intervals.io import Header, Comment
 | 
| 
 | 
    11 from galaxy.datatypes.util import gff_util
 | 
| 
 | 
    12 from galaxy.tools.util.galaxyops import parse_cols_arg
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 def convert_to_twobit(reference_genome):
 | 
| 
 | 
    16     """
 | 
| 
 | 
    17     Create 2bit file history fasta dataset.
 | 
| 
 | 
    18     """
 | 
| 
 | 
    19     try:
 | 
| 
 | 
    20         seq_path = tempfile.NamedTemporaryFile(dir=".").name
 | 
| 
 | 
    21         cmd = "faToTwoBit %s %s" % (reference_genome, seq_path)
 | 
| 
 | 
    22         tmp_name = tempfile.NamedTemporaryFile(dir=".").name
 | 
| 
 | 
    23         tmp_stderr = open(tmp_name, 'wb')
 | 
| 
 | 
    24         proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno())
 | 
| 
 | 
    25         returncode = proc.wait()
 | 
| 
 | 
    26         tmp_stderr.close()
 | 
| 
 | 
    27         if returncode != 0:
 | 
| 
 | 
    28             # Get stderr, allowing for case where it's very large.
 | 
| 
 | 
    29             tmp_stderr = open(tmp_name, 'rb')
 | 
| 
 | 
    30             stderr = ''
 | 
| 
 | 
    31             buffsize = 1048576
 | 
| 
 | 
    32             try:
 | 
| 
 | 
    33                 while True:
 | 
| 
 | 
    34                     stderr += tmp_stderr.read(buffsize)
 | 
| 
 | 
    35                     if not stderr or len(stderr) % buffsize != 0:
 | 
| 
 | 
    36                         break
 | 
| 
 | 
    37             except OverflowError:
 | 
| 
 | 
    38                 pass
 | 
| 
 | 
    39             tmp_stderr.close()
 | 
| 
 | 
    40             os.remove(tmp_name)
 | 
| 
 | 
    41             stop_err(stderr)
 | 
| 
 | 
    42         return seq_path
 | 
| 
 | 
    43     except Exception, e:
 | 
| 
 | 
    44         stop_err('Error running faToTwoBit. ' + str(e))
 | 
| 
 | 
    45 
 | 
| 
 | 
    46 
 | 
| 
 | 
    47 def get_lines(feature):
 | 
| 
 | 
    48     # Get feature's line(s).
 | 
| 
 | 
    49     if isinstance(feature, gff_util.GFFFeature):
 | 
| 
 | 
    50         return feature.lines()
 | 
| 
 | 
    51     else:
 | 
| 
 | 
    52         return [feature.rstrip('\r\n')]
 | 
| 
 | 
    53 
 | 
| 
 | 
    54 
 | 
| 
 | 
    55 def reverse_complement(s):
 | 
| 
 | 
    56     complement_dna = {"A": "T", "T": "A", "C": "G", "G": "C", "a": "t", "t": "a", "c": "g", "g": "c", "N": "N", "n": "n"}
 | 
| 
 | 
    57     reversed_s = []
 | 
| 
 | 
    58     for i in s:
 | 
| 
 | 
    59         reversed_s.append(complement_dna[i])
 | 
| 
 | 
    60     reversed_s.reverse()
 | 
| 
 | 
    61     return "".join(reversed_s)
 | 
| 
 | 
    62 
 | 
| 
 | 
    63 
 | 
| 
 | 
    64 def stop_err(msg):
 | 
| 
 | 
    65     sys.stderr.write(msg)
 | 
| 
 | 
    66     sys.exit(1)
 | 
| 
 | 
    67 
 | 
| 
 | 
    68 parser = argparse.ArgumentParser()
 | 
| 
3
 | 
    69 parser.add_argument('--input_format', dest='input_format', help="Input dataset format")
 | 
| 
 | 
    70 parser.add_argument('--input', dest='input', help="Input dataset")
 | 
| 
 | 
    71 parser.add_argument('--genome', dest='genome', help="Input dataset genome build")
 | 
| 
 | 
    72 parser.add_argument('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff")
 | 
| 
 | 
    73 parser.add_argument('--columns', dest='columns', help="Columns to use in input file")
 | 
| 
 | 
    74 parser.add_argument('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file")
 | 
| 
 | 
    75 parser.add_argument('--reference_genome', dest='reference_genome', help="Reference genome file")
 | 
| 
 | 
    76 parser.add_argument('--output_format', dest='output_format', help="Output format")
 | 
| 
 | 
    77 parser.add_argument('--output', dest='output', help="Output dataset")
 | 
| 
0
 | 
    78 args = parser.parse_args()
 | 
| 
 | 
    79 
 | 
| 
 | 
    80 input_is_gff = args.input_format == 'gff'
 | 
| 
1
 | 
    81 interpret_features = input_is_gff and args.interpret_features == "yes"
 | 
| 
4
 | 
    82 if len(args.columns.split(',')) == 5:
 | 
| 
0
 | 
    83     # Bed file.
 | 
| 
4
 | 
    84     chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(args.columns)
 | 
| 
0
 | 
    85 else:
 | 
| 
 | 
    86     # Gff file.
 | 
| 
4
 | 
    87     chrom_col, start_col, end_col, strand_col = parse_cols_arg(args.columns)
 | 
| 
0
 | 
    88     name_col = False
 | 
| 
 | 
    89 
 | 
| 
 | 
    90 if args.reference_genome_source == "history":
 | 
| 
 | 
    91     seq_path = convert_to_twobit(args.reference_genome)
 | 
| 
 | 
    92 else:
 | 
| 
 | 
    93     seq_path = args.reference_genome
 | 
| 
 | 
    94 seq_dir = os.path.split(seq_path)[0]
 | 
| 
 | 
    95 
 | 
| 
 | 
    96 includes_strand_col = strand_col >= 0
 | 
| 
 | 
    97 strand = None
 | 
| 
 | 
    98 nibs = {}
 | 
| 
 | 
    99 skipped_lines = 0
 | 
| 
 | 
   100 first_invalid_line = 0
 | 
| 
 | 
   101 invalid_lines = []
 | 
| 
 | 
   102 warnings = []
 | 
| 
 | 
   103 warning = ''
 | 
| 
 | 
   104 twobitfile = None
 | 
| 
 | 
   105 line_count = 1
 | 
| 
 | 
   106 file_iterator = open(args.input)
 | 
| 
 | 
   107 if interpret_features:
 | 
| 
 | 
   108     file_iterator = gff_util.GFFReaderWrapper(file_iterator, fix_strand=False)
 | 
| 
 | 
   109 out = open(args.output, 'wt')
 | 
| 
 | 
   110 
 | 
| 
 | 
   111 for feature in file_iterator:
 | 
| 
 | 
   112     # Ignore comments, headers.
 | 
| 
 | 
   113     if isinstance(feature, (Header, Comment)):
 | 
| 
 | 
   114         line_count += 1
 | 
| 
 | 
   115         continue
 | 
| 
 | 
   116     name = ""
 | 
| 
 | 
   117     if interpret_features:
 | 
| 
 | 
   118         # Processing features.
 | 
| 
 | 
   119         gff_util.convert_gff_coords_to_bed(feature)
 | 
| 
 | 
   120         chrom = feature.chrom
 | 
| 
 | 
   121         start = feature.start
 | 
| 
 | 
   122         end = feature.end
 | 
| 
 | 
   123         strand = feature.strand
 | 
| 
 | 
   124     else:
 | 
| 
 | 
   125         # Processing lines, either interval or GFF format.
 | 
| 
 | 
   126         line = feature.rstrip('\r\n')
 | 
| 
 | 
   127         if line and not line.startswith("#"):
 | 
| 
 | 
   128             fields = line.split('\t')
 | 
| 
 | 
   129             try:
 | 
| 
 | 
   130                 chrom = fields[chrom_col]
 | 
| 
 | 
   131                 start = int(fields[start_col])
 | 
| 
 | 
   132                 end = int(fields[end_col])
 | 
| 
 | 
   133                 if name_col:
 | 
| 
 | 
   134                     name = fields[name_col]
 | 
| 
 | 
   135                 if input_is_gff:
 | 
| 
 | 
   136                     start, end = gff_util.convert_gff_coords_to_bed([start, end])
 | 
| 
 | 
   137                 if includes_strand_col:
 | 
| 
 | 
   138                     strand = fields[strand_col]
 | 
| 
 | 
   139             except:
 | 
| 
 | 
   140                 warning = "Invalid chrom, start or end column values. "
 | 
| 
 | 
   141                 warnings.append(warning)
 | 
| 
 | 
   142                 if not invalid_lines:
 | 
| 
 | 
   143                     invalid_lines = get_lines(feature)
 | 
| 
 | 
   144                     first_invalid_line = line_count
 | 
| 
 | 
   145                 skipped_lines += len(invalid_lines)
 | 
| 
 | 
   146                 continue
 | 
| 
 | 
   147             if start > end:
 | 
| 
 | 
   148                 warning = "Invalid interval, start '%d' > end '%d'.  " % (start, end)
 | 
| 
 | 
   149                 warnings.append(warning)
 | 
| 
 | 
   150                 if not invalid_lines:
 | 
| 
 | 
   151                     invalid_lines = get_lines(feature)
 | 
| 
 | 
   152                     first_invalid_line = line_count
 | 
| 
 | 
   153                 skipped_lines += len(invalid_lines)
 | 
| 
 | 
   154                 continue
 | 
| 
 | 
   155             if strand not in ['+', '-']:
 | 
| 
 | 
   156                 strand = '+'
 | 
| 
 | 
   157             sequence = ''
 | 
| 
 | 
   158         else:
 | 
| 
 | 
   159             continue
 | 
| 
 | 
   160     # Open sequence file and get sequence for feature/interval.
 | 
| 
 | 
   161     if os.path.exists("%s/%s.nib" % (seq_dir, chrom)):
 | 
| 
 | 
   162         if chrom in nibs:
 | 
| 
 | 
   163             nib = nibs[chrom]
 | 
| 
 | 
   164         else:
 | 
| 
 | 
   165             nibs[chrom] = nib = bx.seq.nib.NibFile(file("%s/%s.nib" % (seq_path, chrom)))
 | 
| 
 | 
   166         try:
 | 
| 
 | 
   167             sequence = nib.get(start, end - start)
 | 
| 
 | 
   168         except Exception, e:
 | 
| 
1
 | 
   169             warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome)
 | 
| 
0
 | 
   170             warnings.append(warning)
 | 
| 
 | 
   171             if not invalid_lines:
 | 
| 
 | 
   172                 invalid_lines = get_lines(feature)
 | 
| 
 | 
   173                 first_invalid_line = line_count
 | 
| 
 | 
   174             skipped_lines += len(invalid_lines)
 | 
| 
 | 
   175             continue
 | 
| 
5
 | 
   176     elif os.path.isfile(seq_path):
 | 
| 
0
 | 
   177         if not(twobitfile):
 | 
| 
 | 
   178             twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path))
 | 
| 
 | 
   179         try:
 | 
| 
 | 
   180             if interpret_features:
 | 
| 
 | 
   181                 # Create sequence from intervals within a feature.
 | 
| 
 | 
   182                 sequence = ''
 | 
| 
 | 
   183                 for interval in feature.intervals:
 | 
| 
 | 
   184                     sequence += twobitfile[interval.chrom][interval.start:interval.end]
 | 
| 
 | 
   185             else:
 | 
| 
 | 
   186                 sequence = twobitfile[chrom][start:end]
 | 
| 
 | 
   187         except:
 | 
| 
 | 
   188             warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (start, end - start, chrom)
 | 
| 
 | 
   189             warnings.append(warning)
 | 
| 
 | 
   190             if not invalid_lines:
 | 
| 
 | 
   191                 invalid_lines = get_lines(feature)
 | 
| 
 | 
   192                 first_invalid_line = line_count
 | 
| 
 | 
   193             skipped_lines += len(invalid_lines)
 | 
| 
 | 
   194             continue
 | 
| 
 | 
   195     else:
 | 
| 
1
 | 
   196         warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome)
 | 
| 
0
 | 
   197         warnings.append(warning)
 | 
| 
 | 
   198         if not invalid_lines:
 | 
| 
 | 
   199             invalid_lines = get_lines(feature)
 | 
| 
 | 
   200             first_invalid_line = line_count
 | 
| 
 | 
   201         skipped_lines += len(invalid_lines)
 | 
| 
 | 
   202         continue
 | 
| 
 | 
   203     if sequence == '':
 | 
| 
1
 | 
   204         warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome)
 | 
| 
0
 | 
   205         warnings.append(warning)
 | 
| 
 | 
   206         if not invalid_lines:
 | 
| 
 | 
   207             invalid_lines = get_lines(feature)
 | 
| 
 | 
   208             first_invalid_line = line_count
 | 
| 
 | 
   209         skipped_lines += len(invalid_lines)
 | 
| 
 | 
   210         continue
 | 
| 
 | 
   211     if includes_strand_col and strand == "-":
 | 
| 
 | 
   212         sequence = reverse_complement(sequence)
 | 
| 
 | 
   213     if args.output_format == "fasta":
 | 
| 
 | 
   214         l = len(sequence)
 | 
| 
 | 
   215         c = 0
 | 
| 
 | 
   216         if input_is_gff:
 | 
| 
 | 
   217             start, end = gff_util.convert_bed_coords_to_gff([start, end])
 | 
| 
1
 | 
   218         fields = [args.genome, str(chrom), str(start), str(end), strand]
 | 
| 
0
 | 
   219         meta_data = "_".join(fields)
 | 
| 
 | 
   220         if name.strip():
 | 
| 
 | 
   221             out.write(">%s %s\n" % (meta_data, name))
 | 
| 
 | 
   222         else:
 | 
| 
 | 
   223             out.write(">%s\n" % meta_data)
 | 
| 
 | 
   224         while c < l:
 | 
| 
 | 
   225             b = min(c + 50, l)
 | 
| 
 | 
   226             out.write("%s\n" % str(sequence[c:b]))
 | 
| 
 | 
   227             c = b
 | 
| 
 | 
   228     else:
 | 
| 
 | 
   229         # output_format == "interval".
 | 
| 
 | 
   230         if interpret_features:
 | 
| 
 | 
   231             meta_data = "\t".join([feature.chrom,
 | 
| 
 | 
   232                                    "galaxy_extract_genomic_dna",
 | 
| 
 | 
   233                                    "interval",
 | 
| 
 | 
   234                                    str(feature.start),
 | 
| 
 | 
   235                                    str(feature.end),
 | 
| 
 | 
   236                                    feature.score,
 | 
| 
 | 
   237                                    feature.strand,
 | 
| 
 | 
   238                                    ".",
 | 
| 
 | 
   239                                    gff_util.gff_attributes_to_str(feature.attributes, "GTF")])
 | 
| 
 | 
   240         else:
 | 
| 
 | 
   241             # Where is fields being set here?
 | 
| 
 | 
   242             meta_data = "\t".join(fields)
 | 
| 
 | 
   243         if input_is_gff:
 | 
| 
 | 
   244             format_str = "%s seq \"%s\";\n"
 | 
| 
 | 
   245         else:
 | 
| 
 | 
   246             format_str = "%s\t%s\n"
 | 
| 
 | 
   247         out.write(format_str % (meta_data, str(sequence)))
 | 
| 
 | 
   248     # Update line count.
 | 
| 
 | 
   249     if isinstance(feature, gff_util.GFFFeature):
 | 
| 
 | 
   250         line_count += len(feature.intervals)
 | 
| 
 | 
   251     else:
 | 
| 
 | 
   252         line_count += 1
 | 
| 
 | 
   253 out.close()
 | 
| 
 | 
   254 
 | 
| 
 | 
   255 if warnings:
 | 
| 
 | 
   256     warn_msg = "%d warnings, 1st is: " % len(warnings)
 | 
| 
 | 
   257     warn_msg += warnings[0]
 | 
| 
 | 
   258     print warn_msg
 | 
| 
 | 
   259 if skipped_lines:
 | 
| 
 | 
   260     # Error message includes up to the first 10 skipped lines.
 | 
| 
 | 
   261     print 'Skipped %d invalid lines, 1st is #%d, "%s"' % (skipped_lines, first_invalid_line, '\n'.join(invalid_lines[:10]))
 | 
| 
 | 
   262 
 | 
| 
 | 
   263 if args.reference_genome_source == "history":
 | 
| 
 | 
   264     os.remove(seq_path)
 |