Mercurial > repos > devteam > subtract
changeset 5:0d97b11ed3d5 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/subtract commit cae3e05d02e60f595bb8b6d77a84f030e9bd1689
| author | devteam | 
|---|---|
| date | Thu, 22 Jun 2017 18:51:24 -0400 | 
| parents | a43e5a6390c1 | 
| children | |
| files | gops_subtract.py macros.xml operation_filter.py subtract.xml tool_dependencies.xml utils/__init__.py utils/__init__.pyc utils/gff_util.py utils/gff_util.pyc utils/odict.py utils/odict.pyc | 
| diffstat | 10 files changed, 638 insertions(+), 119 deletions(-) [+] | 
line wrap: on
 line diff
--- a/gops_subtract.py Fri Dec 18 19:39:15 2015 -0500 +++ b/gops_subtract.py Thu Jun 22 18:51:24 2017 -0400 @@ -11,14 +11,18 @@ -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval """ +from __future__ import print_function + import fileinput import sys + +from bx.cookbook import doc_optparse from bx.intervals.io import GenomicInterval, NiceReaderWrapper from bx.intervals.operations.subtract import subtract -from bx.cookbook import doc_optparse from bx.tabular.io import ParseError from galaxy.tools.util.galaxyops import fail, parse_cols_arg, skipped -from utils.gff_util import GFFFeature, GFFReaderWrapper, convert_bed_coords_to_gff + +from utils.gff_util import convert_bed_coords_to_gff, GFFFeature, GFFReaderWrapper assert sys.version_info[:2] >= ( 2, 4 ) @@ -81,16 +85,17 @@ out_file.write( "%s\n" % "\t".join( feature.fields ) ) else: out_file.write( "%s\n" % feature ) - except ParseError, exc: + except ParseError as exc: out_file.close() fail( "Invalid file format: %s" % str( exc ) ) out_file.close() if g1.skipped > 0: - print skipped( g1, filedesc=" of 2nd dataset" ) + print(skipped( g1, filedesc=" of 2nd dataset" )) if g2.skipped > 0: - print skipped( g2, filedesc=" of 1st dataset" ) + print(skipped( g2, filedesc=" of 1st dataset" )) + if __name__ == "__main__": main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Thu Jun 22 18:51:24 2017 -0400 @@ -0,0 +1,20 @@ +<?xml version="1.0"?> +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="0.7.1">bx-python</requirement> + <requirement type="package" version="1.0.0">galaxy-ops</requirement> + </requirements> + </xml> + <token name="@SCREENCASTS@"> +----- + +**Screencasts!** + +See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). + +.. _Screencasts: https://galaxyproject.org/learn/interval-operations/ + +----- + </token> +</macros>
--- a/operation_filter.py Fri Dec 18 19:39:15 2015 -0500 +++ b/operation_filter.py Thu Jun 22 18:51:24 2017 -0400 @@ -1,8 +1,7 @@ # runs after the job (and after the default post-filter) +from galaxy.jobs.handler import JOB_ERROR from galaxy.tools.parameters import DataToolParameter -from galaxy.jobs.handler import JOB_ERROR - # Older py compatibility try: set() @@ -14,7 +13,7 @@ dbkeys = set() data_param_names = set() data_params = 0 - for name, param in page_param_map.iteritems(): + for name, param in page_param_map.items(): if isinstance( param, DataToolParameter ): # for each dataset parameter if param_values.get(name, None) is not None: @@ -53,7 +52,6 @@ try: if stderr and len( stderr ) > 0: raise Exception( stderr ) - except Exception: data.blurb = JOB_ERROR data.state = JOB_ERROR
--- a/subtract.xml Fri Dec 18 19:39:15 2015 -0500 +++ b/subtract.xml Thu Jun 22 18:51:24 2017 -0400 @@ -1,110 +1,95 @@ <tool id="gops_subtract_1" name="Subtract" version="1.0.0"> - <description>the intervals of two datasets</description> - <requirements> - <requirement type="package" version="0.7.1">bx-python</requirement> - <requirement type="package" version="1.0.0">galaxy-ops</requirement> - </requirements> - <command interpreter="python">gops_subtract.py - $input1 $input2 $output - - #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -1 1,4,5,7 --gff1 - #else: - -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} - #end if + <description>the intervals of two datasets</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <code file="operation_filter.py"/> + <command><![CDATA[ +python '$__tool_directory__/gops_subtract.py' +'$input1' +'$input2' +'$output' - #if isinstance( $input2.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -2 1,4,5,7 --gff2 - #else: - -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} - #end if +#if $input1.is_of_type('gff') + -1 1,4,5,7 --gff1 +#else: + -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} +#end if - -m $min $returntype - </command> - <inputs> - <param format="interval,gff" name="input2" type="data" help="Second dataset"> - <label>Subtract</label> - </param> - - <param format="interval,gff" name="input1" type="data" help="First dataset"> - <label>from</label> - </param> +#if $input2.is_of_type('gff') + -2 1,4,5,7 --gff2 +#else: + -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} +#end if - <param name="returntype" type="select" label="Return" help="of the first dataset (see figure below)"> - <option value="">Intervals with no overlap</option> - <option value="-p">Non-overlapping pieces of intervals</option> - </param> - - <param name="min" type="integer" value="1" min="1" help="(bp)"> - <label>where minimal overlap is</label> - </param> - </inputs> - <outputs> - <data format="input" name="output" metadata_source="input1"/> - </outputs> - <code file="operation_filter.py"/> - <trackster_conf/> - <tests> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2.bed" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops-subtract.dat" /> - </test> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2_mod.bed" ftype="interval"/> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_subtract_diffCols.dat" /> - </test> - <test> - <param name="input1" value="gops_subtract_bigint.bed" /> - <param name="input2" value="2.bed" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops-subtract.dat" /> - </test> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2.bed" /> - <param name="min" value="10" /> - <param name="returntype" value="Non-overlapping pieces of intervals" /> - <output name="output" file="gops-subtract-p.dat" /> - </test> - <!-- Subtract two GFF files. --> - <test> - <param name="input1" value="gops_subtract_in1.gff" /> - <param name="input2" value="gops_subtract_in2.gff" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_subtract_out1.gff" /> - </test> - <!-- Subtract BED file from GFF file. --> - <test> - <param name="input1" value="gops_subtract_in1.gff" /> - <param name="input2" value="gops_subtract_in2.bed" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_subtract_out1.gff" /> - </test> - </tests> - <help> - +-m $min +$returntype + ]]></command> + <inputs> + <param name="input2" type="data" format="interval,gff" label="Subtract" help="Second dataset" /> + <param name="input1" type="data" format="interval,gff" label="from" help="First dataset" /> + <param name="returntype" type="select" label="Return" help="of the first dataset (see figure below)"> + <option value="">Intervals with no overlap</option> + <option value="-p">Non-overlapping pieces of intervals</option> + </param> + <param name="min" type="integer" value="1" min="1" label="where minimal overlap is" help="(bp)" /> + </inputs> + <outputs> + <data name="output" format_source="input1" metadata_source="input1"/> + </outputs> + <tests> + <test> + <param name="input1" value="1.bed" /> + <param name="input2" value="2.bed" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops-subtract.dat" /> + </test> + <test> + <param name="input1" value="1.bed" /> + <param name="input2" value="2_mod.bed" ftype="interval"/> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_subtract_diffCols.dat" /> + </test> + <test> + <param name="input1" value="gops_subtract_bigint.bed" /> + <param name="input2" value="2.bed" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops-subtract.dat" /> + </test> + <test> + <param name="input1" value="1.bed" /> + <param name="input2" value="2.bed" /> + <param name="min" value="10" /> + <param name="returntype" value="Non-overlapping pieces of intervals" /> + <output name="output" file="gops-subtract-p.dat" /> + </test> + <!-- Subtract two GFF files. --> + <test> + <param name="input1" value="gops_subtract_in1.gff" /> + <param name="input2" value="gops_subtract_in2.gff" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_subtract_out1.gff" /> + </test> + <!-- Subtract BED file from GFF file. --> + <test> + <param name="input1" value="gops_subtract_in1.gff" /> + <param name="input2" value="gops_subtract_in2.bed" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_subtract_out1.gff" /> + </test> + </tests> + <help><![CDATA[ .. class:: infomark **TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ +@SCREENCASTS@ **Syntax** @@ -123,6 +108,5 @@ Non-overlapping pieces of intervals: .. image:: gops_subtractOverlappingPieces.gif - -</help> + ]]></help> </tool>
--- a/tool_dependencies.xml Fri Dec 18 19:39:15 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="bx-python" version="0.7.1"> - <repository changeset_revision="35e2457234ef" name="package_bx_python_0_7" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" /> - </package> - <package name="galaxy-ops" version="1.0.0"> - <repository changeset_revision="60c9a7af1345" name="package_galaxy_ops_1_0_0" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/gff_util.py Thu Jun 22 18:51:24 2017 -0400 @@ -0,0 +1,435 @@ +""" +Provides utilities for working with GFF files. +""" +import copy + +from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper +from bx.tabular.io import Comment, Header, ParseError + +from .odict import odict + + +class GFFInterval( GenomicInterval ): + """ + A GFF interval, including attributes. If file is strictly a GFF file, + only attribute is 'group.' + """ + def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, + strand_col=6, score_col=5, default_strand='.', fix_strand=False ): + # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this, + # temporarily set strand and then unset after initing GenomicInterval. + unknown_strand = False + if not fix_strand and fields[ strand_col ] == '.': + unknown_strand = True + fields[ strand_col ] = '+' + GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, + default_strand, fix_strand=fix_strand ) + if unknown_strand: + self.strand = '.' + self.fields[ strand_col ] = '.' + + # Handle feature, score column. + self.feature_col = feature_col + if self.feature_col >= self.nfields: + raise MissingFieldError( "No field for feature_col (%d)" % feature_col ) + self.feature = self.fields[ self.feature_col ] + self.score_col = score_col + if self.score_col >= self.nfields: + raise MissingFieldError( "No field for score_col (%d)" % score_col ) + self.score = self.fields[ self.score_col ] + + # GFF attributes. + self.attributes = parse_gff_attributes( fields[8] ) + + def copy( self ): + return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col, + self.end_col, self.strand_col, self.score_col, self.strand) + + +class GFFFeature( GFFInterval ): + """ + A GFF feature, which can include multiple intervals. + """ + def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, + strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], + raw_size=0 ): + # Use copy so that first interval and feature do not share fields. + GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, + start_col, end_col, strand_col, score_col, default_strand, + fix_strand=fix_strand ) + self.intervals = intervals + self.raw_size = raw_size + # Use intervals to set feature attributes. + for interval in self.intervals: + # Error checking. NOTE: intervals need not share the same strand. + if interval.chrom != self.chrom: + raise ValueError( "interval chrom does not match self chrom: %s != %s" % + ( interval.chrom, self.chrom ) ) + # Set start, end of interval. + if interval.start < self.start: + self.start = interval.start + if interval.end > self.end: + self.end = interval.end + + def name( self ): + """ Returns feature's name. """ + name = None + # Preference for name: GTF, GFF3, GFF. + for attr_name in ['gene_id', 'transcript_id', # GTF + 'ID', 'id', # GFF3 + 'group' ]: # GFF (TODO) + name = self.attributes.get( attr_name, None ) + if name is not None: + break + return name + + def copy( self ): + intervals_copy = [] + for interval in self.intervals: + intervals_copy.append( interval.copy() ) + return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col, + self.score_col, self.strand, intervals=intervals_copy ) + + def lines( self ): + lines = [] + for interval in self.intervals: + lines.append( '\t'.join( interval.fields ) ) + return lines + + +class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ): + """ + Reader wrapper that reads GFF intervals/lines and automatically converts + them to BED format. + """ + + def parse_row( self, line ): + # HACK: this should return a GFF interval, but bx-python operations + # require GenomicInterval objects and subclasses will not work. + interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, + self.end_col, self.strand_col, self.default_strand, + fix_strand=self.fix_strand ) + interval = convert_gff_coords_to_bed( interval ) + return interval + + +class GFFReaderWrapper( NiceReaderWrapper ): + """ + Reader wrapper for GFF files. + + Wrapper has two major functions: + + 1. group entries for GFF file (via group column), GFF3 (via id attribute), + or GTF (via gene_id/transcript id); + 2. convert coordinates from GFF format--starting and ending coordinates + are 1-based, closed--to the 'traditional'/BED interval format--0 based, + half-open. This is useful when using GFF files as inputs to tools that + expect traditional interval format. + """ + + def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, + end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ): + NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, + strand_col=strand_col, fix_strand=fix_strand, **kwargs ) + self.feature_col = feature_col + self.score_col = score_col + self.convert_to_bed_coord = convert_to_bed_coord + self.last_line = None + self.cur_offset = 0 + self.seed_interval = None + self.seed_interval_line_len = 0 + + def parse_row( self, line ): + interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, + self.start_col, self.end_col, self.strand_col, self.score_col, + self.default_strand, fix_strand=self.fix_strand ) + return interval + + def __next__( self ): + """ Returns next GFFFeature. """ + + # + # Helper function. + # + + def handle_parse_error( parse_error ): + """ Actions to take when ParseError found. """ + if self.outstream: + if self.print_delegate and hasattr(self.print_delegate, "__call__"): + self.print_delegate( self.outstream, e, self ) + self.skipped += 1 + # no reason to stuff an entire bad file into memmory + if self.skipped < 10: + self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) + + # For debugging, uncomment this to propogate parsing exceptions up. + # I.e. the underlying reason for an unexpected StopIteration exception + # can be found by uncommenting this. + # raise e + + # + # Get next GFFFeature + # + raw_size = self.seed_interval_line_len + + # If there is no seed interval, set one. Also, if there are no more + # intervals to read, this is where iterator dies. + if not self.seed_interval: + while not self.seed_interval: + try: + self.seed_interval = GenomicIntervalReader.next( self ) + except ParseError as e: + handle_parse_error( e ) + # TODO: When no longer supporting python 2.4 use finally: + # finally: + raw_size += len( self.current_line ) + + # If header or comment, clear seed interval and return it with its size. + if isinstance( self.seed_interval, ( Header, Comment ) ): + return_val = self.seed_interval + return_val.raw_size = len( self.current_line ) + self.seed_interval = None + self.seed_interval_line_len = 0 + return return_val + + # Initialize feature identifier from seed. + feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF + # For GFF3 + feature_id = self.seed_interval.attributes.get( 'ID', None ) + # For GTF. + feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) + + # Read all intervals associated with seed. + feature_intervals = [] + feature_intervals.append( self.seed_interval ) + while True: + try: + interval = GenomicIntervalReader.next( self ) + raw_size += len( self.current_line ) + except StopIteration as e: + # No more intervals to read, but last feature needs to be + # returned. + interval = None + raw_size += len( self.current_line ) + break + except ParseError as e: + handle_parse_error( e ) + raw_size += len( self.current_line ) + continue + # TODO: When no longer supporting python 2.4 use finally: + # finally: + # raw_size += len( self.current_line ) + + # Ignore comments. + if isinstance( interval, Comment ): + continue + + # Determine if interval is part of feature. + part_of = False + group = interval.attributes.get( 'group', None ) + # GFF test: + if group and feature_group == group: + part_of = True + # GFF3 test: + parent_id = interval.attributes.get( 'Parent', None ) + cur_id = interval.attributes.get( 'ID', None ) + if ( cur_id and cur_id == feature_id ) or ( parent_id and parent_id == feature_id ): + part_of = True + # GTF test: + transcript_id = interval.attributes.get( 'transcript_id', None ) + if transcript_id and transcript_id == feature_transcript_id: + part_of = True + + # If interval is not part of feature, clean up and break. + if not part_of: + # Adjust raw size because current line is not part of feature. + raw_size -= len( self.current_line ) + break + + # Interval associated with feature. + feature_intervals.append( interval ) + + # Last interval read is the seed for the next interval. + self.seed_interval = interval + self.seed_interval_line_len = len( self.current_line ) + + # Return feature. + feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, + self.end_col, self.strand_col, self.score_col, + self.default_strand, fix_strand=self.fix_strand, + intervals=feature_intervals, raw_size=raw_size ) + + # Convert to BED coords? + if self.convert_to_bed_coord: + convert_gff_coords_to_bed( feature ) + + return feature + next = __next__ # This line should be removed once the bx-python port to Python3 is finished + + +def convert_bed_coords_to_gff( interval ): + """ + Converts an interval object's coordinates from BED format to GFF format. + Accepted object types include GenomicInterval and list (where the first + element in the list is the interval's start, and the second element is + the interval's end). + """ + if isinstance( interval, GenomicInterval ): + interval.start += 1 + if isinstance( interval, GFFFeature ): + for subinterval in interval.intervals: + convert_bed_coords_to_gff( subinterval ) + elif type( interval ) is list: + interval[ 0 ] += 1 + return interval + + +def convert_gff_coords_to_bed( interval ): + """ + Converts an interval object's coordinates from GFF format to BED format. + Accepted object types include GFFFeature, GenomicInterval, and list (where + the first element in the list is the interval's start, and the second + element is the interval's end). + """ + if isinstance( interval, GenomicInterval ): + interval.start -= 1 + if isinstance( interval, GFFFeature ): + for subinterval in interval.intervals: + convert_gff_coords_to_bed( subinterval ) + elif type( interval ) is list: + interval[ 0 ] -= 1 + return interval + + +def parse_gff_attributes( attr_str ): + """ + Parses a GFF/GTF attribute string and returns a dictionary of name-value + pairs. The general format for a GFF3 attributes string is + + name1=value1;name2=value2 + + The general format for a GTF attribute string is + + name1 "value1" ; name2 "value2" + + The general format for a GFF attribute string is a single string that + denotes the interval's group; in this case, method returns a dictionary + with a single key-value pair, and key name is 'group' + """ + attributes_list = attr_str.split(";") + attributes = {} + for name_value_pair in attributes_list: + # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3 + # attribute; next, try double quotes for GTF. + pair = name_value_pair.strip().split("=") + if len( pair ) == 1: + pair = name_value_pair.strip().split("\"") + if len( pair ) == 1: + # Could not split for some reason -- raise exception? + continue + if pair == '': + continue + name = pair[0].strip() + if name == '': + continue + # Need to strip double quote from values + value = pair[1].strip(" \"") + attributes[ name ] = value + + if len( attributes ) == 0: + # Could not split attributes string, so entire string must be + # 'group' attribute. This is the case for strictly GFF files. + attributes['group'] = attr_str + return attributes + + +def gff_attributes_to_str( attrs, gff_format ): + """ + Convert GFF attributes to string. Supported formats are GFF3, GTF. + """ + if gff_format == 'GTF': + format_string = '%s "%s"' + # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id + id_attr = None + if 'group' in attrs: + id_attr = 'group' + elif 'ID' in attrs: + id_attr = 'ID' + elif 'Parent' in attrs: + id_attr = 'Parent' + if id_attr: + attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr] + elif gff_format == 'GFF3': + format_string = '%s=%s' + attrs_strs = [] + for name, value in attrs.items(): + attrs_strs.append( format_string % ( name, value ) ) + return " ; ".join( attrs_strs ) + + +def read_unordered_gtf( iterator, strict=False ): + """ + Returns GTF features found in an iterator. GTF lines need not be ordered + or clustered for reader to work. Reader returns GFFFeature objects sorted + by transcript_id, chrom, and start position. + """ + + # -- Get function that generates line/feature key. -- + + def get_transcript_id(fields): + return parse_gff_attributes( fields[8] )[ 'transcript_id' ] + + if strict: + # Strict GTF parsing uses transcript_id only to group lines into feature. + key_fn = get_transcript_id + else: + # Use lenient parsing where chromosome + transcript_id is the key. This allows + # transcripts with same ID on different chromosomes; this occurs in some popular + # datasources, such as RefGenes in UCSC. + def key_fn(fields): + return fields[0] + '_' + get_transcript_id( fields ) + + # Aggregate intervals by transcript_id and collect comments. + feature_intervals = odict() + comments = [] + for count, line in enumerate( iterator ): + if line.startswith( '#' ): + comments.append( Comment( line ) ) + continue + + line_key = key_fn( line.split('\t') ) + if line_key in feature_intervals: + feature = feature_intervals[ line_key ] + else: + feature = [] + feature_intervals[ line_key ] = feature + feature.append( GFFInterval( None, line.split( '\t' ) ) ) + + # Create features. + chroms_features = {} + for count, intervals in enumerate( feature_intervals.values() ): + # Sort intervals by start position. + intervals.sort( lambda a, b: cmp( a.start, b.start ) ) + feature = GFFFeature( None, intervals=intervals ) + if feature.chrom not in chroms_features: + chroms_features[ feature.chrom ] = [] + chroms_features[ feature.chrom ].append( feature ) + + # Sort features by chrom, start position. + chroms_features_sorted = [] + for chrom_features in chroms_features.values(): + chroms_features_sorted.append( chrom_features ) + chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) ) + for features in chroms_features_sorted: + features.sort( lambda a, b: cmp( a.start, b.start ) ) + + # Yield comments first, then features. + # FIXME: comments can appear anywhere in file, not just the beginning. + # Ideally, then comments would be associated with features and output + # just before feature/line. + for comment in comments: + yield comment + + for chrom_features in chroms_features_sorted: + for feature in chrom_features: + yield feature
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/odict.py Thu Jun 22 18:51:24 2017 -0400 @@ -0,0 +1,86 @@ +""" +Ordered dictionary implementation. +""" + +from UserDict import UserDict + + +class odict(UserDict): + """ + http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/107747 + + This dictionary class extends UserDict to record the order in which items are + added. Calling keys(), values(), items(), etc. will return results in this + order. + """ + def __init__( self, dict=None ): + self._keys = [] + UserDict.__init__( self, dict ) + + def __delitem__( self, key ): + UserDict.__delitem__( self, key ) + self._keys.remove( key ) + + def __setitem__( self, key, item ): + UserDict.__setitem__( self, key, item ) + if key not in self._keys: + self._keys.append( key ) + + def clear( self ): + UserDict.clear( self ) + self._keys = [] + + def copy(self): + new = odict() + new.update( self ) + return new + + def items( self ): + return zip( self._keys, self.values() ) + + def keys( self ): + return self._keys[:] + + def popitem( self ): + try: + key = self._keys[-1] + except IndexError: + raise KeyError( 'dictionary is empty' ) + val = self[ key ] + del self[ key ] + return ( key, val ) + + def setdefault( self, key, failobj=None ): + if key not in self._keys: + self._keys.append( key ) + return UserDict.setdefault( self, key, failobj ) + + def update( self, dict ): + for ( key, val ) in dict.items(): + self.__setitem__( key, val ) + + def values( self ): + return map( self.get, self._keys ) + + def iterkeys( self ): + return iter( self._keys ) + + def itervalues( self ): + for key in self._keys: + yield self.get( key ) + + def iteritems( self ): + for key in self._keys: + yield key, self.get( key ) + + def __iter__( self ): + for key in self._keys: + yield key + + def reverse( self ): + self._keys.reverse() + + def insert( self, index, key, item ): + if key not in self._keys: + self._keys.insert( index, key ) + UserDict.__setitem__( self, key, item )
