Mercurial > repos > iuc > meme_fimo

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Users/gvk/work/git_workspace/tools-iuc/tools/meme/fimo.xml	Wed Jun 29 08:24:33 2016 -0400
@@ -0,0 +1,294 @@
+<tool id="meme_fimo" name="FIMO" version="4.11.0.3">
+    <description>- Scan a set of sequences for motifs.</description>
+    <requirements>
+        <requirement type="package" version="6.9.3">imagemagick</requirement>
+        <requirement type="package" version="4.11.0">meme</requirement>
+    </requirements>
+    <command>
+        <![CDATA[
+            mkdir -p output &&
+            python $__tool_directory__/fimo_wrapper.py
+            --input_motifs "${input_motifs}"
+            #if str($fasta_type.fasta_type_selector) == 'history':
+                --input_fasta "${fasta_type.input_database}"
+            #else:
+                --input_fasta "${fasta_type.input_database.fields.path}"
+            #end if
+            --options_type $options_type.options_type_selector
+            #if str($options_type.options_type_selector) == 'advanced':
+                --alpha "${options_type.alpha}"
+                #if str($options_type.bgfile_type.bgfile_type_selector) == 'motif_file':
+                    --bgfile "motif-file"
+                #elif str($options_type.bgfile_type.bgfile_type_selector) == 'bgfile':
+                    --bgfile "${options_type.bgfile_type.bgfile}"
+                #end if
+                ${options_type.max_strand}
+                --max_stored_scores "${options_type.max_stored_scores}"
+                #if str($options_type.motifs_cond.motifs_selector) == 'no':
+                    #for $motif in $options_type.motifs:
+                        --motif "${motif.motif}"
+                    #end for
+                #end if
+                --output_separate_motifs ${options_type.output_separate_motifs}
+                --motif_pseudo "${options_type.motif_pseudo}"
+                ${options_type.no_qvalue}
+                ${options_type.norc}
+                #if str($options_type.parse_genomic_coord_cond.parse_genomic_coord) == 'yes':
+                    --parse_genomic_coord 'yes'
+                    --remove_duplicate_coords ${options_type.parse_genomic_coord_cond.remove_duplicate_coords}
+                #end if
+                #if str($options_type.psp_cond.psp_selector) == 'yes':
+                    --input_psp "${input_psp}"
+                #end if
+                #if str($options_type.prior_dist_cond.prior_dist_selector) == 'yes':
+                    --input_prior_dist "${input_prior_dist}"
+                #end if
+                ${options_type.qv_thresh}
+                --thresh ${options_type.thresh}
+            #end if
+            --output_path '${html_outfile.files_path}'
+            --html_output "${html_outfile}"
+            --interval_output '${interval_outfile}'
+            --txt_output "${txt_outfile}"
+            --xml_output "${xml_outfile}"
+            --gff_output "${gff_outfile}"
+        ]]>
+    </command>
+    <inputs>
+        <param name="input_motifs" type="data" format="memexml" label="'MEME output' formatted file"/>
+        <conditional name="fasta_type">
+            <param name="fasta_type_selector" type="select" label="Source for sequence to search">
+                <option value="cached">Locally Cached sequences</option>
+                <option value="history" selected="true">Sequences from your history</option>
+            </param>
+            <when value="cached">
+                <param name="input_database" type="select" label="Genome to search">
+                    <options from_data_table="all_fasta" />
+                </param>
+            </when>
+            <when value="history">
+                <param format="fasta" name="input_database" type="data" label="Sequences"/>
+            </when>
+        </conditional>
+        <conditional name="options_type">
+            <param name="options_type_selector" type="select" label="Options configuration">
+                <option value="basic" selected="true">Basic</option>
+                <option value="advanced">Advanced</option>
+            </param>
+            <when value="basic" />
+            <when value="advanced">
+                <param name="alpha" type="float" value="1.0" min="0" max="1.0" label="Alpha parameter for calculating position specific priors" help="Represents the fraction of all transcription factor binding sites that are binding sites for the TF of interest (must be between 0 and 1)."/>
+                <conditional name="bgfile_type">
+                    <param name="bgfile_type_selector" type="select" label="Background file type">
+                        <option value="default" selected="true">Use frequencies embedded in the application from the non-redundant database</option>
+                        <option value="motif_file">Use frequencies from motif file</option>
+                        <option value="bgfile">Use frequencies from background file</option>
+                    </param>
+                    <when value="motif_file" />
+                    <when value="default" />
+                    <when value="bgfile">
+                        <param name="bgfile" type="data" format="txt" optional="True" label="Background Model" help="File must be in MEME background file format."/>
+                    </when>
+                </conditional>
+                <param name="max_strand" label="If matches on both strands at a given position satisfy the output threshold, only report the match for the strand with the higher score" type="boolean" truevalue="--max_strand" falsevalue="" checked="False" help="If the scores are tied, the matching strand is chosen at random.  Leave unchecked to report both matches."/>
+                <param name="max_stored_scores" type="integer" value="100000" label="Maximum number of scores that will be stored" />
+                <conditional name="motifs_cond">
+                    <param name="motifs_selector" type="select" label="Use all motifs in input?">
+                        <option value="yes" selected="true">Yes</option>
+                        <option value="no">No</option>
+                    </param>
+                    <when value="yes"/>
+                    <when value="no">
+                        <repeat name="motifs" title="Limit to specified motif">
+                            <param name="motif" type="text" value="" label="Specify motif by id" />
+                        </repeat>
+                    </when>
+                </conditional>
+                <param name="output_separate_motifs" type="select" label="Output a dataset per motif?" help="Output a collection consisting of a separate dataset for each motif in the input">
+                    <option value="no" selected="true">No</option>
+                    <option value="yes">Yes</option>
+                </param>
+                <param name="motif_pseudo" type="float" value="0.1" label="Pseudocount to add to counts in motif matrix" help="A pseudocount to be added to each count in the motif matrix, after first multiplying by the corresponding background frequency"/>
+                <param name="no_qvalue" label="Do not compute a q-value for each p-value" type="boolean" truevalue="--no_qvalue" falsevalue="" checked="True" help="The q-value calculation is that of Benjamini and Hochberg (1995)."/>
+                <param name="norc" label="Do not score the reverse complement DNA strand" type="boolean" truevalue="--norc" falsevalue="" checked="False" />
+                <conditional name="parse_genomic_coord_cond">
+                    <param name="parse_genomic_coord" label="Check each sequence header for UCSC style genomic coordinates" type="select">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
+                    </param>
+                    <when value="yes">
+                        <param name="remove_duplicate_coords" type="select" label="Remove duplicate entries in unique GFF coordinates?" help="Remove duplicate entries as defined by the unique GFF coordinates">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
+                    </param>
+                    </when>
+                    <when value="no"/>
+                </conditional>
+                <conditional name="psp_cond">
+                    <param name="psp_selector" type="select" label="Use position-specific priors?">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
+                    </param>
+                    <when value="no"/>
+                    <when value="yes">
+                        <param name="input_psp" type="data" format="txt" label="Select dataset containing position specific priors" help="Format must be meme psp or wiggle."/>
+                    </when>
+                </conditional>
+                <conditional name="prior_dist_cond">
+                    <param name="prior_dist_selector" type="select" label="Use binned distribution of priors?">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
+                    </param>
+                    <when value="no"/>
+                    <when value="yes">
+                        <param name="input_prior_dist" type="data" format="txt" label="Select dataset containing binned distribution of priors"/>
+                    </when>
+                </conditional>
+                <param name="qv_thresh" label="Use q-values for the output threshold" type="boolean" truevalue="--qv_thresh" falsevalue="" checked="False" help="Leave unchecked to use p-values for the output threshold."/>
+                <param name="thresh" type="float" value="1e-4" label="Output threshold for displaying search results" help="Only search results with a p-value less than the threshold will be output. The threshold can be set to use q-values rather than p-values via the option above."/>
+            </when>
+        </conditional>
+        <param name="non_commercial_use" label="I certify that I am not using this tool for commercial purposes." type="boolean" truevalue="NON_COMMERCIAL_USE" falsevalue="COMMERCIAL_USE" checked="False">
+            <validator type="expression" message="This tool is only available for non-commercial use.">value == True</validator>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)">
+            <actions>
+                <conditional name="fasta_type.fasta_type_selector">
+                    <when value="cached">
+                        <action type="metadata" name="dbkey">
+                            <option type="from_data_table" name="all_fasta" column="1" offset="0">
+                                <filter type="param_value" column="0" value="seq" keep="True"/>
+                                <filter type="param_value" ref="fasta_type.input_database" column="1"/>
+                            </option>
+                        </action>
+                    </when>
+                </conditional>
+            </actions>
+        </data>
+        <data format="tabular" name="txt_outfile" label="${tool.name} on ${on_string} (text)">
+            <actions>
+                <conditional name="fasta_type.fasta_type_selector">
+                    <when value="cached">
+                        <action type="metadata" name="dbkey">
+                            <option type="from_data_table" name="all_fasta" column="1" offset="0">
+                                <filter type="param_value" ref="fasta_type.input_database" column="0"/>
+                            </option>
+                        </action>
+                    </when>
+                </conditional>
+            </actions>
+        </data>
+        <data format="tabular" name="gff_outfile" label="${tool.name} on ${on_string} (almost-gff)">
+            <filter>options_type['output_separate_motifs'] == 'no'</filter>
+            <actions>
+                <conditional name="fasta_type.fasta_type_selector">
+                    <when value="cached">
+                        <action type="metadata" name="dbkey">
+                            <option type="from_data_table" name="all_fasta" column="1" offset="0">
+                                <filter type="param_value" ref="fasta_type.input_database" column="0"/>
+                            </option>
+                        </action>
+                    </when>
+                </conditional>
+            </actions>
+        </data>
+        <collection name="motifs" type="list" label="Motifs: ${tool.name} on ${on_string}">
+            <discover_datasets pattern="(?P&lt;designation&gt;.*)" directory="output" ext="gff" visible="false" />
+            <filter>options_type['output_separate_motifs'] == 'yes'</filter>
+        </collection>
+        <data format="cisml" name="xml_outfile" label="${tool.name} on ${on_string} (xml)">
+            <actions>
+                <conditional name="fasta_type.fasta_type_selector">
+                    <when value="cached">
+                        <action type="metadata" name="dbkey">
+                            <option type="from_data_table" name="all_fasta" column="1" offset="0">
+                                <filter type="param_value" ref="fasta_type.input_database" column="0"/>
+                            </option>
+                        </action>
+                    </when>
+                </conditional>
+            </actions>
+        </data>
+        <data format="interval" name="interval_outfile" label="${tool.name} on ${on_string} (interval)">
+            <actions>
+                <conditional name="fasta_type.fasta_type_selector">
+                    <when value="cached">
+                        <action type="metadata" name="dbkey">
+                            <option type="from_data_table" name="all_fasta" column="1" offset="0">
+                                <filter type="param_value" ref="fasta_type.input_database" column="0"/>
+                            </option>
+                        </action>
+                    </when>
+                </conditional>
+            </actions>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_motifs" value="meme_output_xml_1.xml" ftype="memexml"/>
+            <param name="fasta_type_selector" value="history"/>
+            <param name="input_database" value="phiX.fasta" ftype="fasta"/>
+            <param name="options_type_selector" value="basic"/>
+            <param name="non_commercial_use" value="True"/>
+            <output name="html_outfile" file="fimo_output_html_1.html" compare="contains"/>
+            <output name="txt_outfile" file="fimo_output_txt_1.txt" compare="contains"/>
+            <output name="gff_outfile" file="fimo_output_almost-gff_1.txt" compare="contains"/>
+            <output name="xml_outfile" file="fimo_output_xml_1.xml" compare="contains"/>
+            <output name="interval_outfile" file="fimo_output_interval_1.txt" compare="contains"/>
+        </test>
+        <test>
+            <param name="input_motifs" value="meme_output_xml_1.xml" ftype="memexml"/>
+            <param name="fasta_type_selector" value="history"/>
+            <param name="input_database" value="phiX.fasta" ftype="fasta"/>
+            <param name="options_type_selector" value="advanced"/>
+            <param name="non_commercial_use" value="True"/>
+            <output name="html_outfile" file="fimo_output_html_2.html" compare="contains"/>
+            <output name="txt_outfile" file="fimo_output_txt_2.txt" compare="contains"/>
+            <output name="gff_outfile" file="fimo_output_almost-gff_2.txt" compare="contains"/>
+            <output name="xml_outfile" file="fimo_output_xml_2.xml" compare="contains"/>
+            <output name="interval_outfile" file="fimo_output_interval_2.txt" compare="contains"/>
+        </test>
+        <test>
+            <param name="input_motifs" value="meme_output_xml_1.xml" ftype="memexml"/>
+            <param name="fasta_type_selector" value="history"/>
+            <param name="input_database" value="phiX.fasta" ftype="fasta"/>
+            <param name="options_type_selector" value="advanced"/>
+            <param name="parse_genomic_coord" value="--parse_genomic_coord"/>
+            <param name="remove_duplicate_coords" value="yes"/>
+            <param name="output_separate_motifs" value="yes"/>
+            <param name="non_commercial_use" value="True"/>
+            <output name="html_outfile" file="fimo_output_html_2.html" compare="contains"/>
+            <output name="txt_outfile" file="fimo_output_txt_2.txt" compare="contains"/>
+            <output_collection name="motifs" type="list">
+                <element name="MOTIF1.gff" file="motif1.gff" ftype="gff" compare="contains"/>
+            </output_collection>
+            <output name="xml_outfile" file="fimo_output_xml_2.xml" compare="contains"/>
+            <output name="interval_outfile" file="fimo_output_interval_2.txt" compare="contains"/>
+        </test>
+    </tests>
+    <help>
+
+.. class:: warningmark
+
+**WARNING: This tool is only available for non-commercial use. Use for educational, research and non-profit purposes is permitted.
+Before using, be sure to review, agree, and comply with the license.**
+
+FIMO scans a sequence database for individual matches to each of the motifs you provide (sample output for motifs and sequences).
+The name FIMO stands for 'Find Individual Motif Occurrences'.  The program searches a database of sequences for occurrences of
+known motifs, treating each motif independently.  Motifs must be in MEME Motif Format.  You can define the statistical threshold
+(p-value) for motifs and whether FIMO scans just the given sequences or their reverse complements (where applicable).
+
+.. class:: infomark
+
+For detailed information on FIMO, click here_, or view the license_.
+
+.. _here: http://meme-suite.org/doc/fimo.html?man_type=web
+.. _license: http://meme-suite.org/doc/copyright.html?man_type=web
+
+    </help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btr064</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Users/gvk/work/git_workspace/tools-iuc/tools/meme/fimo_wrapper.py	Wed Jun 29 08:24:33 2016 -0400
@@ -0,0 +1,193 @@
+#!/usr/bin/env python
+import argparse
+import os
+import shutil
+import string
+import subprocess
+import sys
+import tempfile
+
+BUFFSIZE = 1048576
+# Translation table for reverse Complement, with ambiguity codes.
+DNA_COMPLEMENT = string.maketrans("ACGTRYKMBDHVacgtrykmbdhv", "TGCAYRMKVHDBtgcayrmkvhdb")
+
+
+def get_stderr(tmp_stderr):
+    tmp_stderr.seek(0)
+    stderr = ''
+    try:
+        while True:
+            stderr += tmp_stderr.read(BUFFSIZE)
+            if not stderr or len(stderr) % BUFFSIZE != 0:
+                break
+    except OverflowError:
+        pass
+    return stderr
+
+
+def reverse(sequence):
+    # Reverse sequence string.
+    return sequence[::-1]
+
+
+def dna_complement(sequence):
+    # Complement DNA sequence string.
+    return sequence.translate(DNA_COMPLEMENT)
+
+
+def dna_reverse_complement(sequence):
+    # Returns the reverse complement of the sequence.
+    sequence = reverse(sequence)
+    return dna_complement(sequence)
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit(1)
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--input_motifs', dest='input_motifs', help='MEME output formatted files for input to fimo')
+parser.add_argument('--input_fasta', dest='input_fasta', help='Fassta sequence file')
+parser.add_argument('--options_type', dest='options_type', help='Basic or Advance options')
+parser.add_argument('--input_psp', dest='input_psp', default=None, help='File containing position specific priors')
+parser.add_argument('--input_prior_dist', dest='input_prior_dist', default=None, help='File containing binned distribution of priors')
+parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, help='The alpha parameter for calculating position specific priors')
+parser.add_argument('--bgfile', dest='bgfile', default=None, help='Background file type, used only if not "default"')
+parser.add_argument('--max_strand', action='store_true', help='If matches on both strands at a given position satisfy the output threshold, only report the match for the strand with the higher score')
+parser.add_argument('--max_stored_scores', dest='max_stored_scores', type=int, help='Maximum score count to store')
+parser.add_argument('--motif', dest='motifs', action='append', default=[], help='Specify motif by id')
+parser.add_argument('--output_separate_motifs', dest='output_separate_motifs', default='no', help='Output one dataset per motif')
+parser.add_argument('--motif_pseudo', dest='motif_pseudo', type=float, default=0.1, help='Pseudocount to add to counts in motif matrix')
+parser.add_argument('--no_qvalue', action='store_true', help='Do not compute a q-value for each p-value')
+parser.add_argument('--norc', action='store_true', help='Do not score the reverse complement DNA strand')
+parser.add_argument('--output_path', dest='output_path', help='Output files directory')
+parser.add_argument('--parse_genomic_coord', dest='parse_genomic_coord', default='no', help='Check each sequence header for UCSC style genomic coordinates')
+parser.add_argument('--remove_duplicate_coords', dest='remove_duplicate_coords', default='no', help='Remove duplicate entries in unique GFF coordinates')
+parser.add_argument('--qv_thresh', action='store_true', help='Use q-values for the output threshold')
+parser.add_argument('--thresh', dest='thresh', type=float, help='p-value threshold')
+parser.add_argument('--gff_output', dest='gff_output', help='Gff output file')
+parser.add_argument('--html_output', dest='html_output', help='HTML output file')
+parser.add_argument('--interval_output', dest='interval_output', help='Interval output file')
+parser.add_argument('--txt_output', dest='txt_output', help='Text output file')
+parser.add_argument('--xml_output', dest='xml_output', help='XML output file')
+args = parser.parse_args()
+
+fimo_cmd_list = ['fimo']
+if args.options_type == 'advanced':
+    fimo_cmd_list.append('--alpha %4f' % args.alpha)
+    if args.bgfile is not None:
+        fimo_cmd_list.append('--bgfile "%s"' % args.bgfile)
+    if args.max_strand:
+        fimo_cmd_list.append('--max-strand')
+    fimo_cmd_list.append('--max-stored-scores %d' % args.max_stored_scores)
+    if len(args.motifs) > 0:
+        for motif in args.motifs:
+            fimo_cmd_list.append('--motif "%s"' % motif)
+    fimo_cmd_list.append('--motif-pseudo %4f' % args.motif_pseudo)
+    if args.no_qvalue:
+        fimo_cmd_list.append('--no-qvalue')
+    if args.norc:
+        fimo_cmd_list.append('--norc')
+    if args.parse_genomic_coord == 'yes':
+        fimo_cmd_list.append('--parse-genomic-coord')
+    if args.qv_thresh:
+        fimo_cmd_list.append('--qv-thresh')
+    fimo_cmd_list.append('--thresh %4f' % args.thresh)
+    if args.input_psp is not None:
+        fimo_cmd_list.append('--psp "%s"' % args.input_psp)
+    if args.input_prior_dist is not None:
+        fimo_cmd_list.append('--prior-dist "%s"' % args.input_prior_dist)
+fimo_cmd_list.append('--o "%s"' % (args.output_path))
+fimo_cmd_list.append('--verbosity 1')
+fimo_cmd_list.append(args.input_motifs)
+fimo_cmd_list.append(args.input_fasta)
+
+fimo_cmd = ' '.join(fimo_cmd_list)
+
+try:
+    tmp_stderr = tempfile.NamedTemporaryFile()
+    proc = subprocess.Popen(args=fimo_cmd, shell=True, stderr=tmp_stderr)
+    returncode = proc.wait()
+    if returncode != 0:
+        stderr = get_stderr(tmp_stderr)
+        stop_err(stderr)
+except Exception, e:
+    stop_err('Error running FIMO:\n%s' % str(e))
+
+shutil.move(os.path.join(args.output_path, 'fimo.txt'), args.txt_output)
+
+gff_file = os.path.join(args.output_path, 'fimo.gff')
+if args.remove_duplicate_coords == 'yes':
+    tmp_stderr = tempfile.NamedTemporaryFile()
+    # Identify and eliminating identical motif occurrences.  These
+    # are identical if the combination of chrom, start, end and
+    # motif id are identical.
+    cmd = 'sort -k1,1 -k4,4n -k5,5n -k9.1,9.6 -u -o %s %s' % (gff_file, gff_file)
+    proc = subprocess.Popen(args=cmd, stderr=tmp_stderr, shell=True)
+    returncode = proc.wait()
+    if returncode != 0:
+        stderr = get_stderr(tmp_stderr)
+        stop_err(stderr)
+    # Sort GFF output by a combination of chrom, score, start.
+    cmd = 'sort -k1,1 -k4,4n -k6,6n -o %s %s' % (gff_file, gff_file)
+    proc = subprocess.Popen(args=cmd, stderr=tmp_stderr, shell=True)
+    returncode = proc.wait()
+    if returncode != 0:
+        stderr = get_stderr(tmp_stderr)
+        stop_err(stderr)
+if args.output_separate_motifs == 'yes':
+    # Create the collection output directory.
+    collection_path = (os.path.join(os.getcwd(), 'output'))
+    # Keep track of motif occurrences.
+    header_line = None
+    motif_ids = []
+    file_handles = []
+    for line in open(gff_file, 'r'):
+        if line.startswith('#'):
+            if header_line is None:
+                header_line = line
+            continue
+        items = line.split('\t')
+        attribute = items[8]
+        attributes = attribute.split(';')
+        name = attributes[0]
+        motif_id = name.split('=')[1]
+        file_name = os.path.join(collection_path, 'MOTIF%s.gff' % motif_id)
+        if motif_id in motif_ids:
+            i = motif_ids.index(motif_id)
+            fh = file_handles[i]
+            fh.write(line)
+        else:
+            fh = open(file_name, 'wb')
+            if header_line is not None:
+                fh.write(header_line)
+            fh.write(line)
+            motif_ids.append(motif_id)
+            file_handles.append(fh)
+    for file_handle in file_handles:
+        file_handle.close()
+else:
+    shutil.move(gff_file, args.gff_output)
+shutil.move(os.path.join(args.output_path, 'fimo.xml'), args.xml_output)
+shutil.move(os.path.join(args.output_path, 'fimo.html'), args.html_output)
+
+out_file = open(args.interval_output, 'wb')
+out_file.write("#%s\n" % "\t".join(("chr", "start", "end", "pattern name", "score", "strand", "matched sequence", "p-value", "q-value")))
+for line in open(args.txt_output):
+    if line.startswith('#'):
+        continue
+    fields = line.rstrip("\n\r").split("\t")
+    start, end = int(fields[2]), int(fields[3])
+    sequence = fields[7]
+    if start > end:
+        # Flip start and end and set strand.
+        start, end = end, start
+        strand = "-"
+        # We want sequences relative to strand; FIMO always provides + stranded sequence.
+        sequence = dna_reverse_complement(sequence)
+    else:
+        strand = "+"
+    # Make 0-based start position.
+    start -= 1
+    out_file.write("%s\n" % "\t".join([fields[1], str(start), str(end), fields[0], fields[4], strand, sequence, fields[5], fields[6]]))
+out_file.close()