diff interproscan.xml @ 0:bfeae84e23ee draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/interproscan commit 2f5d27a375fcc2e8d77914b3d9e402a9e2df2d97"
author iuc
date Mon, 15 Nov 2021 17:20:51 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interproscan.xml	Mon Nov 15 17:20:51 2021 +0000
@@ -0,0 +1,332 @@
+<tool id="interproscan" name="InterProScan" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.09">
+    <description>functional annotation</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <xrefs>
+        <xref type="bio.tools">interproscan_4</xref>
+    </xrefs>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">interproscan</requirement>
+    </requirements>
+    <version_command>interproscan.sh --version</version_command>
+    <command><![CDATA[
+## Adapt properties file to use data from data table
+mkdir -p \$HOME/.interproscan-5
+&&
+sed 's|^\(data.directory=\).*$|\1${database.fields.path}|' \$(dirname \$(readlink -f \$(command -v interproscan.sh)))/interproscan.properties > \$HOME/.interproscan-5/interproscan.properties
+&&
+
+## Now run interproscan
+interproscan.sh
+
+## disables the precalculated lookup service, all calculation will be run locally
+-dp
+--input '$input'
+--seqtype $seqtype
+-f ${','.join($oformat)}
+
+#if $licensed.use == 'true' and $licensed.applications_licensed:
+    --applications ${','.join($applications)},${','.join($licensed.applications_licensed)}
+#else:
+    --applications ${','.join($applications)}
+#end if
+--tempdir \$TEMP
+
+$pathways
+$goterms
+$iprlookup
+
+--cpu \${GALAXY_SLOTS:-4}
+
+--output-file-base 'output'
+    ]]></command>
+    <inputs>
+        <param argument="--input" type="data" format="fasta" label="Protein FASTA File"/>
+
+        <param argument="--seqtype" type="select" label="Type of the input sequences" help="">
+            <option value="p" selected="true">Protein</option>
+            <option value="n">DNA / RNA</option>
+        </param>
+
+        <param name="database" label="InterProScan database" type="select">
+            <options from_data_table="interproscan">
+                <column name="value" index="0" />
+                <column name="name" index="1" />
+                <column name="path" index="3" />
+                <filter type="sort_by" column="0" />
+                <filter type="static_value" column="2" value="@TOOL_VERSION@" />
+            </options>
+        </param>
+
+        <param name="applications" type="select" multiple="True" label="Applications to run" help="Select your program">
+            <option value="TIGRFAM" selected="true">TIGRFAM: protein families based on hidden Markov models (HMMs)</option>
+            <option value="SFLD" selected="true">SFLD: a database of protein families based on hidden Markov models (HMMs)</option>
+            <option value="SUPERFAMILY" selected="true">SUPERFAMILY: database of structural and functional annotation for all proteins and genomes</option>
+            <option value="PANTHER" selected="true">PANTHER: Protein ANalysis THrough Evolutionary Relationships</option>
+            <option value="Gene3D" selected="true">Gene3d: Structural assignment for whole genes and genomes using the CATH domain structure database</option>
+            <option value="Hamap" selected="true">HAMAP: High-quality Automated Annotation of Microbial Proteomes</option>
+            <option value="PrositeProfiles" selected="true">PROSITE Profiles: protein domains, families and functional sites as well as associated profiles to identify them</option>
+            <option value="Coils" selected="true">Coils: Prediction of Coiled Coil Regions in Proteins</option>
+            <option value="SMART" selected="true">SMART: identification and analysis of domain architectures based on Hidden Markov Models or HMMs</option>
+            <option value="CDD" selected="true">SMART: protein domains and families based on well-annotated multiple sequence alignment models</option>
+            <option value="PRINTS" selected="true">PRINTS: group of conserved motifs (fingerprints) used to characterise a protein family</option>
+            <option value="PIRSR" selected="true">PIRSR: protein families based on hidden Markov models (HMMs) and Site Rules</option>
+            <option value="PrositePatterns" selected="true">PROSITE Pattern: protein domains, families and functional sites as well as associated patterns to identify them</option>
+            <option value="Pfam" selected="true">Pfam: protein families, each represented by multiple sequence alignments and hidden Markov models</option>
+            <option value="MobiDBLite" selected="true">MobiDBLite: Prediction of intrinsically disordered regions in proteins</option>
+            <option value="PIRSF" selected="true">PIRSF: non-overlapping clustering of UniProtKB sequences into a hierarchical order (evolutionary relationships)</option>
+        </param>
+
+        <conditional name="licensed">
+            <param name="use" type="select" label="Use applications with restricted license, only for non-commercial use?" help="The corresponding tools must be installed manually by the administrator of this Galaxy instance" >
+                <option value="false" selected="true">No</option>
+                <option value="true">Yes</option>
+            </param>
+            <when value="false" />
+            <when value="true">
+                <param name="applications_licensed" type="select" multiple="True" label="Applications to run" help="Select your programm.">
+                    <option value="Phobius" selected="true">Phobius: combined transmembrane topology and signal peptide predictor</option>
+                    <option value="SignalP_GRAM_NEGATIVE" selected="false">SignalP (gram-negative): signal peptide cleavage sites in amino acid sequences for gram-negative prokaryotes</option>
+                    <option value="SignalP_EUK" selected="true">SignalP (eukaryotes): signal peptide cleavage sites in amino acid sequences for eukaryotes</option>
+                    <option value="SignalP_GRAM_POSITIVE" selected="false">SignalP (Gram Positive Bacteria): signal peptide cleavage sites in amino acid sequences for gram-positive prokaryotes</option>
+                    <option value="TMHMM" selected="true">TMHMM: Prediction of transmembrane helices in proteins</option>
+                </param>
+            </when>
+        </conditional>
+
+        <param argument="--pathways" truevalue="--pathways" falsevalue="" checked="True" type="boolean" label="Include pathway information"
+            help="Option that provides mappings from matches to pathway information, which is based on the matched manually curated InterPro entries."/>
+        <param argument="--goterms" truevalue="--goterms" falsevalue="" checked="True" type="boolean" label="Include Gene Ontology (GO) mappings"
+            help="Look up of corresponding Gene Ontology annotation. Implies -iprlookup option."/>
+        <param argument="--iprlookup" truevalue="--iprlookup" falsevalue="" checked="False" type="boolean"
+            label="Provide additional mappings" help="Provide mappings from matched member database signatures to the InterPro entries that they are integrated into"/>
+
+        <param name="oformat" type="select" multiple="true" label="Output format" help="Please select a output format (JSON output can be visualised on https://www.ebi.ac.uk/interpro/result/InterProScan/).">
+            <option value="TSV" selected="true">Tab-separated values format (TSV)</option>
+            <option value="GFF3">GFF3</option>
+            <option value="XML">XML</option>
+            <option value="JSON">JSON</option>
+        </param>
+    </inputs>
+
+    <outputs>
+        <data format="tabular" name="outfile_tsv" from_work_dir="output.tsv" label="InterProScan on ${on_string} (tsv)">
+            <filter>oformat and 'TSV' in outputs</filter>
+        </data>
+        <data format="xml" name="outfile_xml" from_work_dir="output.xml" label="InterProScan on ${on_string} (xml)">
+            <filter>oformat and 'XML' in outputs</filter>
+        </data>
+        <data format="gff3" name="outfile_gff3" from_work_dir="output.gff3" label="InterProScan on ${on_string} (gff3)">
+            <filter>oformat and 'GFF3' in outputs</filter>
+        </data>
+        <data format="json" name="outfile_json" from_work_dir="output.json" label="InterProScan on ${on_string} (json)">
+            <filter>oformat and 'JSON' in outputs</filter>
+        </data>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="input" value="prots.fa" />
+            <param name="seqtype" value="p" />
+            <param name="database" value="5.52-86.0" />
+            <param name="applications" value="MobiDBLite" />
+            <param name="oformat" value="TSV" />
+            <output name="outfile_tsv">
+                <assert_contents>
+                    <has_text text="FUN_000011-T1" />
+                    <has_text text="ea9924e11f7decc417e8d9ed8b9c682e" />
+                    <has_text text="FUN_000012-T1" />
+                    <has_text text="01beedc2fbf8012cba37f0c0d39aa071" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="input" value="prots.fa" />
+            <param name="seqtype" value="p" />
+            <param name="database" value="5.52-86.0" />
+            <param name="applications" value="MobiDBLite" />
+            <param name="oformat" value="TSV,GFF3,XML,JSON" />
+            <output name="outfile_tsv">
+                <assert_contents>
+                    <has_text text="FUN_000011-T1" />
+                    <has_text text="ea9924e11f7decc417e8d9ed8b9c682e" />
+                    <has_text text="FUN_000012-T1" />
+                    <has_text text="01beedc2fbf8012cba37f0c0d39aa071" />
+                </assert_contents>
+            </output>
+            <output name="outfile_xml">
+                <assert_contents>
+                    <has_text text="mobidblite-location" />
+                    <has_text text="Polyampholyte" />
+                    <has_text text="consensus disorder prediction" />
+                    <has_text text="FUN_000011-T1 FUN_000011" />
+                </assert_contents>
+            </output>
+            <output name="outfile_gff3">
+                <assert_contents>
+                    <has_text text="protein_match" />
+                    <has_text text="ID=FUN_000011-T1;md5=" />
+                    <has_text text="MobiDBLite" />
+                </assert_contents>
+            </output>
+            <output name="outfile_json">
+                <assert_contents>
+                    <has_text text="signatureLibraryRelease" />
+                    <has_text text="disorder_prediction" />
+                    <has_text text="Polyampholyte" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="input" value="transcripts.fa" />
+            <param name="seqtype" value="n" />
+            <param name="database" value="5.52-86.0" />
+            <param name="applications" value="MobiDBLite" />
+            <param name="oformat" value="TSV,GFF3,XML,JSON" />
+            <output name="outfile_tsv">
+                <assert_contents>
+                    <has_text text="FUN_000018-T1_orf336" />
+                    <has_text text="0b28fe115d4cc09260b038b19fb0b21d" />
+                    <has_text text="FUN_000012-T1_orf133" />
+                    <has_text text="01beedc2fbf8012cba37f0c0d39aa071" />
+                </assert_contents>
+            </output>
+            <output name="outfile_xml">
+                <assert_contents>
+                    <has_text text="mobidblite-location" />
+                    <has_text text="Polyampholyte" />
+                    <has_text text="consensus disorder prediction" />
+                    <has_text text="orf355" />
+                </assert_contents>
+            </output>
+            <output name="outfile_gff3">
+                <assert_contents>
+                    <has_text text="protein_match" />
+                    <has_text text="ID=FUN_000012-T1;" />
+                    <has_text text="MobiDBLite" />
+                </assert_contents>
+            </output>
+            <output name="outfile_json">
+                <assert_contents>
+                    <has_text text="signatureLibraryRelease" />
+                    <has_text text="disorder_prediction" />
+                    <has_text text="Polyampholyte" />
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_failure="true">
+            <param name="input" value="prots.fa" />
+            <param name="seqtype" value="p" />
+            <param name="database" value="5.52-86.0" />
+            <param name="applications" value="MobiDBLite" />
+            <conditional name="licensed">
+                <param name="use" value="true" />
+                <param name="applications_licensed" value="Phobius,TMHMM" />
+            </conditional>
+            <param name="oformat" value="TSV" />
+            <assert_stdout>
+                <!-- expected to be "deactivated" as they are not installed by default -->
+                <has_text text="Analysis Phobius does not exist or is deactivated" />
+                <has_text text="Analysis TMHMM does not exist or is deactivated" />
+            </assert_stdout>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+
+**What it does**
+
+Interproscan is a batch tool to query the InterPro database. It provides annotations based on multiple searches of profile and other functional databases.
+
+Phobius (licensed software), SignalP, SMART (licensed components) and TMHMM use
+licensed code and data provided by third parties. If you wish to run these
+analyses it will be necessary for you to obtain a licence from the vendor and
+configure the Galaxy server InterProScan installation to use them.
+
+**Input**
+
+Required is a FASTA file containing protein or nucleotide sequences.
+
+**Output**
+
+In this version of InterProScan, you can retrieve output in any of the following five formats:
+
+ * TSV: tab-separated values format
+ * XML: XML format
+ * GFF: The GFF 3.0 format
+ * JSON: A JSON representation of the protein matches that can be visualised on https://www.ebi.ac.uk/interpro/result/InterProScan/
+
+**Example Output**
+
+
+::
+
+  P51587  14086411a2cdf1c4cba63020e1622579        3418    Pfam    PF09103 BRCA2, oligonucleotide/oligosaccharide-binding, domain 1        2670    2799    7.9E-43 T       15-03-2013
+  P51587  14086411a2cdf1c4cba63020e1622579        3418    ProSiteProfiles PS50138 BRCA2 repeat profile.   1002    1036    0.0     T       18-03-2013      IPR002093       BRCA2 repeat    GO:0005515|GO:0006302
+  P51587  14086411a2cdf1c4cba63020e1622579        3418    Gene3D  G3DSA:2.40.50.140               2966    3051    3.1E-52 T       15-03-2013
+  ...
+
+
+The TSV format presents the match data in columns as follows:
+
+  - Protein Accession (e.g. P51587)
+  - Sequence MD5 digest (e.g. 14086411a2cdf1c4cba63020e1622579)
+  - Sequence Length (e.g. 3418)
+  - Analysis (e.g. Pfam / PRINTS / Gene3D)
+  - Signature Accession (e.g. PF09103 / G3DSA:2.40.50.140)
+  - Signature Description (e.g. BRCA2 repeat profile)
+  - Start location
+  - Stop location
+  - Score - is the e-value of the match reported by member database method (e.g. 3.1E-52)
+  - Status - is the status of the match (T: true)
+  - Date - is the date of the run
+  - (InterProScan annotations - accession (e.g. IPR002093) - optional column; only displayed if -iprscan option is switched on)
+  - (InterProScan annotations - description (e.g. BRCA2 repeat) - optional column; only displayed if -iprscan option is switched on)
+  - (GO annotations (e.g. GO:0005515) - optional column; only displayed if --goterms option is switched on)
+  - (Pathways annotations (e.g. REACT_71) - optional column; only displayed if --pathways option is switched on)
+
+
+**Extensible Markup Language (XML)**
+
+XML representation of the matches - this is the richest form of the data. The XML Schema Definition (XSD) is available [http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5 here].
+
+**Generic Feature Format Version 3 (GFF3)**
+
+The GFF3 format is a flat tab-delimited file, which is much richer then the TSV output format. It allows you to trace back from matches to predicted proteins and to nucleic acid sequences. It also contains a FASTA format representation of the predicted protein sequences and their matches. You will find a documentation of all the columns and attributes used on [https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md].
+
+**Example Output**
+
+
+::
+
+  ##gff-version 3
+  ##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269
+  ##sequence-region AACH01000027 1 1347
+  ##seqid|source|type|start|end|score|strand|phase|attributes
+  AACH01000027    provided_by_user        nucleic_acid    1       1347    .       +       .       Name=AACH01000027;md5=b2a7416cb92565c004becb7510f46840;ID=AACH01000027
+  AACH01000027    getorf  ORF     1       1347    .       +       .       Name=AACH01000027.2_21;Target=pep_AACH01000027_1_1347 1 449;md5=b2a7416cb92565c004becb7510f46840;ID=orf_AACH01000027_1_1347
+  AACH01000027    getorf  polypeptide     1       449     .       +       .       md5=fd0743a673ac69fb6e5c67a48f264dd5;ID=pep_AACH01000027_1_1347
+  AACH01000027    Pfam    protein_match   84      314     1.2E-45 +       .       Name=PF00696;signature_desc=Amino acid kinase family;Target=null 84 314;status=T;ID=match$8_84_314;Ontology_term="GO:0008652";date=15-04-2013;Dbxref="InterPro:IPR001048","Reactome:REACT_13"
+  ##sequence-region 2
+  ...
+  >pep_AACH01000027_1_1347
+  LVLLAAFDCIDDTKLVKQIIISEIINSLPNIVNDKYGRKVLLYLLSPRDPAHTVREIIEV
+  LQKGDGNAHSKKDTEIRRREMKYKRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEA
+  GHELILVSSGAIAAGFGALGFKKRPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQI
+  LLTQDDFVDKRRYKNAHQALSVLLNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQ
+  ADLLVFLTDVDGLYTGNPNSDPRAKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAA
+  TIATESGVPVYICSSLKSDSMIEAAEETEDGSYFVAQEKGLRTQKQWLAFYAQSQGSIWV
+  DKGAAEALSQYGKSLLLSGIVEAEGVFSYGDIVTVFDKESGKSLGKGRVQFGASALEDML
+  RSQKAKGVLIYRDDWISITPEIQLLFTEF
+  ...
+  >match$8_84_314
+  KRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEAGHELILVSSGAIAAGFGALGFKK
+  RPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQILLTQDDFVDKRRYKNAHQALSVL
+  LNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQADLLVFLTDVDGLYTGNPNSDPR
+  AKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAATIATESGVPVYICS
+
+]]></help>
+
+    <expand macro="citations" />
+</tool>