Mercurial > repos > iuc > interproscan
diff interproscan.xml @ 0:bfeae84e23ee draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/interproscan commit 2f5d27a375fcc2e8d77914b3d9e402a9e2df2d97"
| author | iuc |
|---|---|
| date | Mon, 15 Nov 2021 17:20:51 +0000 |
| parents | |
| children |
line wrap: on
line diff
<tool id="interproscan" name="InterProScan" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.09">
    <description>functional annotation</description>
    <!-- The @TOOL_VERSION@ / @VERSION_SUFFIX@ tokens and the "citations" macro come from macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <xrefs>
        <xref type="bio.tools">interproscan_4</xref>
    </xrefs>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">interproscan</requirement>
    </requirements>
    <version_command>interproscan.sh --version</version_command>
    <command><![CDATA[
## Adapt properties file to use data from data table:
## copy the interproscan.properties shipped next to interproscan.sh into
## \$HOME/.interproscan-5, rewriting its data.directory entry to the path
## column of the selected data table row.
mkdir -p \$HOME/.interproscan-5
&&
sed 's|^\(data.directory=\).*$|\1${database.fields.path}|' \$(dirname \$(readlink -f \$(command -v interproscan.sh)))/interproscan.properties > \$HOME/.interproscan-5/interproscan.properties
&&

## Now run interproscan
interproscan.sh

## disables the precalculated lookup service, all calculation will be run locally
-dp
--input '$input'
--seqtype $seqtype
-f ${','.join($oformat)}

## Licensed applications are appended to the regular ones only when the user
## opted in and selected at least one of them.
#if $licensed.use == 'true' and $licensed.applications_licensed:
    --applications ${','.join($applications)},${','.join($licensed.applications_licensed)}
#else:
    --applications ${','.join($applications)}
#end if

## TEMP is not guaranteed to be set in the job environment; fall back to the
## job temporary directory so that --tempdir never receives an empty value.
--tempdir "\${TEMP:-\$_GALAXY_JOB_TMP_DIR}"

## Each boolean below expands to its flag when checked and to nothing otherwise.
$pathways
$goterms
$iprlookup

--cpu \${GALAXY_SLOTS:-4}

## Results are written to output.<format>; the outputs section collects them from there.
--output-file-base 'output'
    ]]></command>
    <inputs>
        <param argument="--input" type="data" format="fasta" label="FASTA file (protein or nucleotide sequences)"/>

        <param argument="--seqtype" type="select" label="Type of the input sequences">
            <option value="p" selected="true">Protein</option>
            <option value="n">DNA / RNA</option>
        </param>

        <!-- Only data table rows whose column 2 equals the packaged tool version are offered. -->
        <param name="database" label="InterProScan database" type="select">
            <options from_data_table="interproscan">
                <column name="value" index="0" />
                <column name="name" index="1" />
                <column name="path" index="3" />
                <filter type="sort_by" column="0" />
                <filter type="static_value" column="2" value="@TOOL_VERSION@" />
            </options>
        </param>

        <param name="applications" type="select" multiple="true" label="Applications to run" help="Select your program">
            <option value="TIGRFAM" selected="true">TIGRFAM: protein families based on hidden Markov models (HMMs)</option>
            <option value="SFLD" selected="true">SFLD: a database of protein families based on hidden Markov models (HMMs)</option>
            <option value="SUPERFAMILY" selected="true">SUPERFAMILY: database of structural and functional annotation for all proteins and genomes</option>
            <option value="PANTHER" selected="true">PANTHER: Protein ANalysis THrough Evolutionary Relationships</option>
            <option value="Gene3D" selected="true">Gene3D: Structural assignment for whole genes and genomes using the CATH domain structure database</option>
            <option value="Hamap" selected="true">HAMAP: High-quality Automated Annotation of Microbial Proteomes</option>
            <option value="PrositeProfiles" selected="true">PROSITE Profiles: protein domains, families and functional sites as well as associated profiles to identify them</option>
            <option value="Coils" selected="true">Coils: Prediction of Coiled Coil Regions in Proteins</option>
            <option value="SMART" selected="true">SMART: identification and analysis of domain architectures based on Hidden Markov Models or HMMs</option>
            <option value="CDD" selected="true">CDD: protein domains and families based on well-annotated multiple sequence alignment models</option>
            <option value="PRINTS" selected="true">PRINTS: group of conserved motifs (fingerprints) used to characterise a protein family</option>
            <option value="PIRSR" selected="true">PIRSR: protein families based on hidden Markov models (HMMs) and Site Rules</option>
            <option value="PrositePatterns" selected="true">PROSITE Pattern: protein domains, families and functional sites as well as associated patterns to identify them</option>
            <option value="Pfam" selected="true">Pfam: protein families, each represented by multiple sequence alignments and hidden Markov models</option>
            <option value="MobiDBLite" selected="true">MobiDBLite: Prediction of intrinsically disordered regions in proteins</option>
            <option value="PIRSF" selected="true">PIRSF: non-overlapping clustering of UniProtKB sequences into a hierarchical order (evolutionary relationships)</option>
        </param>

        <!-- Opt-in block for the analyses that rely on third-party licensed software. -->
        <conditional name="licensed">
            <param name="use" type="select" label="Use applications with restricted license, only for non-commercial use?" help="The corresponding tools must be installed manually by the administrator of this Galaxy instance">
                <option value="false" selected="true">No</option>
                <option value="true">Yes</option>
            </param>
            <when value="false" />
            <when value="true">
                <param name="applications_licensed" type="select" multiple="true" label="Applications to run" help="Select your program">
                    <option value="Phobius" selected="true">Phobius: combined transmembrane topology and signal peptide predictor</option>
                    <option value="SignalP_GRAM_NEGATIVE" selected="false">SignalP (gram-negative): signal peptide cleavage sites in amino acid sequences for gram-negative prokaryotes</option>
                    <option value="SignalP_EUK" selected="true">SignalP (eukaryotes): signal peptide cleavage sites in amino acid sequences for eukaryotes</option>
                    <option value="SignalP_GRAM_POSITIVE" selected="false">SignalP (Gram Positive Bacteria): signal peptide cleavage sites in amino acid sequences for gram-positive prokaryotes</option>
                    <option value="TMHMM" selected="true">TMHMM: Prediction of transmembrane helices in proteins</option>
                </param>
            </when>
        </conditional>

        <param argument="--pathways" truevalue="--pathways" falsevalue="" checked="true" type="boolean" label="Include pathway information"
            help="Option that provides mappings from matches to pathway information, which is based on the matched manually curated InterPro entries."/>
        <param argument="--goterms" truevalue="--goterms" falsevalue="" checked="true" type="boolean" label="Include Gene Ontology (GO) mappings"
            help="Look up of corresponding Gene Ontology annotation. Implies --iprlookup option."/>
        <param argument="--iprlookup" truevalue="--iprlookup" falsevalue="" checked="false" type="boolean"
            label="Provide additional mappings" help="Provide mappings from matched member database signatures to the InterPro entries that they are integrated into"/>

        <param name="oformat" type="select" multiple="true" label="Output format" help="Please select an output format (JSON output can be visualised on https://www.ebi.ac.uk/interpro/result/InterProScan/).">
            <option value="TSV" selected="true">Tab-separated values format (TSV)</option>
            <option value="GFF3">GFF3</option>
            <option value="XML">XML</option>
            <option value="JSON">JSON</option>
        </param>
    </inputs>

    <!-- One dataset per output format; each filter keeps its dataset only when that format is selected in "oformat". -->
    <outputs>
        <data format="tabular" name="outfile_tsv" from_work_dir="output.tsv" label="InterProScan on ${on_string} (tsv)">
            <filter>oformat and 'TSV' in oformat</filter>
        </data>
        <data format="xml" name="outfile_xml" from_work_dir="output.xml" label="InterProScan on ${on_string} (xml)">
            <filter>oformat and 'XML' in oformat</filter>
        </data>
        <data format="gff3" name="outfile_gff3" from_work_dir="output.gff3" label="InterProScan on ${on_string} (gff3)">
            <filter>oformat and 'GFF3' in oformat</filter>
        </data>
        <data format="json" name="outfile_json" from_work_dir="output.json" label="InterProScan on ${on_string} (json)">
            <filter>oformat and 'JSON' in oformat</filter>
        </data>
    </outputs>

    <tests>
        <!-- Protein input, TSV output only -->
        <test>
            <param name="input" value="prots.fa" />
            <param name="seqtype" value="p" />
            <param name="database" value="5.52-86.0" />
            <param name="applications" value="MobiDBLite" />
            <param name="oformat" value="TSV" />
            <output name="outfile_tsv">
                <assert_contents>
                    <has_text text="FUN_000011-T1" />
                    <has_text text="ea9924e11f7decc417e8d9ed8b9c682e" />
                    <has_text text="FUN_000012-T1" />
                    <has_text text="01beedc2fbf8012cba37f0c0d39aa071" />
                </assert_contents>
            </output>
        </test>
        <!-- Protein input, all four output formats -->
        <test>
            <param name="input" value="prots.fa" />
            <param name="seqtype" value="p" />
            <param name="database" value="5.52-86.0" />
            <param name="applications" value="MobiDBLite" />
            <param name="oformat" value="TSV,GFF3,XML,JSON" />
            <output name="outfile_tsv">
                <assert_contents>
                    <has_text text="FUN_000011-T1" />
                    <has_text text="ea9924e11f7decc417e8d9ed8b9c682e" />
                    <has_text text="FUN_000012-T1" />
                    <has_text text="01beedc2fbf8012cba37f0c0d39aa071" />
                </assert_contents>
            </output>
            <output name="outfile_xml">
                <assert_contents>
                    <has_text text="mobidblite-location" />
                    <has_text text="Polyampholyte" />
                    <has_text text="consensus disorder prediction" />
                    <has_text text="FUN_000011-T1 FUN_000011" />
                </assert_contents>
            </output>
            <output name="outfile_gff3">
                <assert_contents>
                    <has_text text="protein_match" />
                    <has_text text="ID=FUN_000011-T1;md5=" />
                    <has_text text="MobiDBLite" />
                </assert_contents>
            </output>
            <output name="outfile_json">
                <assert_contents>
                    <has_text text="signatureLibraryRelease" />
                    <has_text text="disorder_prediction" />
                    <has_text text="Polyampholyte" />
                </assert_contents>
            </output>
        </test>
        <!-- Nucleotide input, all four output formats -->
        <test>
            <param name="input" value="transcripts.fa" />
            <param name="seqtype" value="n" />
            <param name="database" value="5.52-86.0" />
            <param name="applications" value="MobiDBLite" />
            <param name="oformat" value="TSV,GFF3,XML,JSON" />
            <output name="outfile_tsv">
                <assert_contents>
                    <has_text text="FUN_000018-T1_orf336" />
                    <has_text text="0b28fe115d4cc09260b038b19fb0b21d" />
                    <has_text text="FUN_000012-T1_orf133" />
                    <has_text text="01beedc2fbf8012cba37f0c0d39aa071" />
                </assert_contents>
            </output>
            <output name="outfile_xml">
                <assert_contents>
                    <has_text text="mobidblite-location" />
                    <has_text text="Polyampholyte" />
                    <has_text text="consensus disorder prediction" />
                    <has_text text="orf355" />
                </assert_contents>
            </output>
            <output name="outfile_gff3">
                <assert_contents>
                    <has_text text="protein_match" />
                    <has_text text="ID=FUN_000012-T1;" />
                    <has_text text="MobiDBLite" />
                </assert_contents>
            </output>
            <output name="outfile_json">
                <assert_contents>
                    <has_text text="signatureLibraryRelease" />
                    <has_text text="disorder_prediction" />
                    <has_text text="Polyampholyte" />
                </assert_contents>
            </output>
        </test>
        <!-- Licensed applications requested: the run is expected to fail -->
        <test expect_failure="true">
            <param name="input" value="prots.fa" />
            <param name="seqtype" value="p" />
            <param name="database" value="5.52-86.0" />
            <param name="applications" value="MobiDBLite" />
            <conditional name="licensed">
                <param name="use" value="true" />
                <param name="applications_licensed" value="Phobius,TMHMM" />
            </conditional>
            <param name="oformat" value="TSV" />
            <assert_stdout>
                <!-- expected to be "deactivated" as they are not installed by default -->
                <has_text text="Analysis Phobius does not exist or is deactivated" />
                <has_text text="Analysis TMHMM does not exist or is deactivated" />
            </assert_stdout>
        </test>
    </tests>

    <help><![CDATA[

**What it does**

Interproscan is a batch tool to query the InterPro database. It provides annotations based on multiple searches of profile and other functional databases.

Phobius (licensed software), SignalP, SMART (licensed components) and TMHMM use
licensed code and data provided by third parties. If you wish to run these
analyses it will be necessary for you to obtain a licence from the vendor and
configure the Galaxy server InterProScan installation to use them.

**Input**

Required is a FASTA file containing protein or nucleotide sequences.

**Output**

In this version of InterProScan, you can retrieve output in any of the following four formats:

 * TSV: tab-separated values format
 * XML: XML format
 * GFF3: The GFF 3.0 format
 * JSON: A JSON representation of the protein matches that can be visualised on https://www.ebi.ac.uk/interpro/result/InterProScan/

**Example Output**


::

    P51587 14086411a2cdf1c4cba63020e1622579 3418 Pfam PF09103 BRCA2, oligonucleotide/oligosaccharide-binding, domain 1 2670 2799 7.9E-43 T 15-03-2013
    P51587 14086411a2cdf1c4cba63020e1622579 3418 ProSiteProfiles PS50138 BRCA2 repeat profile. 1002 1036 0.0 T 18-03-2013 IPR002093 BRCA2 repeat GO:0005515|GO:0006302
    P51587 14086411a2cdf1c4cba63020e1622579 3418 Gene3D G3DSA:2.40.50.140 2966 3051 3.1E-52 T 15-03-2013
    ...


The TSV format presents the match data in columns as follows:

 - Protein Accession (e.g. P51587)
 - Sequence MD5 digest (e.g. 14086411a2cdf1c4cba63020e1622579)
 - Sequence Length (e.g. 3418)
 - Analysis (e.g. Pfam / PRINTS / Gene3D)
 - Signature Accession (e.g. PF09103 / G3DSA:2.40.50.140)
 - Signature Description (e.g. BRCA2 repeat profile)
 - Start location
 - Stop location
 - Score - is the e-value of the match reported by member database method (e.g. 3.1E-52)
 - Status - is the status of the match (T: true)
 - Date - is the date of the run
 - (InterProScan annotations - accession (e.g. IPR002093) - optional column; only displayed if --iprlookup option is switched on)
 - (InterProScan annotations - description (e.g. BRCA2 repeat) - optional column; only displayed if --iprlookup option is switched on)
 - (GO annotations (e.g. GO:0005515) - optional column; only displayed if --goterms option is switched on)
 - (Pathways annotations (e.g. REACT_71) - optional column; only displayed if --pathways option is switched on)


**Extensible Markup Language (XML)**

XML representation of the matches - this is the richest form of the data. The XML Schema Definition (XSD) is available `here <http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5>`_.

**Generic Feature Format Version 3 (GFF3)**

The GFF3 format is a flat tab-delimited file, which is much richer than the TSV output format. It allows you to trace back from matches to predicted proteins and to nucleic acid sequences. It also contains a FASTA format representation of the predicted protein sequences and their matches. You will find a documentation of all the columns and attributes used on https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md.

**Example Output**


::

    ##gff-version 3
    ##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269
    ##sequence-region AACH01000027 1 1347
    ##seqid|source|type|start|end|score|strand|phase|attributes
    AACH01000027 provided_by_user nucleic_acid 1 1347 . + . Name=AACH01000027;md5=b2a7416cb92565c004becb7510f46840;ID=AACH01000027
    AACH01000027 getorf ORF 1 1347 . + . Name=AACH01000027.2_21;Target=pep_AACH01000027_1_1347 1 449;md5=b2a7416cb92565c004becb7510f46840;ID=orf_AACH01000027_1_1347
    AACH01000027 getorf polypeptide 1 449 . + . md5=fd0743a673ac69fb6e5c67a48f264dd5;ID=pep_AACH01000027_1_1347
    AACH01000027 Pfam protein_match 84 314 1.2E-45 + . Name=PF00696;signature_desc=Amino acid kinase family;Target=null 84 314;status=T;ID=match$8_84_314;Ontology_term="GO:0008652";date=15-04-2013;Dbxref="InterPro:IPR001048","Reactome:REACT_13"
    ##sequence-region 2
    ...
    >pep_AACH01000027_1_1347
    LVLLAAFDCIDDTKLVKQIIISEIINSLPNIVNDKYGRKVLLYLLSPRDPAHTVREIIEV
    LQKGDGNAHSKKDTEIRRREMKYKRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEA
    GHELILVSSGAIAAGFGALGFKKRPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQI
    LLTQDDFVDKRRYKNAHQALSVLLNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQ
    ADLLVFLTDVDGLYTGNPNSDPRAKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAA
    TIATESGVPVYICSSLKSDSMIEAAEETEDGSYFVAQEKGLRTQKQWLAFYAQSQGSIWV
    DKGAAEALSQYGKSLLLSGIVEAEGVFSYGDIVTVFDKESGKSLGKGRVQFGASALEDML
    RSQKAKGVLIYRDDWISITPEIQLLFTEF
    ...
    >match$8_84_314
    KRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEAGHELILVSSGAIAAGFGALGFKK
    RPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQILLTQDDFVDKRRYKNAHQALSVL
    LNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQADLLVFLTDVDGLYTGNPNSDPR
    AKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAATIATESGVPVYICS

]]></help>

    <expand macro="citations" />
</tool>
