Mercurial > repos > iuc > proteinortho

--- a/proteinortho.xml	Fri Jun 16 20:52:09 2023 +0000
+++ b/proteinortho.xml	Fri Dec 13 10:18:38 2024 +0000
@@ -2,24 +2,24 @@
     <description>detects orthologous proteins/genes within different species</description>
     <macros>
         <import>proteinortho_macros.xml</import>
-        <xml name="test_output_proteinortho" tokens="nlines">
+        <xml name="test_output_proteinortho" tokens="nlines" token_nlines_delta="0">
             <output name="proteinortho">
                 <metadata name="column_names" value="species,genes,alg.-conn.,L.fasta,C.fasta,E.fasta,M.fasta"/>
                 <assert_contents>
                     <has_n_columns n="7"/>
-                    <has_n_lines n="@NLINES@"/>
+                    <has_n_lines n="@NLINES@" delta="@NLINES_DELTA@"/>
                     <has_line_matching expression="# Species\tGenes\tAlg\.-Conn\.\t.*"/>
                     <has_line_matching expression="[0-9]+\t[0-9]+\t.*"/>
                     <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+.*"/>
                 </assert_contents>
             </output>
         </xml>
-        <xml name="test_output_blastgraph" tokens="nlines">
+        <xml name="test_output_blastgraph" tokens="nlines" token_nlines_delta="0">
             <output name="blastgraph">
                 <metadata name="column_names" value="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/>
                 <assert_contents>
                     <has_n_columns n="6" comment="#"/>
-                    <has_n_lines n="@NLINES@"/>
+                    <has_n_lines n="@NLINES@" delta="@NLINES_DELTA@"/>
                     <has_line_matching expression="# file_a\tfile_b"/>
                     <has_line_matching expression="# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba"/>
                     <has_line_matching expression="# (C|C2|E|L|M)\.fasta\t(C|C2|E|L|M)\.fasta"/>
@@ -41,6 +41,7 @@
             </output>
         </xml>
     </macros>
+    <expand macro="biotools"/>
     <expand macro="requirements"/>
     <expand macro="version_command"/>
     <command detect_errors="exit_code"><![CDATA[
@@ -96,11 +97,15 @@
                 #end for#
             #end if
             2> >(sed -E "s/.\[([0-9]{1,2}(;[0-9]{1,2})?)?[mGK]//g" 1>&2)
+        #if $more_options.selfblast:
+            &&
+            mv result.blast-graph_clean result.blast-graph
+        #end if
         #if $synteny.synteny_options == "specified":
             &&
             mv result.poff-graph result.proteinortho-graph &&
             mv result.poff.tsv result.proteinortho.tsv &&
-            mv result.poff.html result.proteinortho.html ;
+            mv result.poff.html result.proteinortho.html
         #end if
     ]]></command>
     <inputs>
@@ -110,6 +115,8 @@
             <option value="autoblast">auto detect NCBI-BLAST (protein and nucleotide sequences)</option>
             <option value="blastp">NCBI-BLASTP+ (protein sequences)</option>
             <option value="blastn">NCBI-BLASTN+ (nucleotide sequences)</option>
+            <option value="mmseqsp">MMseqs2 (aminoacid sequences)</option>
+            <option value="mmseqsn">MMseqs2 (nucleotide sequences)</option>
             <option value="lastp">Last (aminoacid sequences)</option>
             <option value="lastn">Last (nucleotide sequences)</option>
             <option value="blatp">BLAT (aminoacid sequences)</option>
@@ -121,7 +128,7 @@
             <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="Larger values results in more false positives (connections between proteins)."/>
             <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/>
             <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/>
-            <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs "/>
+            <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs (not compatible with synteny) "/>
             <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/>
             <param argument="--core" type="boolean" checked="false" truevalue="--core" falsevalue="" label="Stop clustering if a split would result in groups that do not span across all species of the inital connected component." help="Overrules the -conn threshold."/>
             <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is built using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For Uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format.">
@@ -132,7 +139,7 @@
             </param>
         </section>
         <conditional name="synteny">
-            <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015.">
+            <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015. (Not compatible with selfblast)">
                 <option value="no" selected="true">no</option>
                 <option value="specified">yes</option>
             </param>
@@ -172,16 +179,26 @@
         </data>
     </outputs>
     <tests>
-        <test expect_num_outputs="3"> <!-- test normal -->
+        <test expect_num_outputs="3"> <!-- test normal / default params -->
             <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
             <param name="p" value="diamond"/>
-            <expand macro="test_output_proteinortho" nlines="34"/>
-            <expand macro="test_output_blastgraph" nlines="157"/>
-            <expand macro="test_output_proteinorthograph" nlines="134"/>
+            <expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/>
+            <expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/>
+            <expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/>
             <assert_command>
                 <has_text text="--p=diamond"/>
             </assert_command>
         </test>
+        <test expect_num_outputs="3"> <!-- test normal mmseqs -->
+            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
+            <param name="p" value="mmseqsp"/>
+            <expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/>
+            <expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/>
+            <expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/>
+            <assert_command>
+                <has_text text="--p=mmseqsp"/>
+            </assert_command>
+        </test>
         <test expect_num_outputs="3"> <!-- various parameter -->
             <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
             <param name="p" value="diamond"/>
@@ -190,13 +207,12 @@
             <section name="more_options">
                 <param name="cov" value="42"/>
                 <param name="identity" value="42"/>
-                <param name="selfblast" value="true"/>
                 <param name="singles" value="true"/>
                 <param name="core" value="true"/>
             </section>
-            <expand macro="test_output_proteinortho" nlines="177"/>
-            <expand macro="test_output_blastgraph" nlines="2720"/>
-            <expand macro="test_output_proteinorthograph" nlines="384"/>
+            <expand macro="test_output_proteinortho" nlines="151" nlines_delta="50"/>
+            <expand macro="test_output_blastgraph" nlines="1403" nlines_delta="300"/>
+            <expand macro="test_output_proteinorthograph" nlines="239" nlines_delta="150"/>
             <assert_command>
                 <has_text text="--p=diamond"/>
             </assert_command>
@@ -208,8 +224,8 @@
             <conditional name="synteny">
                 <param name="synteny_options" value="specified"/>
             </conditional>
-            <expand macro="test_output_proteinortho" nlines="38"/>
-            <expand macro="test_output_blastgraph" nlines="157"/>
+            <expand macro="test_output_proteinortho" nlines="38" nlines_delta="20"/>
+            <expand macro="test_output_blastgraph" nlines="300" nlines_delta="150"/>
             <expand macro="test_output_proteinorthograph" nlines="119" nlines_delta="10" ncolumns="8" add_columns=",same_strand,simscore"/>
             <assert_command>
                 <has_text text="--p=diamond"/>
@@ -218,9 +234,9 @@
         <test expect_num_outputs="3"> <!-- blast -->
             <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
             <param name="p" value="blastp"/>
-            <expand macro="test_output_proteinortho" nlines="32"/>
-            <expand macro="test_output_blastgraph" nlines="158"/>
-            <expand macro="test_output_proteinorthograph" nlines="142"/>
+            <expand macro="test_output_proteinortho" nlines="33" nlines_delta="20"/>
+            <expand macro="test_output_blastgraph" nlines="155" nlines_delta="50"/>
+            <expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="50"/>
             <assert_command>
                 <has_text text="--p=blastp"/>
             </assert_command>
@@ -228,9 +244,9 @@
         <test expect_num_outputs="3"> <!-- auto blast -->
             <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
             <param name="p" value="autoblast"/>
-            <expand macro="test_output_proteinortho" nlines="32"/>
-            <expand macro="test_output_blastgraph" nlines="158"/>
-            <expand macro="test_output_proteinorthograph" nlines="142"/>
+            <expand macro="test_output_proteinortho" nlines="33" nlines_delta="20"/>
+            <expand macro="test_output_blastgraph" nlines="157" nlines_delta="50"/>
+            <expand macro="test_output_proteinorthograph" nlines="136" nlines_delta="50"/>
             <assert_command>
                 <has_text text="--p=autoblast"/>
             </assert_command>
@@ -238,21 +254,21 @@
         <test expect_num_outputs="3"> <!-- last -->
             <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
             <param name="p" value="lastp"/>
-            <expand macro="test_output_proteinortho" nlines="34"/>
-            <expand macro="test_output_blastgraph" nlines="148"/>
-            <expand macro="test_output_proteinorthograph" nlines="133"/>
+            <expand macro="test_output_proteinortho" nlines="34" nlines_delta="20"/>
+            <expand macro="test_output_blastgraph" nlines="148" nlines_delta="50"/>
+            <expand macro="test_output_proteinorthograph" nlines="134" nlines_delta="50"/>
             <assert_command>
                 <has_text text="--p=lastp"/>
             </assert_command>
         </test>
         <test expect_num_outputs="3"> <!-- blat -->
             <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
-            <param name="p" value="blastp"/>
-            <expand macro="test_output_proteinortho" nlines="32"/>
-            <expand macro="test_output_blastgraph" nlines="158"/>
-            <expand macro="test_output_proteinorthograph" nlines="142"/>
+            <param name="p" value="blatp"/>
+            <expand macro="test_output_proteinortho" nlines="33" nlines_delta="20"/>
+            <expand macro="test_output_blastgraph" nlines="56" nlines_delta="50"/>
+            <expand macro="test_output_proteinorthograph" nlines="56" nlines_delta="50"/>
             <assert_command>
-                <has_text text="--p=blastp"/>
+                <has_text text="--p=blatp"/>
             </assert_command>
         </test>
     </tests>
@@ -264,7 +280,7 @@

   | It compares similarities of given gene/protein sequences and clusters them to find significant groups.
   | The algorithm was designed to handle large-scale data and can be applied to hundreds of species at once.
-  | Details can be found in (doi:10.1186/1471-2105-12-124).
+  | Details can be found in (doi:10.1186/1471-2105-12-124 and doi:10.3389/fbinf.2023.1322477).
   | To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (details see doi:10.1371/journal.pone.0105015), is already built in Proteinortho.

 ----
@@ -281,8 +297,8 @@

 * **(ii) Cluster the RBH**

-      | Using two clustering algorithms, edges are removed that weakly connect two connected components to reduce false positive hits.
-      | The resulting connected components are outputted in orthology-groups / -pairs
+      | A spectral clustering algorithm is used to remove weak connections, reducing false positives.
+      | The connected components from this process are output as orthology groups or pairs.

 ----

@@ -318,14 +334,14 @@

       | The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups.
       | Every line corresponds to an orthology group.
-      | The first 3 columns characterize the general properties of that group: number of proteins, species, and algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself in general.
+      | The first 3 columns characterize the general properties of that group: number of proteins, species, and algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself.
       | Then a column for each species follows containing the proteins of these species.
       | If a species contributes with more than one protein to a group of orthologs, then they are ordered by descending connectivity.
       | The '*' represents that this species does not contribute to the group.

 .. csv-table::

-    Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,ebola.faa
+    Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,mouse.faa
     5,5,0.715,C_10,C_10;test,E_10,L_10,M_10
     4,6,0.115,*,C_12,E_315,L_313,M_313
     4,5,0.167,*,C_63,E_19,L_19,M_19
@@ -333,16 +349,43 @@

 ----

-* **orthology-pairs**
-
-      | The same as orthology-groups but every edge is printed one-by-one instead of the whole group. The output is formatted the same as the RBH graph:
+      | The first group is comprised of 5 proteins of 5 species: 'C_10' of ecoli.faa, 'C_10;test' of human.faa, 'E_10' of snail.faa, 'L_10' of wale.faa, and 'M_10' of mouse.faa.
+      | The alg.-conn. (algebraic connectivity) of 0.715 indicates how well this group is connected: the higher the value, the more edges connect these 5 proteins (at most there can be 10 edges, and at least 4 are needed).
+      | The second group contains 6 proteins distributed over 4 species. The star indicates the species where no protein was found (in this case ecoli.faa).

 .. csv-table::

-    seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
+    seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
+    # ecoli.faa,human.faa
+    # 1.91e-112,357.5,1.825e-113,360
+    L_10,C_10;test,4.32e-151,447,4.30e-151,446
+    L_11,C_11,1.17e-68,209,3.00e-69,210
+    L_14,C_14,3.64e-139,422,1.19e-142,431
+    L_15,C_15,3.51e-100,303,2.12e-102,308
+    L_16,C_16,3.75e-49,157,7.06e-50,159
+    L_17,C_17,2.96e-195,578,5.50e-196,579

 ----

+* **orthology-pairs**
+
+      | Similar to orthology groups, but each edge is printed individually.
+      | The output is formatted the same as the RBH graph.
+      | For example, extracting all hits of the second group of the example orthology-group output ('4,6,0.115,*,C_12,E_315,L_313,M_313') using grep (-E, regular expression="(C_12|E_315|L_313|M_313).*(C_12|E_315|L_313|M_313)", input file=proteinortho-graph) would reveal all edges of this group:
+
+.. csv-table::
+
+    seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
+    M_313,C_12,1.18e-115,407,6.12e-116,407
+    C_12,E_315,4.50e-127,445,4.09e-127,445
+    L_313,M_313,0.00e+00,1368,0.00e+00,1368
+    L_313,C_12,3.76e-114,402,1.94e-114,402
+
+----
+
+    | Especially L_313 and M_313 are very similar, probably identical.
+    | The group contains 4 edges out of the 6 possible edges for a group of 4 proteins. The missing edges are M_313-E_315 as well as L_313-E_315. This means that E_315 is only connected to the other 3 proteins via C_12 and thus could be considered as a weak link in the group.
+
 **Proteinortho-Tools for downstream analysis**

 * `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use a orthology-groups file or a subset (e.g. filter by Species>10).
@@ -350,12 +393,11 @@

 More information can be found on github https://gitlab.com/paulklemm_PHD/proteinortho

-**Citations:**
-
-- Lechner, Marcus, et al. "Proteinortho: detection of (co-) orthologs in large-scale analysis." BMC bioinformatics 12.1 (2011): 1-9. (10.1186/1471-2105-12-124)
-- Lechner, Marcus, et al. "Orthology detection combining clustering and synteny for very large datasets." PLoS one 9.8 (2014): e105015. (10.1371/journal.pone.0105015)
-
 ]]>
     </help>
-    <expand macro="citations" /> <!--- TODO: citations are not working in usegalxy, therefore they are added manually at the above. -->
+    <citations>
+        <citation type="doi">10.3389/fbinf.2023.1322477</citation>
+        <citation type="doi">10.1186/1471-2105-12-124</citation>
+        <citation type="doi">10.1371/journal.pone.0105015</citation>
+    </citations>
 </tool>
--- a/proteinortho_macros.xml	Fri Jun 16 20:52:09 2023 +0000
+++ b/proteinortho_macros.xml	Fri Dec 13 10:18:38 2024 +0000
@@ -1,21 +1,21 @@
 <?xml version="1.0"?>
 <macros>
-   <token name="@TOOL_VERSION@">6.2.3</token>
-   <token name="@WRAPPER_VERSION@">1</token>
-   <token name="@PROFILE@">20.09</token>
-   <xml name="citations">
-        <citations>
-            <citation type="doi">10.1186/1471-2105-12-124</citation>
-            <citation type="doi">10.1371/journal.pone.0105015</citation>
-        </citations>
+    <token name="@TOOL_VERSION@">6.3.4</token>
+    <token name="@WRAPPER_VERSION@">0</token>
+    <token name="@PROFILE@">22.05</token>
+    <xml name="biotools">
+        <xrefs>
+            <xref type="bio.tools">proteinortho</xref>
+        </xrefs>
     </xml>
     <xml name="requirements">
         <requirements>
             <requirement type="package" version="@TOOL_VERSION@">proteinortho</requirement>
-            <requirement type="package" version="2.1.4">diamond</requirement>
-            <requirement type="package" version="2.13.0">blast</requirement>
-            <requirement type="package" version="377">ucsc-blat</requirement>
-            <requirement type="package" version="1422">last</requirement>
+            <requirement type="package" version="2.1.8">diamond</requirement>
+            <requirement type="package" version="2.15.0">blast</requirement>
+            <requirement type="package" version="445">ucsc-blat</requirement>
+            <requirement type="package" version="1519">last</requirement>
+            <requirement type="package" version="16.747c6">mmseqs2</requirement>
         </requirements>
     </xml>
     <xml name="version_command">