Repository 'repeatexplorer'
hg clone https://eddie.galaxyproject.org/repos/petrn/repeatexplorer

Changeset 8:3bc73f5dc785 (2019-12-20)
Previous changeset 7:c56807be3b72 (2019-12-20) Next changeset 9:511266aa9235 (2019-12-20)
Commit message:
Uploaded
modified:
tool_dependencies.xml
added:
0INFO.md
CHANGELOG.md
HOW_TO_CITE.html
Makefile
README.md
__init__.py
bin/align_parsing.pl
bin/cap3
bin/formatdb
bin/last_wrapper.py
bin/mgblast
bin/runOGDFlayout
bin/select_and_sort_contigs.pl
build_shed_tarball.sh
checkR.R
config.py
databases/0INFO.md
databases/classification_tree_metazoa_v0.rds
databases/classification_tree_metazoa_v2.rds
databases/classification_tree_metazoa_v3.rds
databases/classification_tree_viridiplantae_v3.0.rds
databases/classification_viridiplantae_tree.rds
databases/create_protein_database_viridiplantae_v3.0.R
databases/database_version.txt
databases/dna_database_masked.fasta
databases/lastal_params
databases/protein_database.fasta
databases/satellite_model.rds
databases/tRNA_database.fasta
environment.yml
fetch_databases.sh
get_version.sh
lib/__init__.py
lib/assembly_tools.py
lib/config.R
lib/create_annotation.R
lib/detect_LTR_insertion_sites.pl
lib/documentation.html
lib/documentation.org
lib/graphtools.py
lib/htmlheader.R
lib/parallel/.gitignore
lib/parallel/__init__.py
lib/parallel/parallel.py
lib/pylintrc
lib/r2py.py
lib/reporting.R
lib/seqtools.py
lib/style1.css
lib/tarean/.gitignore
lib/tarean/OGDF/runOGDFlayout
lib/tarean/OGDF/runOGDFlayout2015.5
lib/tarean/README.md
lib/tarean/htmlheader.R
lib/tarean/kmer_counting.py
lib/tarean/logo_methods.R
lib/tarean/methods.R
lib/tarean/mgblast2GL.R
lib/tarean/tarean.R
lib/tarean/tarean_batch_mode.R
lib/tarean_output_help.html
lib/tarean_output_help.org
lib/utils.R
lib/utils.py
licence/Artistic_License
licence/README.md
louvain/Makefile
louvain/community.cpp
louvain/community.h
louvain/graph.cpp
louvain/graph.h
louvain/graph_binary.cpp
louvain/graph_binary.h
louvain/main_community.cpp
louvain/main_convert.cpp
louvain/main_hierarchy.cpp
louvain/main_random.cpp
louvain/readme.txt
pylintrc
repex_full_clustering.xml
repex_tarean.xml
seqclust
stderr_filter.py
test_repex_pipeline.py
tmp/.dummy
version_info.txt
diff -r c56807be3b72 -r 3bc73f5dc785 0INFO.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/0INFO.md Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,25 @@
+# modules and classes in repex
+
+requires python 3.4
+
+## seqtools
+tools for sequence manipulation, reading and sampling
+### Sequence 
+ single sequence container
+### SequenceSet
+ reads a set of sequences from a fasta file or creates one from scratch
+ uses an sqlite database
+ sampling
+ partitioning into chunks
+
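+A hypothetical usage sketch (the constructor and method names below are assumed
+from the description above, not taken from the actual `lib/seqtools.py` API):
+
+    from lib.seqtools import SequenceSet
+    reads = SequenceSet(source="reads.fasta")   # assumed: sqlite-backed set built from fasta
+    sample = reads.sample(10000)                # assumed: draw a random sample of reads
+    chunks = reads.partition(50000)             # assumed: split the set into chunks
+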
+## dependencies:
+python module parse:
+see https://github.com/r1chardj0n3s/parse or https://pypi.python.org/pypi/parse
+install with `pip3 install parse`
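+
+For illustration, `parse` is the inverse of `format`; e.g. parsing a contig
+fasta header of the form produced by `align_parsing.pl`:
+
+    >>> from parse import parse
+    >>> parse(">{id} ({length}-{rd}-{gr})", ">CL1Contig1 (520-12.5-6500)")
+    <Result () {'id': 'CL1Contig1', 'length': '520', 'rd': '12.5', 'gr': '6500'}>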
+
+
+
+
+
+
+
diff -r c56807be3b72 -r 3bc73f5dc785 CHANGELOG.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/CHANGELOG.md Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,145 @@
+# Changelog
+
+
+## v0.3.5
+Nov 22 2019
+
+   - better reporting when filtering of abundant satellites is used
+   - a number of output file names changed - file names are now more informative. Changes include:
+   
+| old file name                                                            | new file name                                                        |
+|--------------------------------------------------------------------------|----------------------------------------------------------------------|
+| TR_consensus_rank_1_.fasta                                               | TAREAN_consensus_rank_1.fasta                                        |
+| TR_consensus_rank_2_.fasta                                               | TAREAN_consensus_rank_2.fasta                                        |
+| TR_consensus_rank_3_.fasta                                               | TAREAN_consensus_rank_3.fasta                                        |
+| TR_consensus_rank_4_.fasta                                               | TAREAN_consensus_rank_4.fasta                                        |
+| sequences/sequences.fasta                                                | reads/reads.fasta                                                    |
+| clustering/clusters/dir_CL0001/reads.fas.CL1.aln.info.minRD5_sort-GR     | clustering/clusters/dir_CL0001/contigs.info.minRD5_sort-GR.fasta     |
+| clustering/clusters/dir_CL0001/reads.fas.CL1.aln.info.minRD5_sort-length | clustering/clusters/dir_CL0001/contigs.info.minRD5_sort-length.fasta |
+| clustering/clusters/dir_CL0001/reads.fas.CL1.aln.info.minRD5             | clustering/clusters/dir_CL0001/contigs.info.minRD5_sort-RD.fasta     |
+| clustering/clusters/dir_CL0001/reads.fas.CL1.aln.profile                 | clustering/clusters/dir_CL0001/contigs.profile                       |
+| clustering/clusters/dir_CL0001/reads.fas.CL1.aln.info                    | clustering/clusters/dir_CL0001/contigs.info.fasta                    |
+| clustering/clusters/dir_CL0001/reads.fas.CL1.ace                         | clustering/clusters/dir_CL0001/contigs.ace                           |
+| clustering/clusters/dir_CL0001/reads.fas.CL1.contigs.qual                | clustering/clusters/dir_CL0001/contigs.qual                          |
+| clustering/clusters/dir_CL0001/reads.fas.CL1.aln                         | clustering/clusters/dir_CL0001/contigs.aln                           |
+| clustering/clusters/dir_CL0001/reads.fas.CL1.contigs                     | clustering/clusters/dir_CL0001/contigs.fasta                         |
+| clustering/clusters/dir_CL0001/reads.fas.CL1.info                        | clustering/clusters/dir_CL0001/assembly.info                         |
+| clustering/clusters/dir_CL0001/reads.fas.CL1.singlets                    | clustering/clusters/dir_CL0001/singlets.fasta                        |
+| clustering/clusters/dir_CL0001/reads.fas.CL1.contigs.links               | clustering/clusters/dir_CL0001/contigs.links                         |
+| clustering/clusters/dir_CL0001/reads_oriented.fas                        | clustering/clusters/dir_CL0001/reads_selection_oriented.fasta        |
+| clustering/clusters/dir_CL0001/CL1_directed_graph.RData.                 | clustering/clusters/dir_CL0001/graph_layout_directed.RData           |
+| clustering/clusters/dir_CL0001/CL1_tmb.png                               | clustering/clusters/dir_CL0001/graph_layout_tmb.png                  |
+| clustering/clusters/dir_CL0001/CL1.png                                   | clustering/clusters/dir_CL0001/graph_layout.png                      |
+| clustering/clusters/dir_CL0001/CL1.GL                                    | clustering/clusters/dir_CL0001/graph_layout.GL                       |
+| clustering/clusters/dir_CL0001/blast.csv                                 | clustering/clusters/dir_CL0001/hitsort_part.csv                      |
+
+## v0.3.4
+Oct 31 2019
+
+  - Classification of superclusters improved; information about LTR/PBS is now used for classification of Class_I elements
+  
+
+## v0.3.2
+Oct 9 2019
+
+  - Graphical reporting of comparative analysis added 
+
+## v0.3.1
+Jan 9 2019
+
+  - Improved detection of low complexity repeats and satellites with shorter monomer
+
+## v0.3.0
+Oct 25 2018
+
+ - For backward compatibility, it is possible to select the protein database version
+ - Databases of protein domains went public 
+
+## v0.2.10
+Oct 24 2018
+
+ - Protein database for Viridiplantae updated
+
+## v0.2.9
+Jan 3 2018
+
+ - read depth for contigs is calculated
+ - contigs are also sorted based on genome representation

+## v0.2.8
+Dec 18 2017
+
+ - by default, assembly is done on clusters with at least 5 reads
+ - preset options for Illumina, Illumina short and Oxford Nanopore
+ - option for analyzing Metazoa/Viridiplantae added
+ - protein databases can be obtained **using fetch_databases.sh** script (password required)
+
+## v0.2.7
+Dec 05 2017
+
+ - improved DNA database - protein domains were masked so they do not interfere with the protein domain database
+ - another bug fix in parallelization and assembly 
+ - alternative to blastx added - DIAMOND program can be used instead

+## v0.2.6
+Nov 14 2017
+
+- Improved classification of superclusters
+- tables in HTML reports improved
+- assembly is performed on low confidence satellite sequences
+- assembly optimized, bug fix in parallelization
+
+## v0.2.5
+Sep 1 2017
+
++ More options added to _galaxy interface_ of TAREAN and RepeatExplorer2:
+    - automatic satellite filtering
+    - cluster size threshold setting
+    - keep original sequence names option
++ Pipeline tests added
+
+## v0.2.4
+Aug 10 2017
+
+- Filtering of abundant satellite sequences
+- Improved (more sensitive) search of protein domains 
+- bug fix in parallelization
+- SHORT_ILLUMINA option added; this enables analysis of shorter reads (50 nt) - command line only
+- OXFORD_NANOPORE option added (experimental feature, command line only)
+
+## v0.2.3
+Jun 9 2017
+
+- some changes in standalone TAREAN
+
+## v0.2.2
+
+- improved visualization of comparative analysis 
+
+## v0.2.1
+
+- the log file is automatically copied to the output and is part of the archive (in Galaxy)
+
+## v0.2.0
+
++ Improved HTML output
++ Updated documentation
++ Full clustering analysis - RepeatExplorer2
++ New features:
+    - Cap3 assembly added
+    - Comparative analysis
+    - LTR detection from contigs
+    - Custom database option
+    - Bugfix in excessive CPU usage
+    - Detection of TE protein domains
+
+## v0.1.1
+
++ New features:
+    - Cluster merging
+    - Scan of clusters against DNA database (detection of rDNA, plastid, mitochondrial and contaminant sequences)
+
+## v0.1.0
+
+
+
diff -r c56807be3b72 -r 3bc73f5dc785 HOW_TO_CITE.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/HOW_TO_CITE.html Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,21 @@
+<hr>
+
+<h3> How to cite </h3>
+<p>
+ Novak, P., Neumann, P., Pech, J., Steinhaisl, J., Macas, J. (2013) -
+   <a href="http://bioinformatics.oxfordjournals.org/content/29/6/792">RepeatExplorer: a Galaxy-based web server for genome-wide characterization of eukaryotic repetitive elements from next generation sequence reads.</a> <i> Bioinformatics</i> <b>29</b>:792-793.
+</p>
+
+<p><i> Classification of repetitive elements using REXdb:</i></p>
+<p>Neumann, P., Novak, P., Hostakova, N., Macas, J. (2019) &#8211; <a href="https://mobilednajournal.biomedcentral.com/articles/10.1186/s13100-018-0144-1" target="_blank">Systematic survey of plant LTR-retrotransposons elucidates phylogenetic relationships of their polyprotein domains and provides a reference for element classification</a>. <em>Mobile DNA</em> <b>10</b>:1.</p>
+
+
+<p><i>The principle of repeat identification implemented in RepeatExplorer:</i></p>
+<p>
+   Novak, P., Neumann, P., Macas, J. (2010) - <a href="http://www.biomedcentral.com/1471-2105/11/378">Graph-based clustering and characterization of repetitive sequences in next-generation sequencing data.</a> <i>BMC Bioinformatics</i> <b>11</b>:378.
+</p>
+<p><i>Using TAREAN for satellite repeat detection and characterization:</i></p>
+<p>
+  Novak, P., Robledillo, L.A., Koblizkova, A., Vrbova, I., Neumann, P., Macas, J. (2017) -
+    <a href="https://doi.org/10.1093/nar/gkx257"> TAREAN: a computational tool for identification and characterization of satellite DNA from unassembled short reads.</a> <i>Nucleic Acids Research</i> <b>45</b>:e111.
+</p>
diff -r c56807be3b72 -r 3bc73f5dc785 Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Makefile Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,47 @@
+# Makefile for building RepeatExplorer2 binaries and sequence databases
+
+CC=g++
+CFLAGS= -ansi -O5 -Wall
+LDFLAGS= -ansi -lm -Wall
+EXEC=bin/louvain_community bin/louvain_convert bin/louvain_hierarchy
+OBJ1= louvain/graph_binary.o louvain/community.o
+OBJ2= louvain/graph.o
+
+DNA_DB=databases/dna_database_masked.fasta.nhr
+PROT_DB=re_databases/protein_database_viridiplantae_v3.0.fasta.phr
+TRNA_DB=databases/tRNA_database.fasta.nhr
+
+
+all: $(EXEC) $(DNA_DB) $(PROT_DB) $(TRNA_DB)
+
+bin/louvain_community : $(OBJ1) louvain/main_community.o
+ $(CC) -o $@ $^ $(LDFLAGS)
+
+bin/louvain_convert : $(OBJ2) louvain/main_convert.o
+ $(CC) -o $@ $^ $(LDFLAGS)
+
+bin/louvain_hierarchy : louvain/main_hierarchy.o
+ $(CC) -o $@ $^ $(LDFLAGS)
+
+$(DNA_DB) : databases/dna_database_masked.fasta
+ makeblastdb -in databases/dna_database_masked.fasta -out databases/dna_database_masked.fasta -dbtype nucl
+
+$(TRNA_DB) : databases/tRNA_database.fasta
+ makeblastdb -in databases/tRNA_database.fasta -out databases/tRNA_database.fasta -dbtype nucl
+
+$(PROT_DB) : config.py
+ ./fetch_databases.sh
+
+dna_database: $(DFASTA)  
+##########################################
+# Generic rules
+##########################################
+
+%.o: %.cpp %.h
+ $(CC) -o $@ -c $< $(CFLAGS)
+
+%.o: %.cpp
+ $(CC) -o $@ -c $< $(CFLAGS)
+
+clean:
+ rm -f *.o *~ $(EXEC)
diff -r c56807be3b72 -r 3bc73f5dc785 README.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,158 @@
+# RepeatExplorer2 with TAREAN (Tandem Repeat Analyzer) #
+-------------------------------------------------------------------------------
+New version of RepeatExplorer with TAndem REpeat ANalyzer 
+
+## Authors
+Petr Novak, Jiri Macas, Pavel Neumann
+Biology Centre CAS, Czech Republic
+
+## Change log
+
+[link](CHANGELOG.md)
+
+
+  
+## Installation ##
+To use RepeatExplorer without installation, we recommend using our freely
+available Galaxy server at
+[https://repeatexplorer-elixir.cerit-sc.cz](https://repeatexplorer-elixir.cerit-sc.cz).
+This server is provided within the ELIXIR-CZ project. Additionally, the Galaxy
+server also includes tools useful for data preprocessing, quality control
+and genome annotation.
+
+For the command line version and standalone installation, follow the instructions below:
+
+
+Download the sources using git:
+
+    git clone https://bitbucket.org/petrnovak/repex_tarean.git
+    cd repex_tarean
+    
+We recommend installing dependencies using conda (conda can be installed using [miniconda](https://docs.conda.io/en/latest/miniconda.html)). The required environment can be prepared using the command:
+
+    conda env create -f environment.yml
+
+Activate the prepared environment using:
+   
+    conda activate repeatexplorer
+
+In the `repex_tarean` directory, compile the sources and prepare the databases using:
+
+    make
+
+Support for 32-bit executables is required. If you are using an Ubuntu distribution, you can add 32-bit support by running:
+
+    sudo dpkg --add-architecture i386
+    sudo apt-get update
+    sudo apt-get install libc6:i386 libncurses5:i386 libstdc++6:i386
+   
+
+To verify the installation, you can run clustering on example data:
+
+    ./seqclust -p -v tmp/clustering_output test_data/LAS_paired_10k.fas
+    
+   
+## Protein databases
+
+RepeatExplorer2 utilizes the REXdb database of protein domains for repeat annotation and classification. The structure of the database is described at [http://repeatexplorer.org/](http://repeatexplorer.org/). The current version of the database for RepeatExplorer is fetched from the bitbucket repository [https://bitbucket.org/petrnovak/re_databases](https://bitbucket.org/petrnovak/re_databases) during compilation with the make command.
+
+
+## RepeatExplorer command line options
+
+    usage: seqclust [-h] [-p] [-A] [-t] [-l LOGFILE] [-m {float range 0.0..100.0}]
+                    [-M {0,float range 0.1..1}] [-o {float range 30.0..80.0}]
+                    [-c CPU] [-s SAMPLE] [-P PREFIX_LENGTH] [-v OUTPUT_DIR]
+                    [-r MAX_MEMORY] [-d DATABASE DATABASE] [-C] [-k]
+                    [-a {2,3,4,5}]
+                    [-tax {VIRIDIPLANTAE3.0,VIRIDIPLANTAE2.2,METAZOA2.0,METAZOA3.0}]
+                    [-opt {ILLUMINA,ILLUMINA_DUST_OFF,ILLUMINA_SHORT,OXFORD_NANOPORE}]
+                    [-D {BLASTX_W2,BLASTX_W3,DIAMOND}]
+                    sequences
+    
+    RepeatExplorer:
+        Repetitive sequence discovery and classification from NGS data
+    
+        
+    
+    positional arguments:
+      sequences
+    
+    optional arguments:
+      -h, --help            show this help message and exit
+      -p, --paired
+      -A, --automatic_filtering
+      -t, --tarean_mode     analyze only tandem repeats without additional classification
+      -l LOGFILE, --logfile LOGFILE
+                            log file; logging goes to stdout if not defined
+      -m {float range 0.0..100.0}, --mincl {float range 0.0..100.0}
+      -M {0,float range 0.1..1}, --merge_threshold {0,float range 0.1..1}
+                            threshold for mate-pair based cluster merging, default 0 - no merging
+      -o {float range 30.0..80.0}, --min_lcov {float range 30.0..80.0}
+                            minimal overlap coverage - relative to longer sequence length, default 55
+      -c CPU, --cpu CPU     number of cpu to use, if 0 use max available
+      -s SAMPLE, --sample SAMPLE
+                            use only a sample of the input data [by default the maximum number of reads is used]
+      -P PREFIX_LENGTH, --prefix_length PREFIX_LENGTH
+                            If you wish to keep part of the sequence names,
+                             enter the number of characters which should be 
+                            kept (1-10) instead of zero. Use this setting if
+                             you are doing comparative analysis
+      -v OUTPUT_DIR, --output_dir OUTPUT_DIR
+      -r MAX_MEMORY, --max_memory MAX_MEMORY
+                            Maximal amount of available RAM in kB; if not set,
+                            clustering tries to use all available RAM
+      -d DATABASE DATABASE, --database DATABASE DATABASE
+                            fasta file with database for annotation and name of database
+      -C, --cleanup         remove unnecessary large files from working directory
+      -k, --keep_names      keep sequence names, by default sequences are renamed
+      -a {2,3,4,5}, --assembly_min {2,3,4,5}
+                            Assembly is performed on individual clusters; by default,
+                            clusters with size less than 5 are not assembled. If you
+                            need assembly of smaller clusters, set *assembly_min*
+                            accordingly
+      -tax {VIRIDIPLANTAE3.0,VIRIDIPLANTAE2.2,METAZOA2.0,METAZOA3.0}, --taxon {VIRIDIPLANTAE3.0,VIRIDIPLANTAE2.2,METAZOA2.0,METAZOA3.0}
+                            Select taxon and protein database version
+      -opt {ILLUMINA,ILLUMINA_DUST_OFF,ILLUMINA_SHORT,OXFORD_NANOPORE}, --options {ILLUMINA,ILLUMINA_DUST_OFF,ILLUMINA_SHORT,OXFORD_NANOPORE}
+      -D {BLASTX_W2,BLASTX_W3,DIAMOND}, --domain_search {BLASTX_W2,BLASTX_W3,DIAMOND}
+                            Detection of protein domains can be performed by either blastx or
+                             diamond" program. options are:
+                              BLASTX_W2 - blastx with word size 2 (slowest, the most sesitive)
+                              BLASTX_W3 - blastx with word size 3 (default)
+                              DIAMOND   - diamond program (significantly faster, less sensitive)
+                            To use this option diamond program must be installed in your PATH
+    
+
+
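+For example, a comparative analysis run that keeps a 3-character sample code in
+the read names might look like this (the file name is illustrative):
+
+    ./seqclust -p -P 3 -v tmp/comparative_output combined_reads.fasta
+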
+## Galaxy toolshed
+TODO
+
+## Reproducibility
+To make clustering reproducible between runs with the
+same data, the environment variable PYTHONHASHSEED must be set:
+
+    export PYTHONHASHSEED=0
+    
+## Disk space requirements
+A large sqlite database for temporary data is created in the OS-specific temp directory, usually /tmp/.
+To use an alternative location, set the `TEMP` environment variable.
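+For example:
+
+    export TEMP=/path/to/directory/with/enough/space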
+
+## CPU and RAM requirements
+
+Resource requirements can be set either with the command line arguments `--max_memory` and `--cpu` or
+using the environment variables `TAREAN_MAX_MEM` and `TAREAN_CPU`. If not set, the pipeline uses all
+available resources.
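+
+For example (illustrative values; TAREAN_MAX_MEM is assumed to use the same kB
+units as `--max_memory`):
+
+    export TAREAN_CPU=8
+    export TAREAN_MAX_MEM=16000000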
+
+## How to cite
+
+If you use RepeatExplorer for general repeat characterization in your work, please cite:
+
+ - [Novak, P., Neumann, P., Pech, J., Steinhaisl, J., Macas, J. (2013) - RepeatExplorer: a Galaxy-based web server for genome-wide characterization of eukaryotic repetitive elements from next generation sequence reads. Bioinformatics 29:792-793](http://bioinformatics.oxfordjournals.org/content/29/6/792)
+
+or
+
+ - [Novak, P., Neumann, P., Macas, J. (2010) - Graph-based clustering and characterization of repetitive sequences in next-generation sequencing data. BMC Bioinformatics 11:378](http://www.biomedcentral.com/1471-2105/11/378)
+
+If you use TAREAN for satellite detection and characterization, please cite:
+
+ - [Novak, P., Robledillo, L.A., Koblizkova, A., Vrbova, I., Neumann, P., Macas, J. (2017) - TAREAN: a computational tool for identification and characterization of satellite DNA from unassembled short reads. Nucleic Acids Research 45:e111](https://doi.org/10.1093/nar/gkx257)

diff -r c56807be3b72 -r 3bc73f5dc785 bin/align_parsing.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/align_parsing.pl Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,111 @@
+#!/usr/bin/env perl
+
+# Parses align file, calculates read depth (RD) and genome representation (GR)
+# of individual contigs; outputs contig sequences in fasta format with this
+# information in the fasta header: 
+# >CLXContigY (length-average_RD-GR)
+#
+# RD profiles along the contig sequences can also be calculated and saved 
+# in a separate file.
+#
+# (renamed from align_parsing_2.pl, 2010-05-10)
+
+
+use Getopt::Std;
+use warnings;
+
+getopt('iop');
+if ($opt_i) {
+  $align_file = $opt_i;    # 
+} else {
+  die "Missing parameter: -i align_file\n";
+}
+if ($opt_o) {
+  $out_file = $opt_o;    # 
+} else {
+  $out_file = $align_file.".info";
+  print "Output file not set, using \"$out_file\"\n";
+}
+if ($opt_p) {
+  $profile_file = $opt_p;    # RD profiles will be recorded
+} else {
+  $profile_file = 0;     # will be used below to switch profile output off 
+  print "Parameter -p not set, RD profiles will not be saved\n";
+}
+
+open (ALIGN,$align_file) or die "Could not read from $align_file\n";
+open (OUT,">$out_file") or die "Could not write to $out_file\n";
+if ($profile_file) {
+  open (PROF,">$profile_file") or die "Could not write to $profile_file\n";
+}
+
+while ($radek = <ALIGN>) {
+  if ($radek =~/^DETAILED DISPLAY OF CONTIGS/) {    # start of alignment section
+      # print $radek;
+    $radek = <ALIGN>;
+    while ($radek =~/\*\*\* (CL\d+) {0,1}Contig (\d+) \*\*\*/) {   # parsing individual contigs
+      #  print $radek;
+      $contig_id = $1."Contig".$2;
+      @pokryti = ();       # array holding the read coverage for each position in the sequence
+      $seq = "";           # consensus sequence of the contig
+#      print "$contig_id\n";
+#      print $radek;
+      $radek = <ALIGN>;
+      while ($radek =~/\:    \.    \:/) {  # -> another alignment block follows
+        @blok = ();                        # lines of the current alignment block
+        do {
+          $radek = <ALIGN>;
+          push(@blok,$radek); 
+        } while (not $radek =~/^consensus/);
+        $cons_line = $radek;
+        $radek = <ALIGN>;
+        $radek = <ALIGN>;     # this last read serves in the condition of this and the enclosing while loop
+        if (not $radek) { $radek = " "; };      # avoids errors at the end of the file, where these lines may be missing
+      
+        pop(@blok); pop(@blok);            # remove the last two lines (blank line and consensus)
+      
+        for ($f=10;$f<=length($cons_line);$f++) {
+          $suma_pozice = 0;
+          if (substr($cons_line,$f,1) =~/([A,T,G,C,N])/) {
+            $seq .= $1;
+            foreach $cteni (@blok) {
+              if (substr($cteni,$f,1) =~/[A,T,G,C,N]/) {
+                $suma_pozice++;
+              }
+            }
+            push(@pokryti,$suma_pozice);
+          }
+        }
+      
+      }
+    
+      $delka_cons = @pokryti;
+      $soucet = 0;
+      $prumer = 0;
+      foreach $suma_pozice (@pokryti) { 
+        $soucet += $suma_pozice;
+      }
+      $prumer = sprintf("%0.1f",$soucet/$delka_cons);
+
+      print OUT ">$contig_id ($delka_cons-$prumer-$soucet)\n";
+      while ($seq_line = substr($seq,0,60,"")) {
+        print OUT "$seq_line\n";
+      }
+      if ($profile_file) {
+        print PROF ">$contig_id ($delka_cons-$prumer-$soucet)\n";
+        foreach $suma_pozice (@pokryti) {
+          print PROF "$suma_pozice ";
+        }
+        print PROF "\n";
+      }
+      
+    }
+  }
+}
+
+close ALIGN;
+close OUT;
+if ($profile_file) {
+  close PROF;
+}
+
diff -r c56807be3b72 -r 3bc73f5dc785 bin/cap3
Binary file bin/cap3 has changed
diff -r c56807be3b72 -r 3bc73f5dc785 bin/formatdb
Binary file bin/formatdb has changed
diff -r c56807be3b72 -r 3bc73f5dc785 bin/last_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/last_wrapper.py Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+'''
+wrapper for the last (lastal) program
+runs lastal with BlastTab+ output and returns mgblast-like formatted output
+last BlastTab+ output column order:
+1 query name
+2 reference name
+3 percent identity
+4 alignment length
+5 mismatches
+6 gap opens
+7 query start
+8 query end
+9 reference start
+10 reference end
+11 e-value
+12 bitscore
+13 length of query
+14 length of reference sequence
+(according to the lastal manual, more columns may be added in the future)
+
+Needed mgblast order:
+
+qseqid   1 -> 1
+qlen     2 -> 13
+qstart   3 -> 7
+qend     4 -> 8
+sseqid   5 -> 2
+slen     6 -> 14
+sstart   7 -> 9
+send     8 -> 10
+pident   9 -> 3
+bitscore 10-> 12
+evalue   11-> 11
+sstrand  12-> must be evaluated!
+
+'''
+import subprocess
+import sys
+last_command = " ".join(["lastal"] + sys.argv[1:])
+p = subprocess.Popen(last_command, shell=True, stdout=subprocess.PIPE)
+for j in p.stdout:
+    line = j.decode()
+    if line[0] != "#":
+        items = line.split("\t")
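+        # derive the strand from the orientation of the query start/end coordinates (columns 7 and 8)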
+        strand = "+" if int(items[6]) < int(items[7]) else "-"
+        out = "\t".join([items[i - 1]
+                         for i in [1, 13, 7, 8, 2, 14, 9, 10, 3, 12, 11]
+                         ]) + "\t" + strand + "\n"
+        print(out, end="")
diff -r c56807be3b72 -r 3bc73f5dc785 bin/mgblast
Binary file bin/mgblast has changed
diff -r c56807be3b72 -r 3bc73f5dc785 bin/runOGDFlayout
Binary file bin/runOGDFlayout has changed
diff -r c56807be3b72 -r 3bc73f5dc785 bin/select_and_sort_contigs.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/select_and_sort_contigs.pl Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,91 @@
+#!/usr/bin/env perl
+
+# Selects contigs exceeding the specified RD 
+# and sorts them based on their properties,
+# which have to be provided in the FASTA ID line:
+# >CLXContigY (length-RD-length*RD)
+# (RD = average Read Depth of the contig)
+# (GR = genome representation (also marked as "pxd") = length*RD)
+#
+# ver. 03 - requires input filename {fn}.info.fasta
+#         - output saved to {fn}.info.minRDxxx.fasta
+# 
+use warnings;
+
+$jm_contigs = $ARGV[0];
+if ($ARGV[1]) {
+  $min_pokryti = $ARGV[1];
+} else {
+  die "Missing argument - minRD\n";
+}
+if ($jm_contigs =~/(\S+).fasta$/) {
+  $jm_vystup_zakladni = $1.".minRD".$min_pokryti;
+  } else {
+  $jm_vystup_zakladni = $jm_contigs.".minRD".$min_pokryti;
+  }
+$jm_vystup_sort_RD    = $jm_vystup_zakladni."_sort-RD.fasta";       # min. read depth
+$jm_vystup_sort_delka = $jm_vystup_zakladni."_sort-length.fasta";
+$jm_vystup_sort_pxd   = $jm_vystup_zakladni."_sort-GR.fasta";
+
+%pokryti = ();         # average read coverage of each contig
+%delka_cont = ();      # contig length (of its consensus)
+%pxd = ();             # pxd = coverage x length
+%sekvence = ();        # contig sequences
+
+open (CONT, $jm_contigs) or die "Cannot open $jm_contigs\n";
+while ($radek = <CONT>) {
+  if ($radek =~/^>(CL\d+Contig\d+) \((\d+)\-(\S+)\-(\d+)\)/) {
+    if ($3>=$min_pokryti) {
+#      print "$1 : delka $2, prum. pokryti $3, pxd $4\n";
+      $delka_cont{$1}   = $2;
+      $pokryti{$1}      = $3;
+      $pxd{$1}          = $4; 
+    }
+  }
+}
+close CONT;
+
+# read from the contig file the sequences of those selected in the previous step
+open (CONT, $jm_contigs) or die "Cannot open $jm_contigs\n";
+$ukladat = 0;
+while ($radek = <CONT>) {
+  if ($radek =~/>(CL\d+Contig\d+)/) {
+      if (exists($pokryti{$1})) {
+          $ukladat = 1;
+          $jmeno = $1;
+      } else {
+          $ukladat = 0;
+      }
+  } elsif ($ukladat) {
+      $sekvence{$jmeno} .= $radek;
+  }
+}
+close CONT;
+
+# generate files in which the contigs that passed >= $min_pokryti are sorted
+# by coverage, by length, or by length x coverage
+open (VYSTUP, ">$jm_vystup_sort_RD") or die "Cannot write to $jm_vystup_sort_RD\n";
+foreach $klic (sort {$pokryti{$b} <=> $pokryti{$a}} keys(%pokryti)) {
+  print VYSTUP ">$klic ($delka_cont{$klic}\-$pokryti{$klic}\-$pxd{$klic})\n";
+  print VYSTUP "$sekvence{$klic}";
+}
+close VYSTUP;
+open (VYSTUP, ">$jm_vystup_sort_delka") or die "Cannot write to $jm_vystup_sort_delka\n";
+foreach $klic (sort {$delka_cont{$b} <=> $delka_cont{$a}} keys(%delka_cont)) {
+  print VYSTUP ">$klic ($delka_cont{$klic}\-$pokryti{$klic}\-$pxd{$klic})\n";
+  print VYSTUP "$sekvence{$klic}";
+}
+close VYSTUP;
+open (VYSTUP, ">$jm_vystup_sort_pxd") or die "Cannot write to $jm_vystup_sort_pxd\n";
+foreach $klic (sort {$pxd{$b} <=> $pxd{$a}} keys(%pxd)) {
+  print VYSTUP ">$klic ($delka_cont{$klic}\-$pokryti{$klic}\-$pxd{$klic})\n";
+  print VYSTUP "$sekvence{$klic}";
+}
+close VYSTUP;
+
+$celkem_pxd = 0;
+foreach $hodnota (values(%pxd)) {
+#  print "H: $hodnota\n";
+  $celkem_pxd += $hodnota;
+}
+print "\nTotal GR: $celkem_pxd\n\n"; 
diff -r c56807be3b72 -r 3bc73f5dc785 build_shed_tarball.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/build_shed_tarball.sh Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,4 @@
+#!/bin/sh
+./get_version.sh
+planemo shed_build .
+# TODO - add verification that the version in the xml matches the version tag
diff -r c56807be3b72 -r 3bc73f5dc785 checkR.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/checkR.R Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,31 @@
+#!/usr/bin/env Rscript
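+# Checks that every R package loaded via library() in the lib/*.R scripts is
+# installed, and warns when an installed version is older than required.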
+rfiles = dir(path = "lib", pattern ="[.]R$", full.names = TRUE, recursive = TRUE)
+rcode = grep('library', unlist(sapply(rfiles, readLines)), value = TRUE)
+
+packages = unique(gsub("[),].*$","", gsub("^.*library[(]","",rcode)))
+packages = c(packages, "Rserve") # Rserve added - it is loaded from python
+versions = list('igraph'="1.0.0")
+missing_packages = c()   # collect names of packages that fail to load
+for (i in packages){
+    packgs = installed.packages()
+    tryCatch(
+    {
+        suppressPackageStartupMessages(library(i, character.only = TRUE))
+        message(paste("package ",i,"succesfuly loaded"))
+        if( !is.null(versions[[i]])){
+            if (numeric_version(versions[[i]])>numeric_version(packgs[i,"Version"])){
+              message(paste("\033[0;31mversion",packgs[i,"Version"],
+                            "of package ",i,"is installed but at least ",
+                            numeric_version(versions[[i]]),"is needed!  \033[0m"))
+            }
+        }
+    },
+    error=function(cond){
+      message(paste('\033[0;31mpackage ',i,
+                    'was not loaded \033[0m - please install this package!'));
+        missing_packages <<- c(missing_packages,i)  # <<- so the assignment survives the handler
+    }
+    )
+}
+
+
diff -r c56807be3b72 -r 3bc73f5dc785 config.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/config.py Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,323 @@
+'''
+All configuration for clustering
+'''
+import os
+import tempfile
+from math import exp
+from collections import namedtuple
+MAIN_DIR = os.path.dirname(os.path.realpath(__file__))
+def add_base_path(base):
+    '''automates generating absolute paths in config'''
+    def joined_path(p):
+        '''create absolute path function'''
+        return os.path.join(base, p)
+    return joined_path
+
+PATH = add_base_path(MAIN_DIR)
+
+# clustering general settings
+DIRECTORY_TREE = {'libdir': 'libdir',
+                  'seqclust': 'seqclust',
+                  'assembly': 'seqclust/small_clusters_assembly',
+                  'blastx': 'seqclust/blastx',
+                  'clustering': 'seqclust/clustering',
+                  'clusters': 'seqclust/clustering/clusters',
+                  'superclusters': 'seqclust/clustering/superclusters',
+                  'mgblast': 'seqclust/mgblast',
+                  'blastn': 'seqclust/blastn',
+                  'prerun': 'seqclust/prerun',
+                  'prerun_clusters': 'seqclust/prerun/clusters',
+                  'sequences': 'seqclust/reads',
+                  'custom_databases': 'seqclust/custom_databases'}
+
+if "TEMP" in os.environ:
+    DIRECTORY_TREE['TEMP'] = os.environ["TEMP"]
+else:
+    DIRECTORY_TREE['TEMP'] = tempfile.TemporaryDirectory().name
+
+FILES = {'sample_db': DIRECTORY_TREE['TEMP'] + "/sample.db",
+         'sample_fasta': DIRECTORY_TREE['prerun'] + "/sample.fasta",
+         'prerun_cls_file' : DIRECTORY_TREE['prerun'] + "/sample_hitsort.cls",
+         'filter_sequences_file' : DIRECTORY_TREE['prerun'] + "/filter_sequences.fasta",
+         'sequences_db': DIRECTORY_TREE['TEMP'] + "/sequences.db",
+         'sequences_fasta': DIRECTORY_TREE['sequences'] + "/reads.fasta",
+         'hitsort': DIRECTORY_TREE['clustering'] + "/hitsort",
+         'hitsort_db': DIRECTORY_TREE['TEMP'] + "/hitsort.db",
+         'cls_file': DIRECTORY_TREE['clustering'] + "/hitsort.cls",
+         'clusters_summary_csv': "CLUSTER_TABLE.csv",
+         'profrep_classification_csv': "PROFREP_CLASSIFICATION_TEMPLATE.csv",
+         'superclusters_csv_summary': "SUPERCLUSTER_TABLE.csv",
+         'comparative_analysis_counts_csv': "COMPARATIVE_ANALYSIS_COUNTS.csv",
+         'clusters_info': ".clusters_info.csv",
+         'tarean_report_html': "tarean_report.html",
+         'cluster_report_html' : "cluster_report.html",
+         'supercluster_report_html' : 'supercluster_report.html',
+         'repeat_annotation_summary_rds' : 'repeat_annotation_summary.rds',
+         'summarized_annotation_html' : 'summarized_annotation.html',
+         'main_report_html' : 'index.html',
+         'TR_consensus_fasta': "TAREAN_consensus_rank_{}.fasta",
+         'summary_histogram' : 'summary_histogram.png',
+         'comparative_summary_map': 'comparative_summary.png',
+         "how_to_cite" : "HOW_TO_CITE.html",
+         'logfile' : "logfile.txt",
+         'contigs' : "contigs.fasta",
+         'filter_omitted' : DIRECTORY_TREE['sequences'] + "/removed_filtering_positive_reads.fasta",
+         'filter_kept' : DIRECTORY_TREE['sequences'] + "/kept_filtering_positive_reads.fasta"
+}
+
+# include in output - [source, destination]
+INCLUDE = [
+    [PATH("HOW_TO_CITE.html"), FILES["how_to_cite"]]
+]
+
+# these are attributes of path - not file names!
+FILES_TO_DISCARD_AT_CLEANUP = [
+    'prerun', 'mgblast', 'blastn', "blastx",
+    'hitsort', "repeat_annotation_summary_rds"
+]
+
+# relative links for html files
+HTML_LINKS = {
+    "CLUSTER_TO_SUPERCLUSTER" : "../../superclusters/dir_SC%04d/index.html",
+    "SUPERCLUSTER_TO_CLUSTER" : "../../clusters/dir_CL%04d/index.html",
+    "CLUSTER_TO_CLUSTER" : "../dir_CL%04d/index.html",
+    "SUPERCLUSTER_TO_SUPERCLUSTER" : "../dir_SC%04d/index.html",
+    "CLUSTER_TO_CLUSTER_TABLE" : "../../../../cluster_report.html",
+    "SEPERCLUSTER_TO_CLUSTER_TABLE" : "../../../../cluster_report.html",
+    "ROOT_TO_CLUSTER" : "se
[... middle of config.py truncated in source ...]
+    'output_columns' : "qseqid sseqid qlen slen length ppos bitscore",
+    'column_types' : [str, str, float, float, float, float, float],
+    'program': 'diamond blastx',
+    'filter_function' : lambda x: x.bitscore >= 30,
+    'parallelize' : False
+}
+BLASTX_W3 = {
+    'args': ' -num_alignments 1 -word_size 3 -evalue 0.01 ',
+    'output_columns' : "qseqid sseqid qlen slen length ppos bitscore",
+    'column_types' : [str, str, float, float, float, float, float],
+    'program': 'blastx',
+    'filter_function' : lambda x: x.bitscore >= 33
+}
+# BLASTX_W2: same as W3 but with word size 2 (dict copy avoids mutating BLASTX_W3)
+BLASTX_W2 = dict(BLASTX_W3)
+BLASTX_W2['args'] = ' -num_alignments 1 -word_size 2 -evalue 0.01 '
+
+ARGS = None
+
+ILLUMINA = Option(
+    name="illumina",
+    database='blastdb_legacy',
+    all2all_search_params=('mgblast -p 75 -W18 -UT -X40 -KT -JF  -F '
+                           '"m D" -v100000000 -b100000000'
+                           ' -D4 -C 30 -H 30 -i {query} -d {blastdb}'),
+    filtering_threshold=FilteringThreshod(55, 90, 0, 0, 1),
+    filter_self_hits=False,
+    legacy_database=True,
+    lastdb=False,
+    annotation_search_params=AnnotationParams(
+        blastn={
+            'args': ' -task blastn  -num_alignments 1 -evalue 0.01 ',
+            'output_columns' : "qseqid sseqid qlen slen length ppos bitscore",
+            'column_types' : [str, str, float, float, float, float, float],
+            'program': 'blastn',
+            'filter_function' : lambda x: x.length > 30 and x.bitscore > 60
+        },
+        blastx=BLASTX_W3,
+        blastn_trna={
+            'args': ' -task blastn  -num_alignments 1 -word_size 7',
+            'output_columns' : "qseqid sseqid qlen slen length ppos bitscore",
+            'column_types' : [str, str, float, float, float, float, float],
+            'program': 'blastn',
+            'filter_function' : lambda x: x.length > 18 and x.bitscore > 60
+        }
+    )
+)
+
+ILLUMINA_DUST_OFF = ILLUMINA._replace(
+    all2all_search_params=('mgblast -p 75 -W18 -UT -X40 -KT -JF  -F '
+                           'F -v100000000 -b100000000'
+                           ' -D4 -C 30 -H 30 -i {query} -d {blastdb}'),
+)
+
+ILLUMINA_SHORT = ILLUMINA._replace(
+    name="illumina_short",
+    all2all_search_params=('mgblast -p 75 -W18 -UT -X40 -KT -JF  -F '
+                           '"m D" -v100000000 -b100000000'
+                           ' -D4 -C 20 -H 30 -i {query} -d {blastdb}'),
+    filtering_threshold=FilteringThreshod(40, 90, 0, 0, 0.1)
+)
+
+OXFORD_NANOPORE = Option(
+    name="oxford_nanopore",
+    database='lastdb',
+    all2all_search_params=('last_wrapper.py -f blasttab+ -P1 '
+                           ' -m 700 -p {} '
+                           ' {{blastdb}} {{query}}  ').format(LASTAL_PARAMS),
+    filtering_threshold=FilteringThreshod(40, 50, 0, 0, 0.01),
+    filter_self_hits=True,
+    legacy_database=False,
+    lastdb=True,
+    annotation_search_params=AnnotationParams(
+        blastn={
+            'args': ' -task blastn  -num_alignments 1 -evalue 0.01 -word_size 11',
+            'output_columns' : "qseqid sseqid qlen slen length ppos bitscore",
+            'column_types' : [str, str, float, float, float, float, float],
+            'program': 'blastn',
+            'filter_function' : lambda x: x.length > 30 and x.bitscore > 50
+        },
+        blastx={
+            'args': ' -num_alignments 1 -word_size 2 -evalue 0.1',
+            'output_columns' : "qseqid sseqid qlen slen length ppos bitscore",
+            'column_types' : [str, str, float, float, float, float, float],
+            'program': 'blastx',
+            'filter_function' : lambda x: x.bitscore >= 30
+        },
+        blastn_trna={
+            'args': ' -task blastn  -num_alignments 1 -word_size 7',
+            'output_columns' : "qseqid sseqid qlen slen length ppos bitscore",
+            'column_types' : [str, str, float, float, float, float, float],
+            'program': 'blastn',
+            'filter_function' : lambda x: x.length > 18 and x.bitscore > 60
+        }
+    )
+)
diff -r c56807be3b72 -r 3bc73f5dc785 databases/0INFO.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/databases/0INFO.md Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,9 @@
+# DATABASE FOLDER CONTENT #
+
+This folder contains fasta sequence databases used for cluster annotation.
+To run clustering, blast databases must be created from the fasta files. This is done automatically when make is run.
+Folder content:
+dna_database.fasta - mitochondrial, plastid, rDNA genes, contaminants
+protein_database.fasta - protein domains from transposable elements
+tRNA_database.fasta - fragments of tRNA sequences for identification of Primer Binding Sites of transposable elements
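+
+The blast databases are built by the repository Makefile, e.g. for the tRNA database:
+
+    makeblastdb -in databases/tRNA_database.fasta -out databases/tRNA_database.fasta -dbtype nucl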

diff -r c56807be3b72 -r 3bc73f5dc785 databases/classification_tree_metazoa_v0.rds
Binary file databases/classification_tree_metazoa_v0.rds has changed
diff -r c56807be3b72 -r 3bc73f5dc785 databases/classification_tree_metazoa_v2.rds
Binary file databases/classification_tree_metazoa_v2.rds has changed
diff -r c56807be3b72 -r 3bc73f5dc785 databases/classification_tree_metazoa_v3.rds
Binary file databases/classification_tree_metazoa_v3.rds has changed
diff -r c56807be3b72 -r 3bc73f5dc785 databases/classification_tree_viridiplantae_v3.0.rds
Binary file databases/classification_tree_viridiplantae_v3.0.rds has changed
diff -r c56807be3b72 -r 3bc73f5dc785 databases/classification_viridiplantae_tree.rds
Binary file databases/classification_viridiplantae_tree.rds has changed
diff -r c56807be3b72 -r 3bc73f5dc785 databases/create_protein_database_viridiplantae_v3.0.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/databases/create_protein_database_viridiplantae_v3.0.R Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,119 @@
+#!/usr/bin/env Rscript
+## prepare the protein database in a format suitable for repeatexplorer search
+library(Biostrings)
+domains = readAAStringSet("/mnt/raid/454_data/databases/protein_domains/new_protein_domains_prelim/coded01/ALL_protein-domains_05.fasta")
+                                        # this cannot be used - __ is also in element id!!!
+                                        # element_names = gsub("^.+__","",names(domains))  #
+
+# this should be version 3
+domains = readAAStringSet("/mnt/raid/454_data/databases/protein_domains/Viridiplantae_v001/Viridiplantae_v001_ALL_protein-domains.fasta")
+                                        # this cannot be used - __ is also in element id!!!
+                                        # element_names = gsub("^.+__","",names(domains))  #
+
+element_names = sapply(strsplit(names(domains),split="__"),function(x)paste(x[-1],collapse="__"))
+
+classification = readLines("/mnt/raid/454_data/databases/protein_domains/Viridiplantae_v001/Viridiplantae_v001_ALL_classification")
+## classification contains slashes in categories - they must be replaced with underscores
+classification = gsub("/","_",classification)
+
+names(classification) = sapply (strsplit(classification, split="\t"),"[[", 1)
+classification_formated = sapply (sapply(strsplit(classification, "\t"), "[",-1), paste, collapse="/")
+domain_name = gsub("__.+","",names(domains))
+table(domain_name)
+full_names = paste0(names(domains),"#", classification_formated[element_names],':', domain_name)
+head(full_names)
+names(domains) = full_names
+writeXStringSet(domains,"/mnt/raid/users/petr/workspace/repex_tarean/databases/protein_database_viridiplantae_v3.0.fasta")
+
+
+library(data.tree)
+library(treemap)
+data(GNI2014)
+class(GNI2014)
+head(classification_formated)
+
+## compile all classification together
+dna_dat = readDNAStringSet("/mnt/raid/users/petr/workspace/repex_tarean/databases/dna_database.fasta")
+dna_dat = readDNAStringSet("/mnt/raid/users/petr/workspace/repex_tarean/databases/dna_database_masked.fasta")
+
+add_weight = function(i, name){
+    if (is.null(i$parent[[name]])){
+        i$parent[[name]] = i[[name]]
+    }else{
+        i$parent[[name]] = i[[name]] + i$parent[[name]]
+    }
+    if (i$parent$level == 1){
+        return()
+    }else{
+        add_weight(i$parent, name)
+    }
+}
+
+
+cls_string = c(
+    "All/contamination",
+    "All/organelle/plastid",
+    "All/organelle/mitochondria",
+    "All/repeat/rDNA/45S_rDNA/18S_rDNA",
+    "All/repeat/rDNA/45S_rDNA/25S_rDNA",
+    "All/repeat/rDNA/45S_rDNA/5.8S_rDNA",
+    "All/repeat/rDNA/5S_rDNA",
+    "All/repeat/satellite",
+    "All/repeat/mobile_element/Class_I/SINE",
+    "All/repeat/mobile_element/Class_II/Subclass_1/TIR/MITE"
+)
+cls_full_name =c(
+    "contamination",
+    "organelle/plastid",
+    "organelle/mitochondria",
+    "45S_rDNA/18S_rDNA",
+    "45S_rDNA/25S_rDNA",
+    "45S_rDNA/5.8S_rDNA",
+    "5S_rDNA/5S_rDNA",
+    "satellite",
+    "Class_I/SINE",
+    "Class_II/Subclass_1/TIR/MITE"
+)
+
+
+
+df1 = data.frame(pathString = cls_string, full_name=cls_full_name, stringsAsFactors = FALSE, nhits = 0,domains=0, prop=0, mean_weight=0, total_weight=0)
+df2 = data.frame(pathString = paste("All/repeat/mobile_element",unique(classification_formated),sep="/"), full_name= unique(classification_formated), stringsAsFactors = FALSE,  nhits = 0, domains = 0, prop =0, mean_weight=0, total_weight = 0)
+cls_tree = as.Node (rbind(df1,df2))
+saveRDS(object = cls_tree, file = "/mnt/raid/users/petr/workspace/repex_tarean/databases/classification_tree_viridiplantae_v3.0.rds")
+
+print(cls_tree, "nhits", 'domains')
+
+names(cls_tree)
+cls_tree$leaves[[3]]$mean_weight
+
+cls_tree$leaves[[1]]$mean_weight
+
+
+
+## add to nodes
+add_value_to_nodes = function(tr, name="weight"){
+  for (i in Traverse(tr)){
+      w = sum(sapply(i$leaves,"[[",name))
+      i[[name]] = w
+  }
+  return(tr)
+}
+
+
+tr2 = add_value_to_nodes(cls_tree, name="nhits")
+print(tr2,"nhits")
+
+## add_nhits
+for (i in cls_tree$leaves){
+    cls_string = i$full_name
+    ## go to the root:
+    nhits = i$nhits
+    prop = i$prop
+    mean_weight = i$mean_weight
+    add_weight(i, "nhits")
+}
+
+
+
+
diff -r c56807be3b72 -r 3bc73f5dc785 databases/database_version.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/databases/database_version.txt Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,6 @@
+#Databases versions are stored in the following format:
+#filename md5sum database_name version_number note
+dna_database.fasta f15fad8ea4df2d782019f40b422dbaec dna_database 1 contains mitochondrial, plastid, rDNA, contaminants
+
+
+
diff -r c56807be3b72 -r 3bc73f5dc785 databases/dna_database_masked.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/databases/dna_database_masked.fasta Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,1445911 @@
+>1_gi|2462950|emb|X94256.1|#45S_rDNA/18S_rDNA
[... 1,445,911 lines of FASTA sequence data truncated in source ...]
diff -r c56807be3b72 -r 3bc73f5dc785 databases/lastal_params
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/databases/lastal_params Fri Dec 20 14:17:59 2019 +0000
b'@@ -0,0 +1,661 @@\n+# lastal version: 956\n+# maximum percent identity: 100\n+# scale of score parameters: 4.5512\n+# scale used while training: 91.024\n+\n+# lastal -j7 -S1 -P8 -r5 -q5 -a15 -b3 ps_tail_near /tmp/tmpLzO6pN\n+\n+# aligned letter pairs: 645322.5\n+# deletes: 53030.076\n+# inserts: 52961.894\n+# delOpens: 27321.807\n+# insOpens: 27522.598\n+# alignments: 1715\n+# mean delete size: 1.94094\n+# mean insert size: 1.92431\n+# delOpenProb: 0.0389264\n+# insOpenProb: 0.0392125\n+# delExtendProb: 0.484787\n+# insExtendProb: 0.480332\n+\n+# delExistCost: 290\n+# insExistCost: 288\n+# delExtendCost: 66\n+# insExtendCost: 67\n+\n+# substitution percent identity: 80.7124\n+\n+# count matrix (query letters = columns, reference letters = rows):\n+#   A              C              G              T             \n+# A 167864.68      8440.25411     14824.7457     10778.6026    \n+# C 8305.6793      92568.78       5527.30401     14593.1965    \n+# G 14303.26163    5657.737106    91602.82       8233.2595     \n+# T 10559.8569     14962.3325     8284.02998     168830.9      \n+\n+# probability matrix (query letters = columns, reference letters = rows):\n+#   A              C              G              T             \n+# A 0.260119       0.0130788      0.0229721      0.0167023     \n+# C 0.0128703      0.143442       0.00856498     0.0226133     \n+# G 0.022164       0.0087671      0.141946       0.0127581     \n+# T 0.0163633      0.0231853      0.0128367      0.261616      \n+\n+# score matrix (query letters = columns, reference letters = rows):\n+#        A      C      G      T\n+# A     89   -137    -85   -161\n+# C   -138    128   -128    -87\n+# G    -87   -126    129   -138\n+# T   -163    -85   -138     89\n+\n+# lastal -j7 -S1 -P8 -p- ps_tail_near /tmp/tmpLzO6pN\n+\n+# aligned letter pairs: 657515.8\n+# deletes: 54576.405\n+# inserts: 54467.682\n+# delOpens: 29338.269\n+# insOpens: 29782.167\n+# alignments: 1810\n+# mean delete size: 1.86025\n+# mean insert size: 1.82887\n+# delOpenProb: 0.0408357\n+# insOpenProb: 0.0414535\n+# delExtendProb: 0.462437\n+# insExtendProb: 0.453214\n+\n+# delExistCost: 277\n+# insExistCost: 273\n+# delExtendCost: 70\n+# insExtendCost: 72\n+\n+# substitution percent identity: 82.8127\n+\n+# count matrix (query letters = columns, reference letters = rows):\n+#   A              C              G              T             \n+# A 172119.6       6326.08979     16347.823382   7375.214      \n+# C 6451.841992    99738.42       4226.312847    16076.55299   \n+# G 15766.519333   4374.698921    99296.38       6396.707638   \n+# T 7212.54884     16138.520254   6316.90954     173353.79     \n+\n+# probability matrix (query letters = columns, reference letters = rows):\n+#   A              C              G              T             \n+# A 0.261772       0.00962117     0.0248629      0.0112167     \n+# C 0.00981242     0.151689       0.00642768     0.0244504     \n+# G 0.0239788      0.00665335     0.151017       0.00972857    \n+# T 0.0109694      0.0245446      0.0096072      0.263649      \n+\n+# score matrix (query letters = columns, reference letters = rows):\n+#        A      C      G      T\n+# A     93   -165    -79   -194\n+# C   -163    128   -159    -81\n+# G    -81   -156    129   -164\n+# T   -196    -81   -166     93\n+\n+# lastal -j7 -S1 -P8 -p- ps_tail_near /tmp/tmpLzO6pN\n+\n+# aligned letter pairs: 668948.5\n+# deletes: 57981.575\n+# inserts: 58221.335\n+# delOpens: 31807.825\n+# insOpens: 32507.3002\n+# alignments: 1869\n+# mean delete size: 
1.82287\n+# mean insert size: 1.79102\n+# delOpenProb: 0.0432681\n+# insOpenProb: 0.0442196\n+# delExtendProb: 0.451415\n+# insExtendProb: 0.44166\n+\n+# delExistCost: 268\n+# insExistCost: 263\n+# delExtendCost: 72\n+# insExtendCost: 74\n+\n+# substitution percent identity: 83.6608\n+\n+# count matrix (query letters = columns, reference letters = rows):\n+#   A              C              G              T             \n+# A 177183.17      5504.869925    17286.571547   6133.25055    \n+# C 5702.212901    102462.3342    3582.471673    16864.00328   \n+# G 16580.48127'..b'79943.91     \n+\n+# probability matrix (query letters = columns, reference letters = rows):\n+#   A              C              G              T             \n+# A 0.267243       0.00676707     0.0264971      0.00723927    \n+# C 0.00726576     0.155061       0.00426108     0.0259319     \n+# G 0.0255491      0.00442638     0.153474       0.00708736    \n+# T 0.0071642      0.0257506      0.00684726     0.269435      \n+\n+# score matrix (query letters = columns, reference letters = rows):\n+#        A      C      G      T\n+# A     95   -197    -73   -235\n+# C   -191    131   -196    -76\n+# G    -75   -192    131   -193\n+# T   -235    -76   -196     94\n+\n+# lastal -j7 -S1 -P8 -p- ps_tail_near /tmp/tmpLzO6pN\n+\n+# aligned letter pairs: 667848.4\n+# deletes: 61567.479\n+# inserts: 62006.567\n+# delOpens: 34755.709\n+# insOpens: 35753.0592\n+# alignments: 1895\n+# mean delete size: 1.77143\n+# mean insert size: 1.7343\n+# delOpenProb: 0.0469511\n+# insOpenProb: 0.0482984\n+# delExtendProb: 0.435486\n+# insExtendProb: 0.423399\n+\n+# delExistCost: 255\n+# insExistCost: 248\n+# delExtendCost: 76\n+# insExtendCost: 78\n+\n+# substitution percent identity: 84.5102\n+\n+# count matrix (query letters = columns, reference letters = rows):\n+#   A              C              G              T             \n+# A 178498.03      4524.992357    17712.490447   4841.2626     \n+# C 4848.343909    103538.1147    2846.8428447   17309.54422   \n+# G 17107.155533   2955.1213614   102468.661     4729.73675    \n+# T 4786.48956     17208.555606   4576.294121    179887.61     \n+\n+# probability matrix (query letters = columns, reference letters = rows):\n+#   A              C              G              T             \n+# A 0.267277       0.00677557     0.0265221      0.00724914    \n+# C 0.00725975     0.155034       0.00426277     0.0259187     \n+# G 0.0256157      0.0044249      0.153433       0.00708215    \n+# T 0.00716713     0.0257675      0.00685239     0.269358      \n+\n+# score matrix (query letters = columns, reference letters = rows):\n+#        A      C      G      T\n+# A     95   -197    -72   -234\n+# C   -191    131   -196    -76\n+# G    -75   -192    131   -193\n+# T   -235    -76   -196     94\n+\n+# lastal -j7 -S1 -P8 -p- ps_tail_near /tmp/tmpLzO6pN\n+\n+# aligned letter pairs: 667329.4\n+# deletes: 61526.981\n+# inserts: 61935.551\n+# delOpens: 34736.121\n+# insOpens: 35715.1049\n+# alignments: 1891\n+# mean delete size: 1.77127\n+# mean insert size: 1.73416\n+# delOpenProb: 0.0469615\n+# insOpenProb: 0.048285\n+# delExtendProb: 0.435433\n+# insExtendProb: 0.42335\n+\n+# delExistCost: 255\n+# insExistCost: 248\n+# delExtendCost: 76\n+# insExtendCost: 78\n+\n+# substitution percent identity: 84.5031\n+\n+# count matrix (query letters = columns, reference letters = rows):\n+#   A              C              G              T             \n+# A 178304.73      4517.257925    17741.252681   4858.77123    \n+# C 
4843.594396    103456.2463    2845.5696656   17298.79994   \n+# G 17092.716475   2952.0734537   102484.764     4727.692877   \n+# T 4781.83456     17183.48844    4570.956339    179662.41     \n+\n+# probability matrix (query letters = columns, reference letters = rows):\n+#   A              C              G              T             \n+# A 0.267194       0.00676923     0.0265857      0.007281      \n+# C 0.00725826     0.155032       0.00426416     0.0259227     \n+# G 0.0256139      0.00442376     0.153576       0.00708457    \n+# T 0.00716571     0.0257499      0.0068497      0.269229      \n+\n+# score matrix (query letters = columns, reference letters = rows):\n+#        A      C      G      T\n+# A     95   -197    -72   -234\n+# C   -191    131   -196    -76\n+# G    -75   -192    131   -193\n+# T   -235    -76   -196     94\n+\n+#last -a 13\n+#last -A 12\n+#last -b 4\n+#last -B 4\n+#last -S 1\n+# score matrix (query letters = columns, reference letters = rows):\n+       A      C      G      T\n+A      5    -10     -4    -12\n+C    -10      7    -10     -4\n+G     -4    -10      7    -10\n+T    -12     -4    -10      5\n'
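For reference, the trained score matrices in this file appear to follow the standard scaled log-odds construction (a sketch, assuming the usual last-train scaling, which is not stated explicitly in the file; t is the "scale used while training" from the header, 91.024 here):

    \[ S(x,y) = \operatorname{round}\!\left( t \cdot \ln \frac{P(x,y)}{P_1(x)\,P_2(y)} \right), \qquad t = 91.024 \]

where P(x,y) is the probability matrix and P1, P2 are its reference (row) and query (column) marginals. As a spot check on the first matrix, t · ln(0.260119 / (0.31287 · 0.31152)) ≈ 89, matching the reported A/A score.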
diff -r c56807be3b72 -r 3bc73f5dc785 databases/protein_database.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/databases/protein_database.fasta Fri Dec 20 14:17:59 2019 +0000
b'@@ -0,0 +1,11472 @@\n+>Ty1-GAG__ATCOPIA27_I#Ty1_copia/AleI/Retrofit\n+IRHHIAESLKNQYLTVEDPLELWLELKNRYDHQRTIQLPKAQHDWLNLRIQDYKSVEEYNSELFKIVSILRLCGEKVTEN\n+DMLEKTFSTFHANNVLLQQQYRAKGFTTYTSLASCLLLAEKNNELLLMNSALRPPGSTA\n+>Ty1-GAG__HOBS_I#Ty1_copia/AleI/Retrofit\n+LLNSLGPAQQALVDTSTTARKVWEKLRENYAQNVAQQIASLEAQLANLYQGDDKINVYSYKLETICRKLDHVDAPVSGLR\n+KLRTFLRGLGPQHDVWRKIFYFNTRLFFQKEGDSD\n+>Ty1-GAG__Copia21-PTR_I#Ty1_copia/AleI/Retrofit\n+IYGTISEDLLNTILERDSTAALAWNRLRDIFSDNKNSRALYLEQEFSKVQMEHFADASSYCQHLKSLSDQLSNVGSPVTN\n+ERLVLQLVSGLTDAYASVGSQMRHGDSLPPFYKARSMLVLEE\n+>Ty1-GAG__COP_I_MT#Ty1_copia/AleI/Retrofit\n+IYSTISFDLLTTIMEKGSTAMAAWNRLADIFEDNQNSRAVALEQDFSSTRMEDFSNVSAYCQRLKQLSDQLKNVGAPVSS\n+HRLVLQLVSGLSEPYRGVATLIRQSNPLPSFFQARSMLTLEE\n+>Ty1-GAG__Copia-50_ZM-I#Ty1_copia/AleI/Retrofit\n+ILTTVSKGVFDIIRRDCNDAFSLWHAIEDLFQDNELQRAVYLEAELRSLQQGDLSMNAYCTKLKRLADQLRDIGHPVSEP\n+SQVLNLLRGLNPKYRYVKPVITSKFPAHTFMSARSFLMLEE\n+>Ty1-GAG__Copia-52_ZM-I#Ty1_copia/AleI/Retrofit\n+LHATLADDLLDMVMDDDDGTAHQVWSKIANFFLGNKDSRAVQLEQDLHNLEQGDLSAAAFCHRLKTLADALADCDRPIDD\n+RALVHQLIRGLHPKFHVLKQMLPAMPSFPTFMEARDHLIVAE\n+>Ty1-GAG__Copia-47_ZM-I#Ty1_copia/AleI/Retrofit\n+IVGTISVDLHSLLRNLPHARAVWLAIEGQFMGNAEARALRLDAAFRTFVQGDLSVSAYCRKMKTMADSLGDLGCPVEDRI\n+LVLNVLRGLGDRYTHLRSLIMRQRPFPTFLQVRDDLALEE\n+>Ty1-GAG__SBCOPIA1_I#Ty1_copia/AleI/Retrofit\n+ILGTISLDLHDLVRNTPSARGAWLALEGQFLGNAEARALRLDASFRTFVQGDLSVSEYCRQMKGMADSLGDLGWPVEDRI\n+LVLNVLRGLSDRYSYLRTWITRQRPFPTFLQVRDDLVMEE\n+>Ty1-GAG__Copia8-ZM_I#Ty1_copia/AleI/Retrofit\n+IFGTVSIELQERHGTARQAWLALENHFIGNRETRALHLDATFRNFVQGDLTVGEYCRKMKGFADALSDLDAPVSDRILVL\n+NVLRGLNPKYANLRTIITRSVPFXTFHKVRDDLVLEE\n+>Ty1-GAG__Copia3-ZM_I#Ty1_copia/AleI/Retrofit\n+SPSPDPRAPCTSALQCRPPRSRRGPAPRRLACGSVSPXGGRRRLHLRLPPPVHRAASCAPAAASSASTSAVAAAASVADP\n+DACRRRRLSGRLRTDNSPPSLPCAPXACAPPAAXPRPCAASRRRPS\n+>Ty1-GAG__Copia-48_ZM-I#Ty1_copia/AleI/Retrofit\n+INGSITDNLADMISERGASARVLWLAIESQFLGNRTTRTLYADQAFRSFTQGDLSAADYCRRYKKLAEDLRDLGEPVSDK\n+TLVLNIIRGLNERFQALGLHLRRTSPLPTFLQVRDDLTLEE\n+>Ty1-GAG__OSCOPIA2_I#Ty1_copia/AleI/Retrofit\n+LFGTISFDLLQDVLATDTTARLVWRGLEYQFLGNSEQRALNLTTEFHTFQQGDLSVDEYCRKMKTFADSLGDVGEPVRDR\n+TLVLNTLNGLSEKFNNLRSLVPMQRPFPTFAELRSLLRLEE\n+>Ty1-GAG__Copia-44_SB-I#Ty1_copia/AleI/Retrofit\n+LYSTVSKEIWNDVYRPNNTALAAWTAITGQFLDNSLQQAVYLQQEFHSLFQGDLSVGEYCGRLKRLADSLYDCGAAVSDQ\n+ALVINTLRGLNNKFSQAIAVLSTMTPPPSFLYTKSYLLQEE\n+>Ty1-GAG__Copia-60_SB-I#Ty1_copia/AleI/Retrofit\n+MYGSVDDAVLDLAMEPDQDARALWVSIEALFQANKESRAVVLEQEFHNLLQGDLSIDVYAQQMKRTADALREVGHTVSPA\n+QLVLNLLRGLNPRFANTADIISNTSPLPDFKAATNMLRVKE\n+>Ty1-GAG__ATCOPIA51_I#Ty1_copia/AleI/Retrofit\n+LFGTLSEEVLGHVHNLTTSRQIWISLAENFNKSSIAREFSLRRNLQLLTKKDKSLSVYCRDFKIICDSLSSIGKPVEESM\n+KIFGFLNGLGREYDPITTVIQSSLSKLPAPTFNDVISEVQGFD\n+>Ty1-GAG__ATCOPIA52_I#Ty1_copia/AleI/Retrofit\n+LFGTLSEEVLGYVHNLQTSRDIWISLAENFNKSSVAREFTLRRTLQLLSKKDKTLSAYCREFIAVCDALSSIGKPVDESM\n+KIFGFLNGLGREYDPITTVIQSSLSKISPPTFRDVISEVKGFD\n+>Ty1-GAG__ATCOPIA7I#Ty1_copia/AleI/Retrofit\n+IFGSLSEEALKVVIGLNSAQEVWLGLARRFNRFSTTRKYDLQKRLGTCSKAGKTMDAYLSEVKNICDQLDSIGFPVTEQE\n+KIFGVLNGLGKEYESIATVIEHSLDVYPGPCFDDVVYKLTTFD\n+>Ty1-GAG__ATCOPIA47_I#Ty1_copia/AleI/Retrofit\n+ILGSLSEDILEEVITESTAQQVWEGLARYFNRVSTARLFELQRKQQTMCKHDTPMIDYIKGIKNICEQLASAGSPVKEQM\n+KFFAALNGLGREYEPIKTSIEGSMESTPAPTLDSITPRLTGFA\n+>Ty1-GAG__ATCOPI1_I#Ty1_copia/AleI/Retrofit\n+LLGSFAEDILSVVVNCFTSHQVWLTLANHFNRVSSSRLFELQRRLQTLEKKDNTMEVFLKDLKHICDQLASVGSPVPEKM\n+KIFSALNGLGREYEPIKTTIENSVDSNPSLSLDEVASKLRGYD\n+>Ty1-GAG__ATCOPIA21I#Ty1_copia/AleI/Retrofit\n+LIGAISVAVQPLLSQATTSAQIWRKLVDTYANPSRGHKQQIREQIKQWKKGSRSIDDYVLGLTTRFDQLALLEEAIPHED\n+QIAYILGGLSDDYRRVIDQIEGRDISPSITELHEKLINFE\n+>Ty1-GAG_
_ATCOPIA3I#Ty1_copia/AleI/Retrofit\n+LLGAISISVQPILSRTTTSAEIWTKLMDTYAKPSWSHIQQLRQQIKQWKKDTKSIDEFFQGLVMRFDQLALLGKPMESEE\n+QMEVIVEGLSDDYKQVIDQIQGREVPPSLTEIHEKLLNHE\n+>Ty1-GAG__ATCOPIA6I#Ty1_copia/AleI/Retrofit\n+LLGAISLSVQPLLSKANTSAEIWETLSSTFANPSWAHVQQLRQQLKQWTKGTKSIVTYFQGFTTRFDHLALLGKAPEREE\n+QIELILGGLPEDYKTVVDQIEGRENPPALTEVLEKLINHE\n+>Ty1-GAG__ATCOPIA4I#Ty1_copia/AleI/Retrofit\n+LIGAISPPVQPLVSRATKASQIWKTLTNTYAKSSYDHIKQLRTQIKQLKKGTKTIDEYVLSHTTLLDQLAILGKPMEHEE\n+QVERILEGLPEDYKTVVDQIEGKDNTPSITEIHERLINHE\n+>Ty1-GAG_'..b'copia/AleII\n+DENMTYATKPLEPQEDKEQLEAEEQLVPEEALIVPAGPLTRSKSKKFNQAINGLLKELKKNQEDVAQSSFIVITAQ\n+>Ty3-CHDCR__CRA4_ID2#Ty1_copia/AleII\n+DINLTSQTAELQAVPHLLLQPVPEVPDGIMTSSKAKQLKKRFNLVVQDILSYQEL\n+>Ty3-CHDCR__LotJ3_ID88#Ty1_copia/AleII\n+DEDRSSPDKDPLQEIGGPMTRSKTKRMKQALQGLILELKGKEDQNKLEATPKWVNFLEH\n+>Ty3-CHDCR__PopT1_ID21#Ty1_copia/AleII\n+DVDQPRNTSKDPLHVPNGPMTQSKTKTLKEALNALVLNVSTRSELKGPLEYQEETLVHLI\n+>Ty3-CHDCR__PopT10_ID168#Ty1_copia/AleII\n+DTNKPNTKRNHANDPLEVPIGPITRARANKLKEALNELVQNIWSKMDLERLGTFKEHKGQPLIHLV\n+>Ty3-CHDCR__VitV2_ID128#Ty1_copia/AleII\n+DENQQAFKDPLHVPVGLITKARSKKIKEALNGLIQDI\n+>Ty3-CHDCR__MusB1_ID98#Ty1_copia/AleII\n+NEQVDHNSAKDPLIFRGGPMTRAKAKMMKEALTCLLEGIWKEXAGQNLVKVLWIQEEPKIVNMI\n+>Ty3-CHDCR__MedT1_ID93#Ty1_copia/AleII\n+DEDIVQDISDAIQSLGGPMTRARARRVNDALVHFIIKSIEGSAQVEEGVAQVEEKEPKFIIII\n+>Ty3-CHDCR__PiSat1_ID200#Ty1_copia/AleII\n+DEDIIQDINDTMQGLGGPMTRARARRVNDALVHFMIKSIECMGQIEEKEPKFILIIQA\n+>Ty3-CHDCR__MedT2_ID89#Ty1_copia/AleII\n+DEGMVVHDTSASIQGLGGPMTRSRTKKAKEALTQLVAKVLESKPTLESMEDKMVMCI\n+>Ty3-CHDCR__LjRE2_ID63#Ty1_copia/AleII\n+DEDKDKDKGHGALKGLGGPMTRARAKRAKEALQQMIALALEEGTHVRELEPKLVNFLMNYEE\n+>Ty3-CHDCR__LotJ2_ID65#Ty1_copia/AleII\n+DEDKDKDKGHGALKGLGGPMTRARAKKAKETLQQVVATILEDKVVEEMEPKIMMIIQAQEE\n+>Ty3-CHDII__Reina_ZeaM_ID49#Ty1_copia/AleII\n+VLESRLLRKGNKVIPQLLIRWSNWPASLSTWEDEHAIKQQFPRAPA\n+>Ty3-CHDII__GlyM_ID230#Ty1_copia/AleII\n+VIGSRLITQGGVSIPHSLIQWKNKSSEDVTWEDDAVIRGQFPDFSL\n+>Ty3-CHDII__PopT_ID202#Ty1_copia/AleII\n+IVDRQVRKLRSKDIASVKVQWKGHSREEATWELEDKMREEYPHLFD\n+>Ty3-CHDII__GlyM_ID222#Ty1_copia/AleII\n+ITDRRTKSLRGKEIALVKVQWGTDEGDSTWELEDRMRELYPSLFI\n+>Ty3-CHDII__Peabody_PiSat_ID47#Ty1_copia/AleII\n+IENPELKQLRGKEIALVKVAWGGPAGGNVTWELESQMKESYPELFA\n+>Ty3-CHDII__LotJ_ID226#Ty1_copia/AleII\n+IEERRIKQLRNKQVPLVKVIWNQVTGDATWELEEKMKEQYPELFT\n+>Ty3-CHDII__GlyT_ID221#Ty1_copia/AleII\n+IEDRRIKTLRGKEIPLVKVIWGRGTSEDATWELESKMRASYPTLFE\n+>Ty3-CHDII__MedT_ID218#Ty1_copia/AleII\n+IDDRKVKTLRGKEIPLVRVVWSGATGESLTWELESKMLESYPELFA\n+>Ty3-CHDII__VitV_ID214#Ty1_copia/AleII\n+FWKLESIGSRNKVIPAVKVWWQHHGIEEATWEPEEEMRRHYPQLFY\n+>Ty3-CHDII__OryS_ID254#Ty1_copia/AleII\n+ILDEAEKRTRSKVWRMYKVQWSNHTEAEATWESEEFLRTEYPHLFE\n+>Ty3-CHDII__Retrosor2_SorB_ID48#Ty1_copia/AleII\n+ILETAERVTRSRVIRMCKVQWNRHSEAEATWEREDDLRKSYSYLFE\n+>Ty3-CHDII__Tekay_ZeaM_ID50#Ty1_copia/AleII\n+ILETSRRITRSKVINMCKVQWSHHSEDEATWEREDELRAEFPQLFS\n+>Ty3-CHDII__Legolas_AraT_ID39#Ty1_copia/AleII\n+IMDRMTKGTRGKARDLLKVLWNCRGREEYTWETENKMKANFPEWFK\n+>Ty3-CHDII__AraT_ID257#Ty1_copia/AleII\n+VLERRIKELRRKKIPLIKVMWDCDGVTEETWEPEARMKARFKKWFE\n+>Ty3-CHDII__BraR_ID236#Ty1_copia/AleII\n+VLSVRRVQHGTKEVQEALIKWKNMPIEEATWEEYDQLVASFPFFVS\n+>Ty3-CHDII__LORE2A_LotJ_ID51#Ty1_copia/AleII\n+IMDTRENRDGDLEVLIRWKDLPTFEDSWEDFSKLLDQFPNHQL\n+>Ty3-CHDII__LORE2B_LotJ_ID52#Ty1_copia/AleII\n+IMDTRENRDGDLEVLIRWKDLPTFEDSWEDFSKLLDQFPNHQL\n+>Ty3-CHDII__SelM_ID252#Ty1_copia/Angela\n+ILNARKSKRQGREVREFHVKWRGFPHCEATWEPEENLANARDLVEE\n+>Ty3-CHDII__PhyP_ID251#Ty1_copia/Angela\n+VLDSRRNRRKLEYLVHWSGYDINERTWERAENLANAPKKVXE\n+>Ty3-CHDII__LotJ_ID234#Ty1_copia/Angela\n+ILAKRTVTVQGEQIQQLLVQWKGQ
GLDEATWEDLITIKSQFPSFSL\n+>Ty3-CHDII__MedT_ID232#Ty1_copia/Angela\n+ILNVRNIIRGDRKVEQLLVKWKDMQNSEATWEDKQEMLDSYPNLNL\n+>Ty3-CHDII__GlyM_ID228#Ty1_copia/Angela\n+ILASRIIIRGHNQIEQILVQWENGLQDEATWEDIEDIKASYPTFNL\n+>Ty3-CHDII__Galadriel_LycE_ID42#Ty1_copia/Angela\n+ILDHRVLGTSKKNTKTEFLVHWKGKSAADAVWEKAKDLWQFDAQIDD\n+>Ty3-CHDII__PopT_ID203#Ty1_copia/Angela\n+IMDHRRLGQHRKNRRTEFLVKWKKNEEVSWEKDTDLWQFEDQIQD\n+>Ty3-CHDII__Gloin_AraT_ID210#Ty1_copia/Angela\n+LLDIRQSRTTDGADVLVQWSGMSALEATWEPLVTLVKQFPSFDL\n+>Ty3-CHDII__MusA_ID43#Ty1_copia/Angela\n+ILADRKIKLPNGAEQTEYLVKWRKLPRTEASWEPEDALRHEEEVINN\n+>Ty3-CHDII__VitV_ID211#Ty1_copia/Angela\n+IIADRIIRRRGVPPATEYLVKWKGLPESEASWEPANALWQFQEQIER\n+>Ty3-CHDII__Gimli_AraT_ID40#Ty1_copia/Angela\n+ILKRKLVNRHGRAATKVLVQWTNEDEAEATWEFLFDLLQKYPTFNH\n+>Ty3-CHDII__LORE1a_LotJ_ID256#Ty1_copia/Angela\n+ILKRRMVQRRHKAVTEVLVQWLGEMEEEATWEVLYNLKLKYPTFDT\n+>Ty3-CHDII__VitV_ID215#Ty1_copia/Angela\n+ILDRRLVKRHNVPAVQLLIHWVNKSPTDASWEFADDLKRRFPAFFL\n+>Ty3-CHDII__MusA_ID235#Ty1_copia/Angela\n+ILDRRIVMRRRHPSTEVLVHWNNLPLEDATWEPYEELKTRFPEFME\n'
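Each FASTA header in this database encodes a record name and its classification lineage separated by `#` (e.g. `Ty1-GAG__ATCOPIA27_I#Ty1_copia/AleI/Retrofit`). A quick way to tally entries per lineage with standard tools (illustrative invocation; the path is the one added in this changeset):

    # count protein database entries per classification lineage
    grep '^>' databases/protein_database.fasta | cut -d'#' -f2 | sort | uniq -c | sort -rn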
diff -r c56807be3b72 -r 3bc73f5dc785 databases/satellite_model.rds
Binary file databases/satellite_model.rds has changed
diff -r c56807be3b72 -r 3bc73f5dc785 databases/tRNA_database.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/databases/tRNA_database.fasta Fri Dec 20 14:17:59 2019 +0000
b'@@ -0,0 +1,2790 @@\n+>TTCGATTCCTGTAAGGGATAcca__Glu-2x\n+TTCGATTCCTGTAAGGGATAcca\n+>TTCGAACCCCATAGCCAGCAcca__Leu-3x\n+TTCGAACCCCATAGCCAGCAcca\n+>TTCAATTCCCGTTGTTCACCcca__His-1x\n+TTCAATTCCCGTTGTTCACCcca\n+>TTCAAATCTGATTCCCGGCGcca__Phe-1x\n+TTCAAATCTGATTCCCGGCGcca\n+>TTCGAATCACGGTGGGACCTcca__Gln-1x\n+TTCGAATCACGGTGGGACCTcca\n+>TTCGAATCCCACGGTGGGCAcca__Lys-2x\n+TTCGAATCCCACGGTGGGCAcca\n+>TTCAACTCCTGCTTCGACCTcca__Ile-1x\n+TTCAACTCCTGCTTCGACCTcca\n+>TTCGAGTCTGGGCAACGCCAcca__Val-1x\n+TTCGAGTCTGGGCAACGCCAcca\n+>TTCGACTCCCCGTAGGAGCGcca__Pro-1x\n+TTCGACTCCCCGTAGGAGCGcca\n+>TTCAAATCCCGTCTCCGCAAcca__Met-1x\n+TTCAAATCCCGTCTCCGCAAcca\n+>TTTGACTCCCACTGTGGTCAcca__SeC-1x\n+TTTGACTCCCACTGTGGTCAcca\n+>TCAAATCCTACGTGTTGCAAcca__Pro-1x\n+TCAAATCCTACGTGTTGCAAcca\n+>TTCAAGTCCCAATAACAAAAcca__Glu-1x\n+TTCAAGTCCCAATAACAAAAcca\n+>TCGAATCCCCCCCTCTTCTTcca__Tyr-1x\n+TCGAATCCCCCCCTCTTCTTcca\n+>TTCGAATCCCTCCATGGTGAcca__Arg-1x\n+TTCGAATCCCTCCATGGTGAcca\n+>TTTGAGCCCCGTCAATCTCAcca__Ala-3x\n+TTTGAGCCCCGTCAATCTCAcca\n+>TTTGTTCCCCGGCAACGGTGcca__Asp-1x\n+TTTGTTCCCCGGCAACGGTGcca\n+>TTCAAATCCCGGCAACGGAAcca__Glu-3x\n+TTCAAATCCCGGCAACGGAAcca\n+>TTCAAAGCTCGGCAGTGGAAcca__Tyr-1x\n+TTCAAAGCTCGGCAGTGGAAcca\n+>TTCGATTCCCGGATGGTGCAcca__Gly-1x\n+TTCGATTCCCGGATGGTGCAcca\n+>TTCGAGCCCTATGGTGGGTGcca__Lys-2x\n+TTCGAGCCCTATGGTGGGTGcca\n+>TTCGATTCCCGGTAACGGAGcca__Glu-2x\n+TTCGATTCCCGGTAACGGAGcca\n+>CGAACCCCAGCCACTCCACAcca__Sup-1x\n+CGAACCCCAGCCACTCCACAcca\n+>TTCGATCCTGCATGGAGGCAcca__Thr-2x\n+TTCGATCCTGCATGGAGGCAcca\n+>ATCGAAACCAGGCTTTGATAcca__Met-2x\n+ATCGAAACCAGGCTTTGATAcca\n+>TTCGAATCCTGCCGCTCATGcca__Ser-1x\n+TTCGAATCCTGCCGCTCATGcca\n+>TTCGAGCCCCGCCGGGAGCAcca__Ile-8x\n+TTCGAGCCCCGCCGGGAGCAcca\n+>TTCAAATTTGGGTGCCGCCTcca__Cys-2x\n+TTCAAATTTGGGTGCCGCCTcca\n+>TTCGATTCCCGGCACCTCCAcca__Ala-1x\n+TTCGATTCCCGGCACCTCCAcca\n+>GTTCAACCCTCCTTCTAGCGcca__Asn-1x\n+GTTCAACCCTCCTTCTAGCGcca\n+>TTCGAACCTGGGATTAGACAcca__Val-2x\n+TTCGAACCTGGGATTAGACAcca\n+>TTCAAACCTGGGCGAAGCCAcca__Val-1x\n+TTCAAACCTGGGCGAAGCCAcca\n+>TTTGATCCCCGGCAGCGGCGcca__Asp-1x\n+TTTGATCCCCGGCAGCGGCGcca\n+>TTCGATTCCCGTAAGGGATGcca__Glu-2x\n+TTCGATTCCCGTAAGGGATGcca\n+>GTTCGAATCCGGCGACGCCAcca__Val-1x\n+GTTCGAATCCGGCGACGCCAcca\n+>ATCGATACCCCGCATCTCTAcca__Ala-2x\n+ATCGATACCCCGCATCTCTAcca\n+>TTCGAGCTACATGATGGGTGcca__Lys-2x\n+TTCGAGCTACATGATGGGTGcca\n+>TTCAAATCCAACTCAGCTCAcca__Asp-1x\n+TTCAAATCCAACTCAGCTCAcca\n+>TTCAAATTCCACTGTCGTCAcca__Leu-2x\n+TTCAAATTCCACTGTCGTCAcca\n+>TTCAAATCTCTCTCTCTCTCcca__Ser-1x\n+TTCAAATCTCTCTCTCTCTCcca\n+>TTCGAGTCCCATCGCGATCGcca__Arg-4x\n+TTCGAGTCCCATCGCGATCGcca\n+>TTCAAATCCTGCAGCTGACGcca__Ser-1x\n+TTCAAATCCTGCAGCTGACGcca\n+>GGTTCGAATCCTGCCGACCAcca__Ser-1x\n+GGTTCGAATCCTGCCGACCAcca\n+>TCCGACTCCCGGCAAACGCAcca__Gly-1x\n+TCCGACTCCCGGCAAACGCAcca\n+>ATCGAAACCTGTCTTTGATAcca__Met-1x\n+ATCGAAACCTGTCTTTGATAcca\n+>TTCGAGTCCCACCGTGATCGcca__Arg-11x\n+TTCGAGTCCCACCGTGATCGcca\n+>TTCGAGTCCCAGCGTGGTCGcca__Arg-5x\n+TTCGAGTCCCAGCGTGGTCGcca\n+>TTTAAGACTTGCATGAACCAcca__Thr-1x\n+TTTAAGACTTGCATGAACCAcca\n+>TTCAAGCCCCACGGTGGGTGcca__Lys-4x\n+TTCAAGCCCCACGGTGGGTGcca\n+>TTCGACTCTCAACGAAAGCAcca__Ser-1x_Thr-20x\n+TTCGACTCTCAACGAAAGCAcca\n+>TTCGATCCTGGGTAGCAACAcca__Val-1x\n+TTCGATCCTGGGTAGCAACAcca\n+>TTCGAGTCCCGGCAACGGAGcca__Glu-11x\n+TTCGAGTCCCGGCAACGGAGcca\n+>TTTAAATCCCACAGCCGTCAcca__Leu-1x\n+TTTAAATCCCACAGCCGTCAcca\n+>TTCGATCCCCCGCGTCTCCAcca__Ala-2x\n+TTCGATCCCCCGCGTCTCCAcca\n+>GTTCGAATCCCGAACACACCcca__Pro-1x\n+GTTCGAATCCCGAACACACCcca\n+>TTTGATCCCAGTTGGGTCGTcca__Tyr-1x\n+TTTGATCCCAGTTGGGTCGTcca\n+>TTTGAGCCACACGGTGGGTGcca__Lys-4x\n+TTTGAGCCACACGGTGGGTGcca\n+>TTCAATTCTTGTTGGATGCAcca__Pro-1x\n+TTCAATTCTT
GTTGGATGCAcca\n+>TTTGAACCTGGGCAGAAACAcca__Val-1x\n+TTTGAACCTGGGCAGAAACAcca\n+>TTCGATCCCCGGCAGCGGCGcca__Asp-19x\n+TTCGATCCCCGGCAGCGGCGcca\n+>TTCGAATCCCGGCGAGACCTcca__Gln-1x\n+TTCGAATCCCGGCGAGACCTcca\n+>TTCGAATCCCTCTCCATCCGcca__Ser-9x\n+TTCGAATCCCTCTCCATCCGcca\n+>TACGACTCTCAACGAAAGCAcca__Thr-1x\n+TACGACTCTCAACGAAAGCAcca\n+>TTCGAACCCAGGCTCAGATAcca__Val-1x\n+TTCGAACCCAGGCTCAGATAcca\n+>GAACCCTGTGGACGCTCATAcca__Leu-1x\n+GAACCCTGTGGACGCTCATAcca\n+>TTCGAACCTGGACTTAGACAcca__Val-1x\n+TTCGAACCTGGACTTAGACAcca\n+>TTCAAACCCCAACAGGACTAcca__Met-1x\n+TTCAAACCCCAACAGGACTAcca\n+>TTCAAATCCTACTT'..b'ca\n+>TTCGACTCCCCATGGGAGCGcca__Ala-2x_Pro-3x\n+TTCGACTCCCCATGGGAGCGcca\n+>TTCAAATCCTGCCGCTCACGcca__Ser-1x\n+TTCAAATCCTGCCGCTCACGcca\n+>TTCGAATCCCAACAACCACAcca__His-1x\n+TTCGAATCCCAACAACCACAcca\n+>TTCGAGCCCCACGATGGGTGcca__Lys-6x\n+TTCGAGCCCCACGATGGGTGcca\n+>GTCAAATCCTACAGAGCGTGcca__Trp-1x\n+GTCAAATCCTACAGAGCGTGcca\n+>TTCAAATCCTTCTTGAGGAGcca__Asn-1x\n+TTCAAATCCTTCTTGAGGAGcca\n+>TTCGATTCCTTGCTGGTGCAcca__Gly-1x\n+TTCGATTCCTTGCTGGTGCAcca\n+>TTCAAGCTCCATGGTGGACGcca__Lys-4x\n+TTCAAGCTCCATGGTGGACGcca\n+>GTTTGAGTCCCATTGGGGCGcca__Arg-1x\n+GTTTGAGTCCCATTGGGGCGcca\n+>TTGGATCCTCACTGGGGGCAcca__Thr-1x\n+TTGGATCCTCACTGGGGGCAcca\n+>TTCGATTCCCGTCGCTCGCCcca__Gly-9x\n+TTCGATTCCCGTCGCTCGCCcca\n+>TTTGAATCCTACTTGGGGAGcca__Asn-1x\n+TTTGAATCCTACTTGGGGAGcca\n+>TTCGAATCCTTTTACTCCAGcca__Gln-16x\n+TTCGAATCCTTTTACTCCAGcca\n+>TTCAAGCCCCATGATGGGTGcca__Lys-4x\n+TTCAAGCCCCATGATGGGTGcca\n+>TTCGAACCCCACTGCTGACAcca__Sup-1x_Leu-6x\n+TTCGAACCCCACTGCTGACAcca\n+>TTCGAAACCTGTACAGAGCAcca__Ile-6x\n+TTCGAAACCTGTACAGAGCAcca\n+>TTTGATTCCCGCCGCTCGCCcca__Gly-1x\n+TTTGATTCCCGCCGCTCGCCcca\n+>TTCGAACCCCAGAGACCCCAcca__Pro-1x\n+TTCGAACCCCAGAGACCCCAcca\n+>TTCAATTCCCGGCTGGTGCAcca__Gly-5x\n+TTCAATTCCCGGCTGGTGCAcca\n+>TTCGAATCCGCAATCACTTAcca__Tyr-1x\n+TTCGAATCCGCAATCACTTAcca\n+>TTCGAGTCCTCTTCAAGGCAcca__Phe-1x_Leu-60x\n+TTCGAGTCCTCTTCAAGGCAcca\n+>TTCGAACCTCACCAGGAGCAcca__Lys-1x\n+TTCGAACCTCACCAGGAGCAcca\n+>TTCGAGCCCCACTATGAGCGcca__Lys-2x\n+TTCGAGCCCCACTATGAGCGcca\n+>TTCGAGACCCGTGGTGGGTGcca__Lys-2x\n+TTCGAGACCCGTGGTGGGTGcca\n+>TTCAAATCCCGGCAATGGAAcca__Glu-52x\n+TTCAAATCCCGGCAATGGAAcca\n+>TCAAATCCTATTTGGACGCAcca__Arg-1x\n+TCAAATCCTATTTGGACGCAcca\n+>TTCGAGCCCCACAGTGATTTcca__Glu-1x\n+TTCGAGCCCCACAGTGATTTcca\n+>TTCAAGACCTGCATGGGCCAcca__Ile-2x\n+TTCAAGACCTGCATGGGCCAcca\n+>TTCGACTCCCGGTAAACGCAcca__Gly-1x\n+TTCGACTCCCGGTAAACGCAcca\n+>TTCAAACCTCGGTGGGACCTcca__Gln-1x\n+TTCAAACCTCGGTGGGACCTcca\n+>TTCGATCCTGCGTGAGGGCAcca__Thr-12x\n+TTCGATCCTGCGTGAGGGCAcca\n+>TCGAACCCTGTGGTTGCTAAcca__Leu-1x\n+TCGAACCCTGTGGTTGCTAAcca\n+>TTCGAATCCCAGGCGAGGAAcca__Sup-1x\n+TTCGAATCCCAGGCGAGGAAcca\n+>GTTCGAACCCCGGATACTCAcca__Sup-1x\n+GTTCGAACCCCGGATACTCAcca\n+>TTCGACTCCCCGTGGGAGTGcca__Ala-1x\n+TTCGACTCCCCGTGGGAGTGcca\n+>TTCAAATCCAGCTCGGCCTAcca__Tyr-2x\n+TTCAAATCCAGCTCGGCCTAcca\n+>TTCAAGCCCCACGGTGAGTGcca__Lys-1x\n+TTCAAGCCCCACGGTGAGTGcca\n+>TTCGAACCCCCGCCTCCTATcca__Sup-1x\n+TTCGAACCCCCGCCTCCTATcca\n+>TCGAACCCTGAGGTTGCTAAcca__Leu-1x\n+TCGAACCCTGAGGTTGCTAAcca\n+>ATCGAAACTTGCCTCTGATAcca__Met-2x\n+ATCGAAACTTGCCTCTGATAcca\n+>TTCAAATCCAGCTCGGCCCAcca__Tyr-23x\n+TTCAAATCCAGCTCGGCCCAcca\n+>TTCGGATCCGGTTGGTCGGAcca__His-1x\n+TTCGGATCCGGTTGGTCGGAcca\n+>TTCGAGACCTTCATGGGCCAcca__Ile-1x\n+TTCGAGACCTTCATGGGCCAcca\n+>TTCGAGTCCCACTGTGATCGcca__Arg-2x\n+TTCGAGTCCCACTGTGATCGcca\n+>TTCGATTCATTGCATCTCCAcca__Pro-1x\n+TTCGATTCATTGCATCTCCAcca\n+>TTCAAATCCAATAGTCGGCTcca__Thr-1x\n+TTCAAATCCAATAGTCGGCTcca\n+>TTTGATTTATACTCACTGCAcca__Phe-1x\n+TTTGATTTATACTCACTGCAcca\n+>ATCGAAACCTGGCTCCGATAcca__Met-8x\n+ATCGAAACCTGGCTCCGATAcca\n+>TTCGAATCC
TTCCGTCCCAGcca__Gln-17x\n+TTCGAATCCTTCCGTCCCAGcca\n+>TTCGAGCCCTACTAAGCCCAcca__Met-9x\n+TTCGAGCCCTACTAAGCCCAcca\n+>TTCAAACCCCGGCCACCACCcca__Ile-1x\n+TTCAAACCCCGGCCACCACCcca\n+>GGGTTCGAACTCTCTTTCCGcca__Ser-1x\n+GGGTTCGAACTCTCTTTCCGcca\n+>TTCGAATCCCTCTCTTTTCGcca__Ser-5x\n+TTCGAATCCCTCTCTTTTCGcca\n+>TTCAAACCCCACTGCTGACAcca__Leu-1x\n+TTCAAACCCCACTGCTGACAcca\n+>TTCGATTGCCACAGACGGCGcca__Lys-1x\n+TTCGATTGCCACAGACGGCGcca\n+>TTCGAATCCCTCTCTTTCCAcca__Ser-1x_Gly-1x\n+TTCGAATCCCTCTCTTTCCAcca\n+>TCGATTCCTTCTGGTGCCAAcca__Gln-1x\n+TCGATTCCTTCTGGTGCCAAcca\n+>GGGTTCAATCCCCAACAGCGcca__Asp-1x\n+GGGTTCAATCCCCAACAGCGcca\n+>TTCAACTCTCAATGAAAGCAcca__Ile-1x_Thr-29x\n+TTCAACTCTCAATGAAAGCAcca\n+>TTCGAATCCGCTAGGTCGGAcca__Tyr-5x\n+TTCGAATCCGCTAGGTCGGAcca\n+>TTCGAAACCCGCAGGGACTAcca__Val-3x\n+TTCGAAACCCGCAGGGACTAcca\n+>TTTGAGTCTCTCTCACCCCAcca__Met-1x\n+TTTGAGTCTCTCTCACCCCAcca\n+>TTCGAACCCGGGCTCAGACAcca__Val-40x\n+TTCGAACCCGGGCTCAGACAcca\n+>TTCAAGCCCCACGGTGGGCGcca__Lys-4x\n+TTCAAGCCCCACGGTGGGCGcca\n+>TTCGAACCCGGGCGAAACCAcca__Val-1x\n+TTCGAACCCGGGCGAAACCAcca\n+>TTCGAGACCCACTGGTGCCCcca__Phe-2x\n+TTCGAGACCCACTGGTGCCCcca\n+>TTCGATCCCCAGCATGGTCGcca__Arg-7x\n+TTCGATCCCCAGCATGGTCGcca\n'
diff -r c56807be3b72 -r 3bc73f5dc785 environment.yml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/environment.yml Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,32 @@
+name: repeatexplorer
+channels:
+  - iuc
+  - conda-forge
+  - bioconda
+dependencies:
+  - python 3.7.*
+  - compilers
+  - pyrserve 0.9.1
+  - last >=956
+  - mafft
+  - imagemagick
+  - blast
+  - diamond
+  - blast-legacy
+  - r-igraph
+  - r-data.tree
+  - r-stringr
+  - r-r2html
+  - r-hwriter
+  - r-dt
+  - r-scales
+  - r-plotrix
+  - r-png
+  - r-plyr
+  - r-dplyr
+  - r-optparse
+  - r-dbi
+  - r-rsqlite
+  - r-rserve
+  - bioconductor-biostrings
+
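With this file in place, the environment can be created with standard conda commands (the environment name `repeatexplorer` comes from the `name:` field above):

    conda env create -f environment.yml
    conda activate repeatexplorer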
diff -r c56807be3b72 -r 3bc73f5dc785 fetch_databases.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fetch_databases.sh Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,35 @@
+#!/bin/bash
+#set -euo pipefail
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+echo "protein databases necessary for full clustering analysis"
+echo "are downloaded from bitbucket repository using git"
+echo ""
+cd "$DIR"
+git clone https://bitbucket.org/petrnovak/re_databases.git 2> "$DIR/.fetchdb.log"
+GITEXIT=$?
+if [ "$GITEXIT" -eq 0 ]
+then
+    echo "databases successfully downloaded"
+    ln -sf "$DIR"/re_databases/* "$DIR/databases/"
+fi
+# cat $DIR/.fetchdb.log
+if [ "$GITEXIT" -eq 128 ]
+then
+    if grep -q -F "Authentication failed" "$DIR/.fetchdb.log"
+    then
+        echo "get login credentials from (neumann at umbr.cas.cz)!"
+    fi
+    if grep -q -F "already exist" "$DIR/.fetchdb.log"
+    then
+        echo "repository already exists, updating..."
+        cd "$DIR/re_databases"
+        git pull && ln -f -s "$DIR"/re_databases/* "$DIR/databases/" && find . -type f -exec touch {} +
+    fi
+
+fi
+
+
+
+
+
+
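Typical use (an illustrative run, not part of the changeset): the first invocation clones the re_databases repository and symlinks its contents into databases/; a re-run hits the git exit-code-128 "already exists" branch and pulls updates instead.

    bash fetch_databases.sh
    ls -l databases/    # database files appear as symlinks into re_databases/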
diff -r c56807be3b72 -r 3bc73f5dc785 get_version.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_version.sh Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,8 @@
+#!/bin/sh
+# this file is copied into .git/hooks/post-commit and .git/hooks/post-checkout
+branch=$(git rev-parse --abbrev-ref HEAD)
+shorthash=$(git log --pretty=format:'%h' -n 1)
+revcount=$(git log --oneline | wc -l)
+tag=$(git describe --tags --abbrev=0)
+echo  "version:" ${tag}"-"${revcount}"("$shorthash") branch:" $branch > version_info.txt
+
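Per the comment in the script, it is meant to run as a git hook so that version_info.txt is refreshed on every commit and checkout. A minimal installation sketch:

    cp get_version.sh .git/hooks/post-commit
    cp get_version.sh .git/hooks/post-checkout
    chmod +x .git/hooks/post-commit .git/hooks/post-checkout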
diff -r c56807be3b72 -r 3bc73f5dc785 lib/assembly_tools.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/assembly_tools.py Fri Dec 20 14:17:59 2019 +0000
b'@@ -0,0 +1,221 @@\n+#!/usr/bin/env python3\n+import sys\n+import logging\n+import subprocess\n+import os\n+import tempfile\n+import config\n+import shutil\n+import itertools\n+import pickle\n+import shlex\n+from lib.parallel.parallel import parallel2 as parallel\n+from lib.parallel.parallel import get_max_proc\n+\n+REQUIRED_VERSION = (3, 4)\n+MAX_BUFFER_SIZE = 100000\n+if sys.version_info < REQUIRED_VERSION:\n+    raise Exception("\\n\\npython 3.4 or higher is required!\\n")\n+LOGGER = logging.getLogger(__name__)\n+MAX_FASTA_IN_DIRECTORY = 1000\n+\n+def assembly(sequences, hitsort, clusters_info, assembly_dir,\n+             contigs_file, min_size_of_cluster_for_assembly = 0):\n+    \'\'\'\n+    Runs assembly on sequences (SequenceSet). Assembly is\n+    performed on each cluster separatelly, clusters are taken\n+    from hitsort(Graph)\n+    Cluster_listing - list of Clusters\n+    if cluster.tandem_rank is 1 or 2 - no assembly is performed!!\n+    \'\'\'\n+\n+    # iterate over large clusters, assembly is performed on sequences stored in\n+    # cluster_little[index].fasta_file_full - for annotated clusters\n+    # sequences of small clusters are retrieved from database\n+    fasta_seqs = [\n+        i.fasta_file_full\n+        for i in clusters_info\n+        if not i.tandem_rank in config.SKIP_CAP3_ASSEMBLY_TANDEM_RANKS\n+    ]\n+    prefixes = ["CL{}".format(i.index)\n+                for i in clusters_info\n+                if not i.tandem_rank in config.SKIP_CAP3_ASSEMBLY_TANDEM_RANKS]\n+    LOGGER.info("Number of clusters for assembly: {}".format(hitsort.number_of_clusters))\n+    LOGGER.info("Assembling large clusters")\n+    assembly_files = parallel(cap3worker, fasta_seqs, prefixes)\n+    LOGGER.info("Large clusters assembled")\n+    # some clusters - tanem rank 1 assembled\n+    j = 0\n+    for cl in clusters_info:\n+        if cl.tandem_rank in config.SKIP_CAP3_ASSEMBLY_TANDEM_RANKS:\n+            cl.assembly_files = {i: None for i in config.CAP3_FILES_MAPPING}\n+            consensus_file = cl.dir + "/tarean_contigs.fasta"\n+            cl.assembly_files["{}.{}.contigs"] = consensus_file\n+            with open(cl.dir_tarean + "/tarean_contigs.fasta",\n+                      \'r\') as fin, open(consensus_file, \'w\') as fout:\n+                for line in fin:\n+                    if line[0] == ">":\n+                        line = ">CL{}Contig{}".format(cl.index, line[1:])\n+                    fout.write(line)\n+        else:\n+            cl.assembly_files = assembly_files[j]\n+            j += 1\n+    # assembly of small clusters:\n+    # connection to files were results will be concatenated\n+    LOGGER.info("Assembly of small cluster - making tmp files")\n+\n+    tmp_dir_root = tempfile.mkdtemp()\n+    tmp_subdir = tempfile.mkdtemp(dir=tmp_dir_root)\n+    nproc = get_max_proc()\n+    prefixes = [[] for i in range(nproc)]\n+    tmp_seq_files = [[] for i in range(nproc)]\n+\n+    LOGGER.info("Assembly of small clusters - saving small cluster to tmp files")\n+    seq_dictionary = sequences.toDict()\n+    fasta_count = 0\n+    chunk_counter = itertools.cycle(range(nproc))\n+    for index in range(len(clusters_info) + 1, hitsort.number_of_clusters):\n+        chunk = next(chunk_counter)\n+        ids = hitsort.get_cluster_reads(index)\n+        if len(ids) < min_size_of_cluster_for_assembly:\n+            break\n+        prefixes[chunk].append("CL{}".format(index))\n+        fasta_count += 1\n+        if fasta_count > MAX_FASTA_IN_DIRECTORY:\n+            # create new 
subdir to keep number of files in directory low\n+            fasta_count = 1\n+            tmp_subdir = tempfile.mkdtemp(dir=tmp_dir_root)\n+        fasta_file_name = "{}/{}".format(tmp_subdir, index)\n+        write_seqDict_to_fasta(file_name=fasta_file_name, sequences=seq_dictionary, subset=ids)\n+        tmp_seq_files[chunk].append(fasta_file_name)\n+    del seq_dictionary\n+    LOGGER.info("Assembly of small clusters running")\n+    pickled_fparts_small_contigs = parallel(cap3worker_multiple, tmp_seq_files, prefixes)\n+    LOGGER.info("Assembly o'..b'+        os.system("align_parsing.pl -i {fn} -o {out}.info.fasta -p {out}.profile 2>&1".format(fn=small_aln_file, out=file_base_name))\n+        os.system("select_and_sort_contigs.pl {fn}.info.fasta 5 2>&1".format(fn=file_base_name))\n+        small_contig_file = file_base_name + ".info.fasta"\n+        with open(small_contig_file, \'r\') as fin:\n+            for i in fin:\n+                fout.write(i)\n+    shutil.rmtree(tmp_dir_root)\n+\n+def write_seqDict_to_fasta(file_name, sequences, subset):\n+    with open(file_name, \'w\') as f:\n+        for i in subset:\n+            f.write(">{}\\n{}\\n".format(i, sequences[i]))\n+\n+\n+\n+\n+def cap3worker(seqfile, prefix="cap", cap3args=" -p 80 -o 40 "):\n+    prefix2 = "cap"\n+    cmd = "cap3 " + seqfile + cap3args + " -x " + prefix2\n+    with open(seqfile + "." + prefix2 + ".aln", "w") as aln:\n+        subprocess.check_call(shlex.split(cmd), shell=False, stdout=aln)\n+    # this generate three files\n+    files_dict = {}\n+    for fkey in config.CAP3_FILENAMES:\n+        fn = fkey.format(seqfile, prefix2)\n+        fn_tmp = "{}.tmp".format(fn)\n+        files_dict[fkey] = fn\n+        if config.CAP3_PATTERNS_REPLACE[fkey]:\n+            pattern, replacement = config.CAP3_PATTERNS_REPLACE[fkey]\n+            with open(fn, "r") as con_in, open(fn_tmp, \'w\') as con_out:\n+                for line in con_in:\n+                    con_out.write(line.replace(pattern, replacement.format(\n+                        prefix)))\n+            os.rename(fn_tmp, fn)\n+    \n+    # make new meaningful names here\n+\n+    for fkey in config.CAP3_FILES_GOODNAMES:\n+        config.CAP3_FILES_GOODNAMES[fkey]\n+        fn_goodname = os.path.dirname(files_dict[fkey]) + "/" + config.CAP3_FILES_GOODNAMES[fkey]\n+        os.rename(files_dict[fkey], fn_goodname)\n+        files_dict[fkey] = fn_goodname\n+\n+    aln_file = files_dict["{}.{}.aln"]\n+    file_base_name = aln_file[:-4]\n+    os.system("align_parsing.pl -i {fn} -o {out}.info.fasta -p {out}.profile 2>&1".format(fn=aln_file, out=file_base_name))\n+    os.system("select_and_sort_contigs.pl {fn}.info.fasta 5".format(fn=file_base_name))\n+    # TODO -add new file to files_dict\n+    # replace simple fasta with info.fasta\n+    files_dict["{}.{}.contigs"] = file_base_name + ".info.fasta"\n+    return files_dict\n+\n+def cap3worker_multiple(many_seqfile, many_prefixes, cap3args=" -p 80 -o 40 "):\n+    \'\'\'\n+    purpose of this script is to run multiple assemblies within single process\n+    avoiding running high number of short parallel subprocesses\n+    As starting subprocess for each cap3 assembly was very ineffective,\n+    all ap3 commands are written to single file and run using single subprocess\n+    command -\n+    \'\'\'\n+    cmd_file = tempfile.NamedTemporaryFile(mode="w",delete=False).name\n+    with open(cmd_file,\'w\') as cmdf:\n+        for seqfile, prefix in zip(many_seqfile, many_prefixes):\n+            cmd = "cap3 " + seqfile + 
cap3args + " -x " + prefix + " > " + seqfile + "." + prefix + ".aln\\n"\n+            cmdf.write(cmd)\n+    os.system("sh "+ cmd_file)\n+    # collect results:\n+    files_dict_many = []\n+    for seqfile, prefix in zip(many_seqfile, many_prefixes):\n+        files_dict = {}\n+        for fkey in config.CAP3_FILENAMES:\n+            fn = fkey.format(seqfile, prefix)\n+            fn_tmp = "{}.tmp".format(fn)\n+            files_dict[fkey] = fn\n+            if config.CAP3_PATTERNS_REPLACE[fkey]:\n+                pattern, replacement = config.CAP3_PATTERNS_REPLACE[fkey]\n+                with open(fn, "r") as con_in, open(fn_tmp, \'w\') as con_out:\n+                    for line in con_in:\n+                        con_out.write(line.replace(pattern, replacement.format(\n+                            prefix)))\n+                os.rename(fn_tmp, fn)\n+        files_dict_many.append(files_dict)\n+    # this is too large to be return directly - use picking\n+    f = tempfile.NamedTemporaryFile(delete=False)\n+    os.unlink(cmd_file)\n+    pickle.dump(files_dict_many,file=f)\n+    return f.name\n+    \n+\n'
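The batching in cap3worker_multiple avoids spawning one subprocess per assembly: every cap3 invocation is written into a temporary command file, which is then executed with a single `sh` call. The generated file has this shape (cluster indices and temp paths are illustrative):

    cap3 /tmp/tmpXYZ/101 -p 80 -o 40 -x CL101 > /tmp/tmpXYZ/101.CL101.aln
    cap3 /tmp/tmpXYZ/102 -p 80 -o 40 -x CL102 > /tmp/tmpXYZ/102.CL102.aln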
diff -r c56807be3b72 -r 3bc73f5dc785 lib/config.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/config.R Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,11 @@
+#!/usr/bin/env Rscript
+TANDEM_RANKS = c(
+      "Putative satellites (high confidence)" =  1,
+      "Putative satellites (low confidence)" = 2,
+      "Putative LTR elements" = 3,
+      "rDNA" = 4,
+      "Other" = 0
+)
+# inverted - key value
+RANKS_TANDEM = names(TANDEM_RANKS)
+names(RANKS_TANDEM) = TANDEM_RANKS
diff -r c56807be3b72 -r 3bc73f5dc785 lib/create_annotation.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/create_annotation.R Fri Dec 20 14:17:59 2019 +0000
b'@@ -0,0 +1,1384 @@\n+#!/usr/bin/env Rscript\n+Sys.setlocale("LC_CTYPE", "en_US.UTF-8")  # this is necessary for handling unicode characters (data.tree package)\n+suppressPackageStartupMessages(library(data.tree))\n+suppressPackageStartupMessages(library(stringr))\n+suppressPackageStartupMessages(library(R2HTML))\n+suppressPackageStartupMessages(library(hwriter))\n+suppressPackageStartupMessages(library(DT))\n+suppressPackageStartupMessages(library(tools))\n+suppressPackageStartupMessages(library(scales))\n+suppressPackageStartupMessages(library(igraph))\n+suppressPackageStartupMessages(library(plotrix))\n+suppressPackageStartupMessages(library(png))\n+\n+source("htmlheader.R")\n+source("config.R")  # load tandem ranks info\n+source("utils.R")  \n+DT_OPTIONS = options = list(pageLength = 1000, lengthMenu = c(10,50,100,1000,5000,10000))\n+WD = getwd()   # to get script directory when run from Rserve\n+HTMLHEADER = htmlheader  ## header (character) loaded from htmlheader.R\n+htmlheader = gsub("Superclusters summary","TAREAN summary", htmlheader)\n+\n+evaluate_LTR_detection = function(f){\n+\tNO_LTR=NULL\n+\tif (length(readLines(f)) == 11 ){\n+\t\treturn(NO_LTR)\n+\t}\n+\tdf=read.table(f,as.is=TRUE,sep="\\t", skip=11,fill=TRUE)\n+\tif (ncol(df) != 23){\n+\t\t#df is smaller if no pbs is detected!\n+\t\treturn(NO_LTR)\n+\t}\n+  df=df[!df$V13 == "",,drop=FALSE]\n+\tif (nrow(df)==0){\n+\t\treturn(NO_LTR)\n+\t}\n+\t# criteria:\n+\tdf_part=df[df$V15 >=12 & df$V20 == 23 & df$V21<df$V20,,drop=FALSE]\n+\tif (nrow(df_part) == 0){\n+\t\treturn(NO_LTR)\n+\t}\n+  PBS_type = gsub("_","",(str_extract_all(df_part$V13, pattern="_([A-Z][a-z]{2})", simplify=TRUE))) %>%\n+      paste(collapse=" ")\n+\treturn(PBS_type)\n+}\n+\n+\n+\n+## annotate superclusters\n+select_reads_id = function(index, search_type = c("cluster","supercluster")) {\n+    ## select read if base on the supecluster index need database connection\n+    ## HITSORTDB!\n+    search_type = match.arg(search_type)\n+    x = dbGetQuery(HITSORTDB,\n+                   paste0("SELECT vertexname FROM vertices WHERE vertexindex IN ",\n+                          "(SELECT vertexindex  FROM communities ",\n+                          "WHERE ", search_type,"=\\"", index,\n+                          "\\")"))\n+    return(x$vertexname)\n+}\n+\n+\n+get_reads_annotation = function(reads_id) {\n+    ## select annotation from tables in SEQDB which has name in format *_database\n+    annot_types = grep("_database", dbListTables(SEQDB), value = TRUE)\n+    annot = list()\n+    for (i in annot_types) {\n+        query = paste0("SELECT * FROM ", i, " WHERE name IN (", paste0("\\"", reads_id, \n+            "\\"", collapse = ", "), ")")\n+        annot[[i]] = dbGetQuery(SEQDB, query)\n+    }\n+    return(annot)\n+}\n+\n+supercluster_size = function(supercluster) {\n+    x = dbGetQuery(HITSORTDB, paste0("SELECT count(*) FROM vertices WHERE vertexindex IN ", \n+        "(SELECT vertexindex  FROM communities ", "WHERE supercluster=\\"", supercluster, \n+        "\\")"))\n+    return(x$"count(*)")\n+}\n+\n+\n+cluster_annotation = function(cluster, search_type = c("cluster", "supercluster")){\n+    ## searcheither for cluster or supercluster annotation\n+    ## read annotation from sqlite databases database is access though SEQDB (sequence\n+    ## annotation) and HITSORTDB - clustering information\n+    search_type = match.arg(search_type)\n+    reads_id = select_reads_id(cluster, search_type)\n+    annot = get_reads_annotation(reads_id)\n+    
return(annot)\n+}\n+\n+get_tarean_info = function(cluster, search_type = c("cluster", "supercluster")){\n+    search_type = match.arg(search_type)\n+    if (search_type == "cluster") {\n+        search_type = "[index]"\n+    }\n+    tarean_info = dbGetQuery(HITSORTDB,\n+                         paste0(\n+                             "SELECT [index], supercluster, satellite_probability, tandem_rank, size_real, satellite FROM cluster_info WHERE ",\n+                             search_type, " = ", cluster))\n+    nhits = sum(tarean_info$size_real[tarean_info$tandem_rank %in% 1:2])\n+    proportion = nhi'..b'rs$color_table)\n+    domains_detected = length(vertex_colors$legend) > 0\n+    par(mar = c(0, 0, 0, 0))\n+    plot.new()\n+    if (domains_detected){\n+        # domains found\n+        legend("topleft", col=vertex_colors$legend,\n+               legend = names(vertex_colors$legend),\n+               pch = 15, cex = 0.7)\n+    }\n+    dev.off()\n+\n+    HTML.title("protein domains:", HR=4, file = clinfo$html_report_main)\n+    if (!domains_detected){\n+        HTML("No protein domains detected", file = clinfo$html_report_main)\n+    }\n+    HTML("protein domains:", HR=4, file = clinfo$html_report_main)\n+    html_insert_image(\n+        img_file = fs_relative$graph_domains,\n+        htmlfile = clinfo$html_report_main)\n+\n+    #############################################################################\n+    if (nrow(annot_summary) == 0){\n+    HTML.title("Reads annotation summary", HR = 3, file = clinfo$html_report_main)\n+        HTML("No similarity hits to repeat databases found", file = clinfo$html_report_main)\n+    }else{\n+        HTML(annot_summary, file = clinfo$html_report_main, align = "left")\n+    }\n+\n+\n+    ## similarity and mate cluster\n+    mate_clusters = get_cluster_connection_info(index, search_type="pair")\n+    similar_clusters = get_cluster_connection_info(index, search_type="similarity")\n+    ## report mate and similarity clusters\n+    if (!is.null(similar_clusters)){\n+        HTML.title("clusters with similarity:", file =clinfo$html_report_main, HR = 3)\n+        cat(df2html(\n+            similar_clusters,\n+            header=c("Cluster","Number of similarity hits"),\n+            sort_col = "N", scroling = TRUE\n+            ),\n+        file =clinfo$html_report_main, append=TRUE)\n+    }\n+    if (!is.null(mate_clusters)){\n+        HTML.title("clusters connected through mates:", file =clinfo$html_report_main, HR = 3)\n+        cat(df2html(\n+            mate_clusters[,c(\'cl\',\'N\',\'k\')],\n+            header = c(\'Cluster\',\'Number of shared<br> read pairs\',\'k\'),\n+            sort_col = "N", scroling = TRUE\n+            ),\n+        file = clinfo$html_report_main,append = TRUE\n+        )\n+\n+        ## create base graph images - it will serve as background for\n+        ## mate clusters plots\n+        png(fs$graph_base,\n+            width = PNGWIDTH, height = PNGHEIGHT, pointsize = PS)\n+        par(mar=c(0,0,0,0),xaxs="i",yaxs="i")\n+        plotg(GL$G,GL$L, col = "#00000050")\n+        dev.off()\n+        ## load base as raster image\n+        base_image = readPNG(fs$graph_base)\n+\n+        for (i in order(mate_clusters$N, decreasing = TRUE)){\n+            mate_ids = unlist(strsplit(mate_clusters$ids[[i]],split=","))\n+            ## print only graph above MAX_N mates\n+            if (length(mate_ids) < MAX_N){  # TODO  - use constant\n+                next\n+            }\n+            png(sprintf(fs$graph_mates,index, 
mate_clusters$cl[i]),\n+                width = PNGWIDTH, height = PNGHEIGHT, pointsize = PS)\n+            color_mate = gsub(".$","",V(GL$G)$name) %in% mate_ids %>%\n+                ifelse("#FF0000FF", "#000000AA")\n+            par(mar=c(0,0,0,0),xaxs="i",yaxs="i")\n+            plot(range(GL$L[,1]), range(GL$L[,2]), type = "n", xlab = "", ylab = "", axes = FALSE,\n+                 main = paste0("CL",index," ----> CL",mate_clusters$cl[i]))\n+            rasterImage(base_image,\n+                        range(GL$L[,1])[1], range(GL$L[,2])[1],\n+                        range(GL$L[,1])[2], range(GL$L[,2])[2]\n+                        )\n+            points(GL$L[,1:2], col = color_mate,pch=18,cex=.8)\n+            dev.off()\n+            title = paste0("CL",index," ----> CL",mate_clusters$cl[i])\n+            footer = paste0("No. of shared pairs: :", mate_clusters$N[i])\n+            html_insert_floating_image(\n+                img_file = sprintf(fs_relative$graph_mates, index, mate_clusters$cl[i]),\n+                htmlfile = clinfo$html_report_main, width = 200,\n+                title = title, footer = footer\n+            )\n+        }\n+    }\n+}\n'
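The R code above reads annotations from two SQLite handles: SEQDB (per-read *_database tables) and HITSORTDB (vertices, communities, cluster_info). For orientation, the same lookups can be issued manually with the sqlite3 CLI; the database file name below is hypothetical, the table and column names come from the queries in the script:

    sqlite3 hitsort.sqlite \
      'SELECT vertexname FROM vertices WHERE vertexindex IN
         (SELECT vertexindex FROM communities WHERE cluster = "1");'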
diff -r c56807be3b72 -r 3bc73f5dc785 lib/detect_LTR_insertion_sites.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/detect_LTR_insertion_sites.pl Fri Dec 20 14:17:59 2019 +0000
b'@@ -0,0 +1,390 @@\n+#!/usr/bin/env perl\n+\n+# parses ACE files of assembled repeats and detects potential\n+# LTR borders/insertion sites of LTR-retroelements\n+\n+# "site" is a region (size of $window) including TG or CA\n+# "out" is a region adjacent to the site, presumably representing insertion sites\n+\n+# this is RepeatExplorer version of "detect_insertion_sites_LTRs.pl"\n+# -m default set to 10 \n+\n+use Getopt::Std;\n+\n+\n+\n+\n+getopt(\'iowsmdrp\');\n+if ($opt_i) {\n+\t$infile = $opt_i;\n+} else {\n+\tdie "-i input_file_name missing\\n";\n+}\n+\n+if ($opt_p) {\n+    $db_PBS = $opt_p;\n+} else {\n+    die "-p PBS database is missing\\n";\n+}\n+\n+\n+\n+\n+if ($opt_o) {\n+\t$outfile = $opt_o;\n+} else {\n+\tdie "-o output_file_name missing\\n";\n+}\n+if ($opt_w) {\n+\t$window = $opt_w;\n+} else {\n+\t$window = 7;\n+\tprint "window size not set, using default ($window)\\n";\n+}\n+if ($opt_s) {\n+\t$min_site_depth = $opt_s;   # minimal average read depth (over $window) required for the site\n+} else {\n+\t$min_site_depth = 10;\n+\tprint "min_site_depth not set, using default ($min_site_depth)\\n";\n+}\n+if ($opt_m) {\n+\t$min_out_masked = $opt_m;  # minimal average number of masked reads outside the site (over $window)\n+} else {\n+\t$min_out_masked = 10;\n+\tprint "min_out_masked not set, using default ($min_out_masked)\\n";\n+}\n+if ($opt_d) {\n+\t$min_masked_fold_diff = $opt_d;  # how many times should the proportion of masked reads "out" be higher than in "site"\n+} else {\n+\t$min_masked_fold_diff = 3;\n+\tprint "min_masked_fold_diff not set, using default ($min_masked_fold_diff)\\n";\n+}\n+if ($opt_x) {\n+\t$max_char_to_masked = $opt_x;  # max fold difference between depth in "site" and masked depth "out"\n+} else {\n+\t$max_char_to_masked = 10;\n+\tprint "max_char_to_masked not set, using default ($max_char_to_masked)\\n"; \n+}\n+if ($opt_r) {\n+\t$extract_region = $opt_r;\n+} else {\n+\t$extract_region = 30;\n+\tprint "extract_region not set, using default ($extract_region)\\n";\n+}\n+\n+# main\n+$out_table = $outfile;\n+$out_LTR   = "$outfile.LTR";\n+$out_ADJ   = "$outfile.ADJ";\n+open (IN,$infile) or die;\n+open (OUT,">$out_table") or die;\n+open (LTR,">$out_LTR") or die;  # LTR end regions as fasta seq; all are converetd to ....CA (so TG... regions are reverse-complemented)\n+open (ADJ,">$out_ADJ") or die;  # regions adjacent to LTR ends; if LTR end is rev-complemented, so is its corresponding adjacent region\n+print OUT "#Parameters:\\n";\n+print OUT "#infile\\t$infile\\n#outfile\\t$outfile\\n#window\\t$window\\n#min_site_depth\\t$min_site_depth\\n";\n+print OUT "#min_out_masked\\t$min_out_masked\\n#min_masked_fold_diff\\t$min_masked_fold_diff\\n#max_char_to_masked\\t$max_char_to_masked\\n#extract_region\\t$extract_region\\n\\n";\n+print OUT "CL\\tcontig\\tTG/CA\\tposition\\tsite\\tsite_depth\\tout_masked\\tmasked_ratio_site\\tmasked_ratio_out\\tregion_in\\tregion_out\\tblast PBS\\n";\n+print "Analyzing ACE file...\\n";\n+$prev = 0;\n+while ($radek = <IN>) {\n+\t$contig_found = &read_contig;\n+\tif ($contig_found) {\n+\t\tif ($cl > $prev) {\n+\t\t\t$prev = $cl;\n+\t\t}\n+\t\t&reconstruct_assembly;\n+\t\t&find_sites;\n+\t}\n+}\n+close IN;\n+close OUT;\n+close LTR;\n+close ADJ;\n+print "Running blast against tRNA database...\\n";\n+&add_PBS_info;    # detects similarities of sequences in ADJ to tRNA database (!!! 
reads ADJ and $out_table !!!)\n+\n+$error = system("rm $out_table");\n+if ($error) {\n+\tprint "Error removing $out_table\\n";\n+}\n+\n+sub read_contig {\n+\tmy ($reads_found,$read_id);\n+\t# global variables\n+\t$cl = 0;\n+\t$contig = 0;\n+\t$cont_length = 0;\n+\t$reads = 0;   # number of reads\n+\t$cons = "";   # contig consensus (including gaps *)\n+\t%read_starts = ();  # starts of reads within assembly\n+\t%read_lengths = (); # length of reads in assembly (may contain gaps)\n+\t%read_from = ();    # start of non-masked part of read sequence (relative to the read)\n+\t%read_to = ();      # end of non-masked part of read sequence   \n+\t\n+\tdo {\n+\t\tif ($radek =~/^CO CL(\\d+)Contig(\\d+) (\\d+) (\\d+)/) {\n+\t\t\t$cl = $1; $contig = $2; $cont_length = $3; $reads = $4;\n+\t\t\twhile ($radek = <IN> and length($radek) > 1) {\n'..b'o_site\\t$masked_ratio_out\\t";\n+\t\t\t\t\t$region = "";\n+\t\t\t\t\tfor ($f=$pos;$f<=$assembly_length;$f++) {\n+\t\t\t\t\t\tif ($assembly_seq[$f] ne "*") {\n+\t\t\t\t\t\t\t$region .= $assembly_seq[$f];\n+\t\t\t\t\t\t}\n+\t\t\t\t\t\tif (length($region) == $extract_region) {\n+\t\t\t\t\t\t\t$f = $assembly_length;  # terminate cycle\n+\t\t\t\t\t\t}\n+\t\t\t\t\t}\n+\t\t\t\t\tprint OUT "$region\\t";\n+\t\t\t\t\tprint LTR ">CL",$cl,"c".$contig."_TG_$pos\\n";\n+\t\t\t\t\t$region = &revcompl($region);\n+\t\t\t\t\tprint LTR "$region\\n";\n+\t\t\t\t\t$region = "";\n+\t\t\t\t\tfor ($f=$pos-1;$f>0;$f=$f-1) {\n+\t\t\t\t\t\tif ($assembly_seq[$f] ne "*") {\n+\t\t\t\t\t\t\t$region = $assembly_seq[$f].$region;\n+\t\t\t\t\t\t}\n+\t\t\t\t\t\tif (length($region) == $extract_region) {\n+\t\t\t\t\t\t\t$f = 0;  # terminate cycle\n+\t\t\t\t\t\t}\n+\t\t\t\t\t}\n+\t\t\t\t\tprint OUT "$region\\n";\n+\t\t\t\t\tprint ADJ ">CL",$cl,"c".$contig."_TG_$pos\\n";\n+\t\t\t\t\t$region = &revcompl($region);\n+\t\t\t\t\tprint ADJ "$region\\n";\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t}\n+\t\n+\tforeach $pos (@CA) {\n+\t\tif ($pos-$window+1 > 0 and $pos+$window <= $assembly_length) {\n+\t\t\t$site_sum_char = 0; $site_sum_masked = 0; $site_seq = "";\n+\t\t\tfor ($f=$pos-$window+1;$f<=$pos;$f++) {\n+\t\t\t\t$site_sum_char += $assembly_char[$f];\n+\t\t\t\t$site_sum_masked += $assembly_masked[$f];\n+\t\t\t\t$site_seq .= $assembly_seq[$f];\n+\t\t\t}\n+\t\t\t$out_sum_char = 0; $out_sum_masked = 0;\n+\t\t\tfor ($f=$pos+1;$f<=$pos+$window;$f++) {\n+\t\t\t\t$out_sum_char += $assembly_char[$f];\n+\t\t\t\t$out_sum_masked += $assembly_masked[$f];\n+\t\t\t}\n+\t\t\t$site_depth = sprintf("%0.1f",$site_sum_char/$window);   # average read (unmasked) depth over the site\n+\t\t\t$out_masked = sprintf("%0.1f",$out_sum_masked/$window);  # average number of masked reads outside the site\n+\t\t\t$masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char));\n+\t\t\t$masked_ratio_out  = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char));\n+\t\t\tif ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) {\n+\t\t\t\tif ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) {\n+\t\t\t\t\tprint OUT "$cl\\t$contig\\tCA\\t$pos\\t$site_seq\\t$site_depth\\t$out_masked\\t$masked_ratio_site\\t$masked_ratio_out\\t";\n+\t\t\t\t\t$region = "";\n+\t\t\t\t\tfor ($f=$pos;$f>0;$f=$f-1) {\n+\t\t\t\t\t\tif ($assembly_seq[$f] ne "*") {\n+\t\t\t\t\t\t\t$region = $assembly_seq[$f].$region;\n+\t\t\t\t\t\t}\n+\t\t\t\t\t\tif (length($region) == $extract_region) {\n+\t\t\t\t\t\t\t$f = 0;  # terminate 
cycle\n+\t\t\t\t\t\t}\n+\t\t\t\t\t}\n+\t\t\t\t\tprint OUT "$region\\t";\n+\t\t\t\t\tprint LTR ">CL",$cl,"c".$contig."_CA_$pos\\n";\n+\t\t\t\t\tprint LTR "$region\\n";\n+\t\t\t\t\t$region = "";\n+\t\t\t\t\tfor ($f=$pos+1;$f<=$assembly_length;$f++) {\n+\t\t\t\t\t\tif ($assembly_seq[$f] ne "*") {\n+\t\t\t\t\t\t\t$region .= $assembly_seq[$f];\n+\t\t\t\t\t\t}\n+\t\t\t\t\t\tif (length($region) == $extract_region) {\n+\t\t\t\t\t\t\t$f = $assembly_length;  # terminate cycle\n+\t\t\t\t\t\t}\n+\t\t\t\t\t}\n+\t\t\t\t\tprint OUT "$region\\n";\n+\t\t\t\t\tprint ADJ ">CL",$cl,"c".$contig."_CA_$pos\\n";\n+\t\t\t\t\tprint ADJ "$region\\n";\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t}\n+}\n+\n+sub add_PBS_info {\n+\tmy ($pbs_blast_command,@pol,$rad,$prev_query,@table,$tab_length);\n+\t\n+\t$pbs_blast_command = "blastall -p blastn -d $db_PBS -i $out_ADJ -m 8 -b 1 -e 1 -W 7 -F F";\n+\t\n+\t@table = ();\n+\topen (TAB,$out_table) or die;\n+\twhile ($rad = <TAB>) {\n+\t\tpush(@table,$rad);\n+\t\t$tab_length++;\n+\t}\n+\tclose TAB;\n+\t\n+\topen (BLAST,"$pbs_blast_command |") or die;\n+\t$prev_query = "";\n+\twhile ($rad = <BLAST>) {\n+\t\tif ($rad =~/^CL(\\d+)c(\\d+)_(TG|CA)_(\\d+)\\t\\S+\\t\\S+\\t/) {   \n+\t\t\tif ("$1\\t$2\\t$3\\t$4" ne $prev_query) {           # to exclude additional HSPs from the same query/subject pair\n+\t\t\t\tfor ($f=0;$f<$tab_length;$f++) {       \n+\t\t\t\t\t@pol = split(/\\t/,$table[$f]);\n+\t\t\t\t\tif ($pol[0] eq "$1" and $pol[1] eq "$2" and $pol[2] eq "$3" and $pol[3] eq "$4") {\n+\t\t\t\t\t\tchomp($table[$f]);\n+\t\t\t\t\t\t$table[$f] .= "\\t$rad";\n+\t\t\t\t\t\t$f = $tab_length;  # terminate cycle\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t\t$prev_query = "$1\\t$2\\t$3\\t$4";\n+\t\t\t}\n+\t\t}\n+\t}\n+\tclose BLAST;\n+\t\n+\topen (TAB_WITH_BLAST,">$out_table.with_PBS_blast.csv") or die;\n+\tfor ($f=0;$f<$tab_length;$f++) {\n+\t\tprint TAB_WITH_BLAST $table[$f];\n+\t}\n+\tclose TAB_WITH_BLAST;\n+}\n+\n+\n+\n+\n'
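From the getopt block, -i (ACE input), -o (output name) and -p (PBS/tRNA blast database) are mandatory, while the window and threshold options fall back to printed defaults (note that 'x' is absent from the getopt() string, so -x effectively always uses its default). A minimal invocation sketch with placeholder file names:

    perl detect_LTR_insertion_sites.pl -i assembly.ace -o ltr_sites -p tRNA_database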
diff -r c56807be3b72 -r 3bc73f5dc785 lib/documentation.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/documentation.html Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,342 @@
(diff body truncated in this view: lib/documentation.html, 342 lines added. It is the Org-mode XHTML export of lib/documentation.org below, i.e. an inline CSS header generated by Org followed by the same cluster annotation table, supercluster annotation table, tandem repeat analysis and TAREAN k-mer analysis documentation.)
diff -r c56807be3b72 -r 3bc73f5dc785 lib/documentation.org
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/documentation.org Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,86 @@
+#+TITLE: RepeatExplorer documentation
+#+HTML_HEAD_EXTRA: <link rel="stylesheet" type="text/css" href="style1.css" />
+#+LANGUAGE: en
+#+OPTIONS: html-postamble:nil
+
+#+begin_export html 
+<h1 id="clust"> Cluster annotation table </h1>
+#+end_export
+
+- Cluster :: Cluster index; contains a link to the individual cluster report.
+- Supercluster :: Supercluster index; contains a link to the individual supercluster report.
+- Proportion[%] :: Proportion of reads in the cluster relative to the total number of analyzed sequences.
+- Proportions adjusted[%] :: The adjusted genome proportion can differ from the unadjusted value if the option "Perform automatic filtering of abundant satellite repeats" was enabled. Sequences belonging to highly abundant satellites are partially removed from the all-to-all comparison and clustering, so the genome proportion estimate for these satellites is underestimated. The adjusted genome proportion provides a corrected estimate of the 'real' genomic proportion of the particular satellite repeat (see the sketch after this list).
+- Number of reads :: Number of reads in the cluster.
+- Graph layout :: Preview of the graph-based visualization of the read cluster. A more detailed graph layout can be found in the individual cluster reports.
+- Similarity hits :: Summarizes the proportion of reads in the cluster with similarity to the REXdb or DNA reference databases. Only hits with a proportion above 0.1% are shown.
+- LTR detection :: Shows whether an LTR with a primer binding site was detected in the contig assembly and which type of tRNA is used for priming.
+- Satellite probability :: Empirical probability that the cluster represents a satellite repeat.
+- TAREAN classification :: TAREAN divides clusters into five categories described in box 9.
+- Consensus length :: For clusters analyzed by the TAREAN module, the best estimate of the monomer length is shown.
+- Consensus :: The best consensus estimate reconstructed by the TAREAN module.
+- Kmer analysis :: If the cluster was analyzed by TAREAN, this field contains a link to the detailed TAREAN k-mer analysis (box 10).
+- Connected component index C, Pair completeness index P, Kmer coverage :: Statistics reported by the TAREAN module.
+- |V| :: Number of vertices of the graph.
+- |E| :: Number of edges of the graph.
+
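+The relation between the two proportion columns can be illustrated with a
+minimal Python sketch (the function and argument names are hypothetical; the
+actual adjustment is computed by the pipeline from the number of reads
+removed during satellite filtering):
+
+#+begin_src python
+def proportion(reads_in_cluster, reads_analyzed):
+    """Proportion[%]: (number of reads in the cluster / number of
+    analyzed reads) x 100."""
+    return 100.0 * reads_in_cluster / reads_analyzed
+
+def proportion_adjusted(reads_in_cluster, reads_analyzed, reads_removed):
+    """Assumed correction: reads of the abundant satellite that were
+    removed before clustering are added back to both counts."""
+    return 100.0 * (reads_in_cluster + reads_removed) \
+           / (reads_analyzed + reads_removed)
+#+end_src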
+
+#+begin_export html 
+<h1 id="superclust"> Supercluster annotation table </h1>
+#+end_export
+
+ - Supercluster :: Supercluster index.
+ - Reads :: Number of reads in the supercluster.
+ - Automatic classification :: Result of the automatic supercluster classification.
+ - Similarity hits :: Numbers of similarity hits against REXdb and the DNA database are shown in the classification tree structure, together with the number of reads assigned to putative satellite clusters and information about LTR/PBS detection. Parts of the tree without any evidence are pruned off.
+ - TAREAN annotation :: Clusters that are part of the supercluster and classified by TAREAN as putative satellites are listed here.
+ - Clusters :: Hyperlinked list of clusters that are part of the supercluster.
+
+#+begin_export html 
+<h1 id="tra"> Tandem repeat analysis </h1>
+#+end_export
+
+TAREAN divides clusters into five categories with corresponding files in the
+archive:
+
+    - High-confidence satellites with consensus sequences in file ~TR_consensus_rank_1_.fasta~
+    - Low-confidence satellites with consensus sequences in file ~TR_consensus_rank_2_.fasta~
+    - Putative LTR elements with consensus sequences in file ~TR_consensus_rank_3_.fasta~
+    - rDNA with consensus in ~TR_consensus_rank_4_.fasta~
+    - other clusters – these clusters are not reconstructed by TAREAN because no potential tandem-like structure was found.
+
+Summary tables in the TAREAN HTML report include the following information:
+
+    - Cluster :: Cluster identifier.
+    - Proportion[%] :: (Number of sequences in the cluster / Number of sequences in the clustering) x 100%.
+    - Proportion adjusted[%] :: Adjusted genome proportion; see the cluster annotation table above.
+    - Number of reads :: Number of reads in the cluster.
+    - Satellite probability :: Empirical probability estimate that the cluster sequences are derived from a satellite repeat. The estimate is based on the analysis of manually annotated and experimentally validated satellite repeats.
+    - Consensus length :: Length of the reconstructed monomer consensus.
+    - Consensus :: The consensus sequence is the outcome of the k-mer based analysis and represents the most probable satellite monomer sequence; alternative consensus sequences are included in the individual cluster reports.
+    - Graph layout :: Graph-based visualization of similarities among sequence reads.
+    - Kmer analysis :: Hyperlink to the individual cluster TAREAN k-mer report (fig X, box 10).
+    - Connected component index C :: Proportion of nodes of the graph which are part of the largest strongly connected component.
+    - Pair completeness index P :: Proportion of reads with an available mate pair within the same cluster (indices C and P are illustrated in the sketch after this list).
+    - Kmer coverage :: Sum of the relative frequencies of all k-mers used for consensus sequence reconstruction.
+    - |V| :: Number of vertices of the graph.
+    - |E| :: Number of edges of the graph.
+    - PBS score :: Primer binding site detection score.
+    - Similarity hits :: Similarity hits from blastn/blastx searches against built-in databases of known sequences. By default this contains hits to the built-in database, which includes rDNA, plastid and mitochondrial sequences. If TAREAN was run within the RepeatExplorer2 pipeline, it also contains information about similarity hits against the REXdb database.
+
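+A minimal sketch of the two graph indices, assuming the read-similarity graph
+is given as a node-to-neighbours mapping (treated here as undirected) and that
+mate pairs share a read-name prefix with /1 and /2 suffixes; both assumptions
+are illustrative rather than the pipeline's actual implementation:
+
+#+begin_src python
+from collections import deque
+
+def connected_component_index(graph):
+    """C = size of the largest connected component / |V|,
+    with graph given as {node: [neighbours]}."""
+    seen, largest = set(), 0
+    for start in graph:
+        if start in seen:
+            continue
+        queue, size = deque([start]), 0
+        seen.add(start)
+        while queue:
+            node = queue.popleft()
+            size += 1
+            for neighbour in graph[node]:
+                if neighbour not in seen:
+                    seen.add(neighbour)
+                    queue.append(neighbour)
+        largest = max(largest, size)
+    return largest / len(graph)
+
+def pair_completeness(read_names):
+    """P = proportion of reads whose mate (same prefix, /1 vs /2
+    suffix) is present in the same cluster."""
+    counts = {}
+    for name in read_names:
+        prefix = name.rsplit("/", 1)[0]
+        counts[prefix] = counts.get(prefix, 0) + 1
+    paired = sum(1 for name in read_names
+                 if counts[name.rsplit("/", 1)[0]] == 2)
+    return paired / len(read_names)
+#+end_src
+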
+Individual cluster TAREAN reports contain further variants of the consensus
+sequences sorted by k-mer coverage score. For each consensus, the corresponding
+de Bruijn graph representation and sequence logo are shown.
+
+#+begin_export html 
+<h1 id="kmer"> TAREAN k-mer analysis report </h1>
+#+end_export
+
+The TAREAN module generates a k-mer analysis report for each cluster assigned to the putative satellite, rDNA or putative LTR category. Monomer sequences of putative tandem repeats are reconstructed with a k-mer based method that uses the most frequent k-mers. Several k-mer lengths are evaluated and the best estimates of the monomer consensus sequence are reported. The k-mer analysis summary contains the following information:
+- k-mer length :: Length of the k-mer used for monomer reconstruction.
+- Variant index :: Each k-mer length can yield multiple consensus variants; the variants are indexed.
+- k-mer coverage score :: Sum of the proportions of all k-mers used for the reconstruction of the particular monomer. If the value is 1, all k-mers from the corresponding cluster were used for the reconstruction, meaning there is no variability. The more variable the monomer, the lower the k-mer coverage score (see the sketch after this list).
+- Consensus length :: Length of the estimated monomer.
+- Consensus :: Consensus sequence extracted from the position probability matrix.
+- k-mer based graph :: Visualization of the de Bruijn graph. Each vertex corresponds to a single k-mer; vertex size is proportional to k-mer frequency. The path that was used to reconstruct the monomer sequence is greyed out.
+- Sequence logo :: Visualization of the position probability matrices for the corresponding consensus variant.
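+
+A minimal sketch of the k-mer coverage score, assuming hypothetical inputs
+(the k-mer counts of a cluster and the subset of k-mers on the chosen
+reconstruction path):
+
+#+begin_src python
+def kmer_coverage_score(kmer_counts, used_kmers):
+    """Sum of the relative frequencies of the k-mers used for the
+    consensus. kmer_counts maps every k-mer of the cluster to its
+    count; used_kmers lists the k-mers on the reconstruction path."""
+    total = sum(kmer_counts.values())
+    return sum(kmer_counts[k] for k in used_kmers) / total
+#+end_src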
diff -r c56807be3b72 -r 3bc73f5dc785 lib/graphtools.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/graphtools.py Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,1079 @@
(diff body truncated in this view: lib/graphtools.py, 1079 lines added. A Python 3 module for storage, parsing and clustering of large graphs, i.e. the hitsort: it keeps the undirected simple graph as an edge/weight table in an sqlite3 database (in memory or on disk, with a fixed random seed for reproducibility), merges clusters via depth-first search over the merge graph (dfs, graph_components), detects LTR insertion sites in contig assemblies via lib/create_annotation.R, tabulates per-cluster similarity-hit annotations and writes the resulting cluster_info table back to the database.)
diff -r c56807be3b72 -r 3bc73f5dc785 lib/htmlheader.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/htmlheader.R Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,8 @@
+htmlheader="
+<html xmlns:mml=\"http://www.w3.org/1998/Math/MathML\">
+  <head>
+    <meta charset=\"utf-8\"/>
+    <title> PAGE_TITLE </title>
+    <link rel=\"stylesheet\" href=\"style1.css\">
+  </head>
+"
diff -r c56807be3b72 -r 3bc73f5dc785 lib/parallel/.gitignore
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/parallel/.gitignore Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,1 @@
+__pycache__
\ No newline at end of file
diff -r c56807be3b72 -r 3bc73f5dc785 lib/parallel/parallel.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/parallel/parallel.py Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,366 @@
(diff body truncated in this view: lib/parallel/parallel.py, 366 lines added. Functions for parallel processing of data chunks: PBS/qsub job submission with status files (pbs_run, pbs_send_job, run_multiple_pbs_jobs) and multiprocessing-based map helpers (parallel, parallel2/parmap2). In parallel2, the groups argument marks mutually exclusive jobs that are never run together and ppn gives each job's load in the 0-1 range; the worker count comes from config.PROC, the PROC environment variable or multiprocessing.cpu_count().)
diff -r c56807be3b72 -r 3bc73f5dc785 lib/pylintrc
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/pylintrc Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,380 @@
(diff body truncated in this view: lib/pylintrc, 380 lines added. A standard pylint configuration: sys.path is extended to the parent directory via init-hook, the Python 2 compatibility checkers are disabled, and design limits are relaxed, e.g. max-args=20, max-locals=20, max-attributes=30 and min-public-methods=0.)
diff -r c56807be3b72 -r 3bc73f5dc785 lib/r2py.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/r2py.py Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+import os
+import atexit
+import socket
+import time
+import config
+import pyRserve
+
+def shutdown(port):
+    try:
+        conn = pyRserve.connect(port=port)
+        print("Shutting down Rserv...", end="")
+        conn.shutdown()
+        print("Done")
+    except pyRserve.rexceptions.RConnectionRefused:
+        print("connection to Rserve refused, server is probably already down")
+
+def get_open_port():
+    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    s.bind(("", 0))
+    s.listen(1)
+    port = s.getsockname()[1]
+    s.close()
+    return port
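+# NB: the port returned by get_open_port() is released before Rserve is
+# started, so another process could in principle claim it in the meantime;
+# create_connection() below verifies the connection and raises otherwise.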
+
+def create_connection():
+    '''Start Rserve and test the connection; the port number is
+    stored in config.RSERVE_PORT
+    '''
+    config.RSERVE_PORT = get_open_port()
+    print('Trying to start Rserve...',)
+    os.system(
+        "R CMD Rserve --RS-port {} -q --no-save ".format(config.RSERVE_PORT))
+    # wait for the server to start accepting connections
+    time.sleep(1)
+    try:
+        conn = pyRserve.connect(port=config.RSERVE_PORT)
+        print("connection OK")
+        conn.close()
+        atexit.register(shutdown, config.RSERVE_PORT)
+        return config.RSERVE_PORT
+    except Exception:
+        print("Connection with Rserve was not established!")
+        raise
+
+
+class setFunctionName():
+    # decorator
+
+    def __init__(self, f, name):
+        self.f = f
+        self.name = name
+
+    def __call__(self, *args, **kwargs):
+        return self.f(self.name, *args, **kwargs)
+
+
+def convert_types(fn):
+    ''' decorator function to convert type for r2py'''
+    allowed_classes = [str, int, float, list, bool, type(None)]
+    # everything else is converted to str
+
+    def fn_wrapper(*args, **kwargs):
+        new_args = list(args)
+        new_kwargs = kwargs
+        for i, value in enumerate(args):
+            if any(type(value) is i for i in allowed_classes):
+                new_args[i] = value
+            else:
+                new_args[i] = str(value)
+        for i, value in kwargs.items():
+            if any(type(value) is i for i in allowed_classes):
+                new_kwargs[i] = value
+            else:
+                new_kwargs[i] = str(value)
+        return fn(*new_args, **new_kwargs)
+
+    return fn_wrapper
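+# Note: any argument whose type is outside allowed_classes (for example a
+# FilePath object from lib.utils) is silently converted with str() before
+# being sent to Rserve.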
+
+
+class R():
+
+    def __init__(self, source, verbose=False):
+        ''' The sourced file should define R functions, which will be linked to Python functions.
+        The purpose of this is memory efficiency - the Rserve connection is closed
+        after every execution so that memory is released.
+        Warning - the source code is executed each time a function is called, so it
+        should only contain function definitions!
+
+        '''
+        self.source = os.path.realpath(source)
+        conn = pyRserve.connect(port=config.RSERVE_PORT)
+        conn.voidEval("source('{}', chdir=TRUE)".format(self.source))
+        # if a single object is defined, ls() returns a str - conversion to list is necessary
+        object_names = list(conn.r.ls())
+        if verbose:
+            print("R function loaded:", end=" ")
+        for i in object_names:
+            ## skip these objects - they are not compatible with older
+            ## versions of Rserve and do not need to be accessed from python
+            if i in ['DT_OPTIONS', 'HTMLHEADER', 'WD', 'options',
+                     'htmlheader', 'xcolor_code', 'TANDEM_RANKS', 'RANKS_TANDEM']:
+                continue
+            try:
+                obj = getattr(conn.r, i)
+                if isinstance(obj, pyRserve.rconn.RFuncProxy):
+                    if verbose:
+                        print(i, end=" ")
+                    @convert_types
+                    def rwrapper(fname, *args, **kwargs):
+                        c = pyRserve.connect(port=config.RSERVE_PORT)
+                        c.r.setwd(os.getcwd())
+                        c.voidEval("source('{}',chdir=TRUE)".format(self.source))
+                        fn = getattr(c.r, fname)
+                        out = fn(*args, **kwargs)
+                        c.close()
+                        return out
+                    rwrapper = setFunctionName(rwrapper, i)
+                    setattr(self, i, rwrapper)
+                    del(rwrapper)
+            except Exception:
+                print("skipping:", i)
+        if verbose:
+            print("\r")
diff -r c56807be3b72 -r 3bc73f5dc785 lib/reporting.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/reporting.R Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,402 @@
(diff body truncated in this view: lib/reporting.R, 402 lines added. R functions that build the final HTML and CSV reports: mapping of internal column names to report headers (reformat_header), HTML reformatting helpers, the main index and TAREAN report generator (create_main_reports) including the comparative-analysis cluster size plot, export of the cluster summary to CSV and conversion of supercluster best hits to the profrep classification format (reformat_df_to_profrep_classification).)
diff -r c56807be3b72 -r 3bc73f5dc785 lib/seqtools.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/seqtools.py Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,824 @@
(diff body truncated in this view: lib/seqtools.py, 824 lines added. A Python 3 module for sequence manipulation built around a sqlite3-backed SequenceSet class: mgblast all-to-all comparison with coverage/identity/e-value filtering (_hitsort_worker), removal of reads similar to a filter database (blast_with_filter), blastn/blastx/diamond annotation workers, chunking of fasta files for parallel runs and export back to fasta.)
diff -r c56807be3b72 -r 3bc73f5dc785 lib/style1.css
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/style1.css Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,253 @@
+body {
+   background: #fff;
+   font-family: verdana, helvetica, arial, sans-serif;
+   margin: 0 20px 0 20px;
+    font-size: 10pt;
+   color: #333333;
+    max-width: 700px;
+}
+
+.floating_img
+{
+    float: left;
+    margin: 1.5% 1.5% 0% 0%;
+    border-style:solid;
+    border-color:gray;
+    border-width:1px
+}
+
+p {
+    max-width: 700px;
+}
+
+#container {
+    background: url(header.jpg) no-repeat top left;
+    position: relative;
+    margin: 0 auto -1px auto;
+    padding-top: 50px;
+    width: 680px;
+}
+
+#pageHeader {
+   text-align: center;
+   padding-right: 0px;
+   color: #AADA50;
+}
+#pageHeader h1 {
+   width: auto;
+   font-size: x-large;
+}
+#pageHeader h2 {
+   width: auto;
+   font-size: large;
+}
+
+#preamble {
+    margin: 140px 0 0 220px;
+    width: auto;
+}
+
+#explanation, #participation, #benefits, #requirements {
+    margin: 25px 0 0 220px;
+    width: auto;
+}
+
+#footer {
+    margin: 0 0 5px 0;
+    padding: 10px 0 0 0;
+    width: 100%;
+    text-align: center;
+}
+
+#linkList {
+    position: absolute;
+    width: 150px;
+    top: 350px;
+    left: 50px;
+}
+
+p {
+    font-size: 12px;
+}
+
+#preamble p.p1, #supportingText p.p1 {
+    margin-top: 10px;
+}
+
+#preamble p, #supportingText p {
+    line-height: 18px;
+}
+
+#quickSummary p.p2 {
+    font-size: 9px;
+    color: #999;
+}
+
+#footer {
+    font-size: 9px;
+}
+
+#linkList ul {
+    font-size: 10px;
+    list-style:none;
+    margin: 5px 0 0 0;
+    padding: 0 0 0 0;
+}
+
+#linkList ul li {
+    margin: 0;
+    padding: 0 0 0 17px;
+    line-height: 14px;
+    color: #ccc;
+}
+
+#linkList #lselect li {
+    font-size: 9px;
+}
+
+#linkList #lselect a:link, #linkList #lselect a:visited { display: block; }
+#linkList #lselect a.c:link, #linkList #lselect a.c:visited { display:inline; }
+
+#larchives li, #lresources li { text-transform: lowercase; }
+
+a, a:link, a:visited {
+               color: #44F;
+           }
+
+a:hover {
+    color: #00F;
+}
+
+#linkList a, #linkList a:link, #linkList a:visited {
+                                   color: #666;
+                                   text-decoration: none;
+                               }
+
+#linkList a:hover {
+    text-decoration: underline;
+    color: #333;
+}
+
+#linkList a.c, #linkList a.c:link, #linkList a.c:visited {
+                                       color: #999;
+                                       text-decoration: none;
+                                   }
+
+#linkList a.c:hover {
+    text-decoration: underline;
+    color: #333;
+}
+
+#linkList #lselect a {
+    font-size: 10px;
+}
+
+#linkList #lselect a.c {
+    font-size: 9px;
+    text-transform: lowercase;
+}
+
+#footer a, #footer a:link, #footer a:visited {
+                               font-weight: bold;
+                               text-transform: uppercase;
+                               text-decoration: none;
+                           }
+
+#footer a:hover {
+    text-decoration: underline;
+}
+
+h3 { margin-bottom: 0px; }
+
+h3 span { }
+
+#supportingText h3 {
+    width: auto;
+    height: 15px;
+}
+
+#linkList h3 {
+    width: 150px;
+    height: 20px;
+    margin-top: 20px;
+}
+
+#preamble h3 {
+    width: auto;
+    height: 26px;
+}
+
+#explanation h3 {
+}
+
+#participation h3 {
+}
+
+#benefits h3 {
+}
+
+#lselect h3 {
+    margin-top: 10px;
+}
+
+table { background:#FFFFFF;
+       border:1px solid gray;
+       border-collapse:collapse;
+       color:#fff;
+       font:normal 10px verdana, arial, helvetica, sans-serif;
+     }
+caption { border:1px solid #5C443A;
+         color:#5C443A;
+         font-weight:bold;
+         font-size:20pt
+                       padding:6px 4px 8px 0px;
+         text-align:center;
+       }
+td, th { color:#363636;
+        padding:.4em;
+      }
+tr { border:1px dotted gray;
+  }
+thead th, tfoot th { background:#5C443A;
+                    color:#FFFFFF;
+                    padding:3px 10px 3px 10px;
+                    text-align:left;
+                    text-transform:uppercase;
+                  }
+tbody td a { color:#0000FF;
+            text-decoration:underline;
+          }
+tbody td a:visited { color:#0000FF;
+                    text-decoration:underline;
+                  }
+tbody td a:hover { text-decoration:underline;
+                }
+tbody th a { color:#3636FF;
+            font-weight:normal;
+            text-decoration:none;
+          }
+tbody th a:hover { color:#363636;
+                }
+tbody td+td+td+td a { background-image:url('bullet_blue.png');
+                     background-position:left center;
+                     background-repeat:no-repeat;
+                     color:#0000FF;
+                     padding-left:15px;
+                   }
+tbody td+td+td+td a:visited { background-image:url('bullet_white.png');
+                             background-position:left center;
+                             background-repeat:no-repeat;
+                           }
+tbody th, tbody td { text-align:left;
+                    vertical-align:top;
+                  }
+tfoot td { background:#5C443A;
+          color:#FFFFFF;
+          padding-top:3px;
+        }
+.odd { background:#fff;
+    }
+tbody tr:hover { background:#EEEEEE;
+                border:1px solid #03476F;
+                color:#000000;
+              }
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/tarean/.gitignore
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/tarean/.gitignore Fri Dec 20 14:17:59 2019 +0000
b
@@ -0,0 +1,1 @@
+test_data/
\ No newline at end of file
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/tarean/OGDF/runOGDFlayout
b
Binary file lib/tarean/OGDF/runOGDFlayout has changed
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/tarean/OGDF/runOGDFlayout2015.5
b
Binary file lib/tarean/OGDF/runOGDFlayout2015.5 has changed
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/tarean/README.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/tarean/README.md Fri Dec 20 14:17:59 2019 +0000
b
@@ -0,0 +1,2 @@
+# TAREAN - TAndem Repeat ANalyzer
+Tarean is tools for reconstruction/detection of tandem repeats from NGS data
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/tarean/htmlheader.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/tarean/htmlheader.R Fri Dec 20 14:17:59 2019 +0000
b
@@ -0,0 +1,74 @@
+htmlheader="
+ <html xmlns:mml=\"http://www.w3.org/1998/Math/MathML\">
+ <head>
+ <title> Kmer analysis summary </title>
+ <style>
+ <!--
+ table { background:#FFFFFF;
+ border:1px solid gray;
+ border-collapse:collapse;
+ color:#fff;
+ font:normal 10px verdana, arial, helvetica, sans-serif;
+ }
+ caption { border:1px solid #5C443A;
+ color:#5C443A;
+ font-weight:bold;
+ font-size:20pt
+ padding:6px 4px 8px 0px;
+ text-align:center;
+
+ }
+ td, th { color:#363636;
+ padding:.4em;
+ }
+ tr { border:1px dotted gray;
+ }
+ thead th, tfoot th { background:#5C443A;
+ color:#FFFFFF;
+ padding:3px 10px 3px 10px;
+ text-align:left;
+ text-transform:uppercase;
+ }
+ tbody td a { color:#3636FF;
+ text-decoration:underline;
+ }
+ tbody td a:visited { color:gray;
+ text-decoration:line-through;
+ }
+ tbody td a:hover { text-decoration:underline;
+ }
+ tbody th a { color:#3636FF;
+ font-weight:normal;
+ text-decoration:none;
+ }
+ tbody th a:hover { color:#363636;
+ }
+ tbody td+td+td+td a { background-image:url('bullet_blue.png');
+ background-position:left center;
+ background-repeat:no-repeat;
+ color:#FFFFFF;
+ padding-left:15px;
+ }
+ tbody td+td+td+td a:visited { background-image:url('bullet_white.png');
+ background-position:left center;
+ background-repeat:no-repeat;
+ }
+ tbody th, tbody td { text-align:left;
+ vertical-align:top;
+ }
+ tfoot td { background:#5C443A;
+ color:#FFFFFF;
+ padding-top:3px;
+ }
+ .odd { background:#fff;
+ }
+ tbody tr:hover { background:#EEEEEE;
+ border:1px solid #03476F;
+ color:#000000;
+ }
+ -->
+ </style>
+
+ </head>
+
+ "
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/tarean/kmer_counting.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/tarean/kmer_counting.py Fri Dec 20 14:17:59 2019 +0000
[
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+import logging
+logger = logging.getLogger(__name__)
+import itertools
+import os
+import sys
+import random
+import sqlite3
+import subprocess
+import shlex  # for command line arguments split
+import operator
+
+REQUIRED_VERSION = (3, 4)
+MAX_PRINT = 10
+MEGABLAST = "-task megablast"
+HITSORT = "-task megablast"
+
+if sys.version_info < REQUIRED_VERSION:
+    raise Exception("\n\npython 3.4 or higher is required!\n")
+
+# additional functions
+
+
+class Sequence:
+
+    def __init__(self, seq, name="", paired=False):
+        # the mode os seq storage can be changed later to make it more
+        # memory efficient
+        self._seq = bytes(str(seq), "ascii")
+        self.name = str(name)
+
+    @property
+    def seq(self):
+        return self._seq.decode("utf-8")
+
+    @seq.setter
+    def seq(self, value):
+        self._seq = bytes(str(value), "ascii")
+
+    def __str__(self):
+        return "{0} : {1}".format(self.name, self.seq)
+
+    @staticmethod
+    def read_fasta(fasta_file_name):
+        '''
+        generator - reads sequences from fasta file
+        return sequence one by one
+        '''
+        with open(fasta_file_name, 'r') as f:
+            header = None
+            seqstr = None
+            for rawline in f:
+                line = rawline.strip()
+                if line == "":
+                    continue
+                if line[0] == ">":
+                    if header and seqstr:
+                        yield Sequence(seqstr, header)
+                        # reset
+                        seqstr = None
+                        header = line[1:]
+                    elif seqstr:
+                        Warning("sequence was not preceeded by header")
+                    else:
+                        header = line[1:]
+                else:
+                    seqstr = line if not seqstr else seqstr + line
+        # skip empty lines:
+        if header and seqstr:
+            yield Sequence(seqstr, header)
+        return
+
+    def write2fasta(self, file_object):
+        file_object.write(">{0}\n{1}\n".format(self.name, self.seq))
+
+
+def get_kmers(string, width=11):
+    L = len(string)
+    parts = [string[i:i + width] for i in range(L - width + 0)]
+    return parts
+
+
+def count_kmers_from_file(f, width=11):
+    counts = {}
+    for i in Sequence.read_fasta(f):
+        a = get_kmers(i.seq, width)
+        for km in a:
+            if "N" in km:
+                continue
+            if km in counts:
+                counts[km] += 1
+            else:
+                counts[km] = 1
+    sorted_counts = sorted(counts.items(),
+                           key=operator.itemgetter(1),
+                           reverse=True)
+    return sorted_counts
+
+
+if __name__ == "__main__":
+    L = len(sys.argv) - 1
+    kmer_length = int(sys.argv[-1])
+    files = sys.argv[1:-1]
+    for fin in files:
+        counts = count_kmers_from_file(fin, kmer_length)
+        fout = "{}_{}.kmers".format(fin, kmer_length)
+        with open(fout, "w") as f:
+            for i in counts:
+                f.write("{}\t{}\n".format(*i))
+        print(fout)
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/tarean/logo_methods.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/tarean/logo_methods.R Fri Dec 20 14:17:59 2019 +0000
[
@@ -0,0 +1,255 @@
+#! /usr/bin/env Rscript
+
+## FUNCTIONS:
+letterA <- function(x.pos,y.pos,ht,wt,id=NULL){
+    
+    x <- c(0,4,6,10,8,6.8,3.2,2,0,3.6,5,6.4,3.6)
+    y <- c(0,10,10,0,0,3,3,0,0,4,7.5,4,4)
+    x <- 0.1*x
+    y <- 0.1*y
+    
+    x <- x.pos + wt*x
+    y <- y.pos + ht*y
+    
+    if (is.null(id)){
+        id <- c(rep(1,9),rep(2,4))
+    }else{
+        id <- c(rep(id,9),rep(id+1,4))
+    }
+    
+    fill <- c("green","white")
+    
+    list(x=x,y=y,id=id,fill=fill)
+}
+
+## T
+letterT <- function(x.pos,y.pos,ht,wt,id=NULL){
+    
+    x <- c(0,10,10,6,6,4,4,0)
+    y <- c(10,10,9,9,0,0,9,9)
+    x <- 0.1*x
+    y <- 0.1*y
+    
+    x <- x.pos + wt*x
+    y <- y.pos + ht*y
+    
+    if (is.null(id)){
+        id <- rep(1,8)
+    }else{
+        id <- rep(id,8)
+    }
+    
+    fill <- "red"
+    
+    list(x=x,y=y,id=id,fill=fill)
+}
+
+## C
+letterC <- function(x.pos,y.pos,ht,wt,id=NULL){
+    angle1 <- seq(0.3+pi/2,pi,length=100)
+    angle2 <- seq(pi,1.5*pi,length=100)
+    x.l1 <- 0.5 + 0.5*sin(angle1)
+    y.l1 <- 0.5 + 0.5*cos(angle1)
+    x.l2 <- 0.5 + 0.5*sin(angle2)
+    y.l2 <- 0.5 + 0.5*cos(angle2)
+    
+    x.l <- c(x.l1,x.l2)
+    y.l <- c(y.l1,y.l2)
+    
+    x <- c(x.l,rev(x.l))
+    y <- c(y.l,1-rev(y.l))
+    
+    x.i1 <- 0.5 +0.35*sin(angle1)
+    y.i1 <- 0.5 +0.35*cos(angle1)
+    x.i1 <- x.i1[y.i1<=max(y.l1)]
+    y.i1 <- y.i1[y.i1<=max(y.l1)]
+    y.i1[1] <- max(y.l1)
+    
+    x.i2 <- 0.5 +0.35*sin(angle2)
+    y.i2 <- 0.5 +0.35*cos(angle2)
+    
+    x.i <- c(x.i1,x.i2)
+    y.i <- c(y.i1,y.i2)
+    
+    x1 <- c(x.i,rev(x.i))
+    y1 <- c(y.i,1-rev(y.i))
+    
+    x <- c(x,rev(x1))
+    y <- c(y,rev(y1))
+    
+    x <- x.pos + wt*x
+    y <- y.pos + ht*y
+    
+    if (is.null(id)){
+        id <- rep(1,length(x))
+    }else{
+        id <- rep(id,length(x))
+    }
+    
+    fill <- "blue"
+    
+    list(x=x,y=y,id=id,fill=fill)
+}
+
+
+## G
+letterG <- function(x.pos,y.pos,ht,wt,id=NULL){
+    angle1 <- seq(0.3+pi/2,pi,length=100)
+    angle2 <- seq(pi,1.5*pi,length=100)
+    x.l1 <- 0.5 + 0.5*sin(angle1)
+    y.l1 <- 0.5 + 0.5*cos(angle1)
+    x.l2 <- 0.5 + 0.5*sin(angle2)
+    y.l2 <- 0.5 + 0.5*cos(angle2)
+    
+    x.l <- c(x.l1,x.l2)
+    y.l <- c(y.l1,y.l2)
+    
+    x <- c(x.l,rev(x.l))
+    y <- c(y.l,1-rev(y.l))
+    
+    x.i1 <- 0.5 +0.35*sin(angle1)
+    y.i1 <- 0.5 +0.35*cos(angle1)
+    x.i1 <- x.i1[y.i1<=max(y.l1)]
+    y.i1 <- y.i1[y.i1<=max(y.l1)]
+    y.i1[1] <- max(y.l1)
+    
+    x.i2 <- 0.5 +0.35*sin(angle2)
+    y.i2 <- 0.5 +0.35*cos(angle2)
+    
+    x.i <- c(x.i1,x.i2)
+    y.i <- c(y.i1,y.i2)
+    
+    x1 <- c(x.i,rev(x.i))
+    y1 <- c(y.i,1-rev(y.i))
+    
+    x <- c(x,rev(x1))
+    y <- c(y,rev(y1))
+    
+    h1 <- max(y.l1)
+    r1 <- max(x.l1)
+    
+    h1 <- 0.4
+    x.add <- c(r1,0.5,0.5,r1-0.2,r1-0.2,r1,r1)
+    y.add <- c(h1,h1,h1-0.1,h1-0.1,0,0,h1)
+    
+    
+    
+    if (is.null(id)){
+        id <- c(rep(1,length(x)),rep(2,length(x.add)))
+    }else{
+        id <- c(rep(id,length(x)),rep(id+1,length(x.add)))
+    }
+    
+    x <- c(rev(x),x.add)
+    y <- c(rev(y),y.add)
+    
+    x <- x.pos + wt*x
+    y <- y.pos + ht*y
+    
+    
+    fill <- c("orange","orange")
+    
+    list(x=x,y=y,id=id,fill=fill)
+    
+}
+
+Letter <- function(which,x.pos,y.pos,ht,wt){
+    
+    if (which == "A"){
+        letter <- letterA(x.pos,y.pos,ht,wt)
+    }else if (which == "C"){
+        letter <- letterC(x.pos,y.pos,ht,wt)    
+    }else if (which == "G"){
+        letter <- letterG(x.pos,y.pos,ht,wt)    
+    }else if (which == "T"){
+        letter <- letterT(x.pos,y.pos,ht,wt)    
+    }else{
+        stop("which must be one of A,C,G,T")
+    }
+    
+    letter
+}
+
+
+
+
+plot_multiline_logo = function(cons.logo,read=NULL, W=50, setpar=TRUE, gaps = NULL){
+    ## logo - base order  - A C G T
+    if (ncol(cons.logo)==5){
+        gaps_prob = cons.logo[,5]
+    }else{
+        gaps_prob = NULL
+    }
+    ps=10 # Point_Size
+    tm=4
+    pwm=as.matrix(cons.logo[,1:4])
+    N=nrow(pwm)
+    Nori=N
+    if (N<W){
+        W=N
+    }
+    s1=seq(1,N,by=W)
+    s2=seq(W,N,by=W)
+    if (length(s2)<length(s1)){
+ pwm=rbind(pwm,matrix(0,nrow=W*length(s1)-N,ncol=4,dimnames=list(NULL,c('A','C','G','T'))))
+        if (!is.null(read)){
+            pwm_read = rbind(read,matrix(0,nrow=W*length(s1)-N,ncol=4,dimnames=list(NULL,c('A','C','G','T'))))
+        }
+ N=nrow(pwm)
+ s2=seq(W,N,by=W)
+    }
+    if (setpar){
+        par(mfrow = c(ceiling(N/W),1), mar=c(1,4,1,0))
+    }
+    for (i in seq_along(s1)){
+        if (!is.null(read)){
+            plot.logo(pwm_read[s1[i]:s2[i],],maxh=2)
+        }
+        plot.logo(pwm[s1[i]:s2[i],],maxh=max(rowSums(cons.logo)))
+        if(!is.null(gaps)){
+            ## transparent rectangles
+            rect((gaps[ ,'start']-s1[i]+1),0, (gaps[,'end']-s1[i]+2), max(pwm), col="#00000005")
+            
+        }
+        if(!is.null(gaps_prob)){
+            rect(seq_along(s1[i]:s2[i]),
+                 max(rowSums(cons.logo)),
+                 seq_along(s1[i]:s2[i])+1,
+                 max(rowSums(cons.logo)) - gaps_prob[s1[i]:s2[i]],
+                 col="#00000030")
+
+            
+        }
+        ticks=intersect(intersect(pretty(pretty(s1[i]:s2[i])+1),s1[i]:s2[i]),1:Nori)
+        axis(1,at=ticks+1.5-s1[i],label=ticks,tick=FALSE)
+        y=pretty(c(0,max(pwm)),n=tm)
+        axis(2,at=y,label=y,las=2,cex.axis=.7)
+    }
+}
+
+plot.logo=function(pwm,maxh=NULL){
+    acgt=c("A","C","G","T")
+    pwm = pwm[,acgt]
+    nbp=dim(pwm)[1]
+    if (is.null(maxh)) {maxh=max(rowSums(pwm))}
+    
+    plot(0,0,xlim=c(0,nbp),ylim=c(0,maxh),type="n",axes=F,xlab="",ylab="")
+    for ( i in 1:nbp){
+        S=order(pwm[i,])
+        hgts=pwm[i,S]
+        nts=acgt[S]
+        ypos=c(0,cumsum(hgts)[1:3])
+        for (j in 1:4){
+            if (hgts[j]==0) next
+            L=Letter(which=nts[j],x.pos=i,y.pos=ypos[j],ht=hgts[j],wt=1)
+            Id=L$id==1
+            polygon(L$x[Id],L$y[Id],lty=0,col=L$fill[1])
+            if (sum(L$id==2)>0) {
+                polygon(L$x[!Id],L$y[!Id],lty=0,col=L$fill[2])
+            }
+        }
+    }
+}
+
+
+
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/tarean/methods.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/tarean/methods.R Fri Dec 20 14:17:59 2019 +0000
[
b'@@ -0,0 +1,1536 @@\n+#!/user/bin/env Rscript\n+\n+suppressPackageStartupMessages(library(igraph))\n+suppressPackageStartupMessages(library(parallel))\n+suppressPackageStartupMessages(library(Biostrings))\n+suppressPackageStartupMessages(library(scales))\n+suppressPackageStartupMessages(library(stringr))\n+suppressPackageStartupMessages(library(hwriter))\n+suppressPackageStartupMessages(library(R2HTML))\n+suppressPackageStartupMessages(library(plyr))\n+suppressPackageStartupMessages(library(dplyr))\n+\n+max_ORF_length = function(s) {\n+  ## check all frames\n+  L = 0\n+  for (i in 1:3) {\n+    L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(s, i))), "*", \n+                                     fixed = TRUE))))\n+    L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(reverseComplement(s), \n+                                                                   i))), "*", fixed = TRUE))))\n+  }\n+  return(L)\n+}\n+\n+kmers2graph = function(kmers, mode = "strong", prop = NULL) {\n+  kmerLength = nchar(kmers[1, 1])\n+  if (ncol(kmers) == 2) {\n+    kmers$size = kmers[, 2]/sum(kmers[, 2])\n+  }\n+  colnames(kmers) = c("name", "count", "size")\n+  if (!is.null(prop)) {  # tohle se nepouziva(prop je null), a je to asi spatne - filtuje se to pred tridenim!!\n+    p = cumsum(kmers$size)\n+    kmers = kmers[p < prop, ]\n+  }\n+  N = dim(kmers)[1]\n+  kmers = kmers[order(kmers$size), ]\n+  ## convert kmers to fasta file\n+  kms = data.frame(kmer = substring(kmers$name, 1, kmerLength - 1), ids = 1:nrow(kmers),stringsAsFactors = FALSE)\n+  kme = data.frame(kmer = substring(kmers$name, 2), ide = 1:nrow(kmers), stringsAsFactors = FALSE)\n+\n+  ## df = merge(kms,kme, by = \'kmer\',all=FALSE)[,c(2,3,1)]\n+  df = inner_join(kme,kms, by = \'kmer\')[,c(2,3)]\n+\n+  ## names(kms) = seq_along(kms)\n+  ## kme = substring(kmers$name, 2)\n+  ## names(kme) = seq_along(kme)\n+  ## ## use new blast!\n+  ## database = tempfile()\n+  ## query = tempfile()\n+  ## output = tempfile()\n+  ## writeXStringSet(DNAStringSet(kms), filepath = database, format = "fasta")\n+  ## writeXStringSet(DNAStringSet(kme), filepath = query, format = "fasta")\n+  ## cmd = paste("makeblastdb -in", database, "-dbtype nucl")\n+  ## system(cmd, ignore.stdout = TRUE)\n+  ## cmd = paste("blastn -outfmt \'6 qseqid sseqid pident\'  -strand plus -dust no -perc_identity 100 -query ", \n+  ##     query, "-db", database, "-word_size", kmerLength - 1, "-out", output)\n+  ## system(cmd)\n+  ## df = try({\n+  ##   read.table(output, as.is = TRUE)\n+  ## })\n+  ## if (class(df) == "try-error"){\n+  ##   print("creation of kmer graph failed")\n+  ##   print(query)\n+  ##   print(output)\n+  ##   print(database)\n+  ##   return(NULL)\n+  ## }\n+  ## unlink(query)\n+  ## unlink(paste(database, "*", sep = ""))\n+  ## unlist(output)\n+  gm_mean = function(x, na.rm = TRUE) {\n+    exp(sum(log(x[x > 0]), na.rm = na.rm)/length(x))\n+  }\n+  \n+  whg = apply(cbind(kmers[df[, 1], 2], V2 = kmers[df[, 2], 2]), 1, gm_mean)\n+  G = graph.data.frame(data.frame(V1 = kmers$name[df[, 1]], V2 = kmers$name[df[, \n+                                                                               2]], weight = whg), vertices = kmers[, 1:3])\n+                                        # separate to connected components:\n+  ccs = clusters(G, mode = mode)$membership\n+  sel_cls = which(tabulate(ccs) > 1)\n+  Gs = list()\n+  for (i in seq_along(sel_cls)) {\n+    Gs[[i]] = induced.subgraph(G, vids = which(ccs %in% sel_cls[i]))\n+  }\n+  ## reorder!!!\n+  Gs = 
Gs[order(sapply(Gs, vcount), decreasing = TRUE)]\n+  return(Gs)\n+}\n+\n+\n+OGDFlayout = function(G, ncol = NULL, alg = "fmmm", OGDF = getOption("OGDF")) {\n+  ## is ogdf binary available?\n+  if (is.null(OGDF)) {\n+    OGDF = Sys.getenv("OGDF")\n+    if ("" == OGDF) {\n+      options(warn = -1)\n+      OGDF = system("which runOGDFlayout", intern = TRUE)\n+      options(warn = 0)\n+      if (length(OGDF) == 0) {\n+        cat("path to runOGDFlayout not found\\n")\n+        return(NULL)\n+      }\n+      \n+    }\n+  }\n+  if (is.null(ncol)) {\n+    if (is.n'..b'= graph_info,\n+              orf_l = orf_l, tarean_contig_file = tarean_contig_file)))\n+}\n+\n+\n+## graph loop index stability\n+loop_index_instability = function(G) {\n+  N = 50\n+  s = seq(vcount(G), vcount(G)/10, length.out = N)\n+  p = seq(1, 0.1, length.out = N)\n+  li = numeric()\n+  for (i in seq_along(s)) {\n+    print(i)\n+    gs = induced_subgraph(G, sample(1:vcount(G), s[i]))\n+    li[i] = max(clusters(gs, "strong")$csize)/vcount(gs)\n+  }\n+  instability = lm(li ~ p)$coefficient[2]\n+  return(instability)\n+}\n+\n+isSatellite = function(x, y, model) {\n+  p = get_prob(x, y, model)\n+  if (p > model$cutoff) {\n+    return("Putative Satellite")\n+  } else {\n+    return("")\n+  }\n+}\n+\n+get_prob = function(x, y, model) {\n+  pm = model$prob_matrix\n+  N = ncol(pm)\n+  i = round(x * (N - 1)) + 1\n+  j = round(y * (N - 1)) + 1\n+  p = pm[i, j]\n+  return(p)\n+}\n+\n+\n+detectMemUsage = function() {\n+  con = textConnection(gsub(" +", " ", readLines("/proc/meminfo")))\n+  memInfo = read.table(con, fill = TRUE, row.names = 1)\n+  close(con)\n+  memUsage = 1 - (memInfo["MemFree", 1] + memInfo["Cached", 1])/memInfo["MemTotal", \n+                                                                        1]\n+  return(memUsage)\n+}\n+\n+\n+makelock<-function(lockfile,lockmsg,CreateDirectories=TRUE){\n+    lockdir=dirname(lockfile)\n+    if(!file.exists(lockdir)){\n+        if(CreateDirectories) dir.create(lockdir,recursive=TRUE)\n+        else stop("Lock Directory for lockfile ",lockfile," does not exist")\n+    } \n+    if(missing(lockmsg)) lockmsg=paste(system(\'hostname\',intern=TRUE),Sys.getenv("R_SESSION_TMPDIR"))\n+    if (file.exists(lockfile)) return (FALSE)\n+                                        # note the use of paste makes the message writing atomic\n+    cat(paste(lockmsg,"\\n",sep=""),file=lockfile,append=TRUE,sep="")\n+    firstline=readLines(lockfile,n=1)\n+    if(firstline!=lockmsg){\n+                                        # somebody else got there first\n+        return(FALSE)\n+    } else return(TRUE)\n+}\n+\n+\n+removelock<-function(lockfile){\n+  if(unlink(lockfile)!=0) {\n+    warning("Unable to remove ",lockfile)\n+    return (FALSE)\n+  }\n+  return (TRUE)\n+}\n+\n+\n+waitForRAM = function(p = 0.5,lock_file=NULL) {\n+  if (detectMemUsage() < p) {\n+    return(NULL)\n+    ## check lock file:\n+  } else {\n+    cat("waiting for RAM \\n")\n+    free_count = 0\n+    while (TRUE) {\n+        if (makelock(lock_file)){\n+            print("---------locking--------")\n+            return(lock_file)\n+        }\n+      if (detectMemUsage() < p) {\n+        cat("RAM freed \\n")\n+        return(NULL)\n+      }\n+      Sys.sleep(5)\n+      if (evaluate_user_cpu_usage() == \'free\'){\n+        free_count = free_count + 1\n+      }else{\n+        free_count = 0\n+      }\n+      if (detectMemUsage() < 0.8 & free_count > 100){\n+        cat("RAM not free but nothing else is running \\n")\n+        return(NULL)\n+  
    }\n+    }\n+  }\n+}\n+\n+lsmem = function() {\n+  g = globalenv()\n+  out_all = envs = list()\n+  envs = append(envs, g)\n+  total_size = numeric()\n+  while (environmentName(g) != "R_EmptyEnv") {\n+    g <- parent.env(g)\n+    envs = append(envs, g)\n+  }\n+  for (e in envs) {\n+    \n+    obj = ls(envir = e)\n+    if (length(obj) == 0) {\n+      break\n+    }\n+    obj.size = list()\n+    for (i in obj) {\n+      obj.size[[i]] = object.size(get(i, envir = e))\n+    }\n+    out = data.frame(object = obj, size = unlist(obj.size), stringsAsFactors = FALSE)\n+    out = out[order(out$size, decreasing = TRUE), ]\n+    out_all = append(out_all, out)\n+    total_size = append(total_size, sum(out$size))\n+  }\n+  return(list(objects = out_all, total_size = total_size))\n+} \n+\n+evaluate_user_cpu_usage = function(){\n+  user = Sys.info()["user"]\n+  a = sum(as.numeric (system(paste ("ps -e -o %cpu -u", user), intern = TRUE)[-1]))\n+  s = substring (system(paste ("ps -e -o stat -u", user), intern = TRUE)[-1],1,1)\n+  if (a<5 & sum(s %in% \'D\')==0 & sum(s%in% \'R\')<2){\n+    status = \'free\'\n+  }else{\n+    status = \'full\'\n+  }\n+  return(status)\n+}\n'
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/tarean/mgblast2GL.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/tarean/mgblast2GL.R Fri Dec 20 14:17:59 2019 +0000
[
@@ -0,0 +1,36 @@
+#!/usr/bin/env Rscript
+## get script dir:
+initial.options <- commandArgs(trailingOnly = FALSE)
+file.arg.name <- "--file="
+script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
+script.dir <- normalizePath(dirname(script.name))
+fout <- commandArgs(T)[[2]]
+
+source(paste(script.dir, "/methods.R", sep=''))
+suppressPackageStartupMessages (library(igraph))
+fin <- commandArgs(T)[[1]]
+
+colcls = rep("NULL", 12)
+colcls[c(1,5,11)] = c("character","character","numeric")
+cat("loading mgblast table\n")
+df = read.table(pipe(paste("cut -f1,5,11 ",fin)), sep="\t",comment.char="", as.is=TRUE, header= FALSE, colClasses = c("character","character","numeric"))
+
+cat("creating graph\n")
+GL = list()
+colnames(df) =  c("V1", "V2", "weight")
+GL$G = graph.data.frame(df , directed = FALSE)
+print(summary(GL$G))
+cat("calculating ogdf layouts\n")
+try({
+    L1 <- OGDFlayout(GL$G, alg=c("fmmm"))
+})
+cat("calculating fruchterman reingold layouts\n")
+
+L2 = layout.fruchterman.reingold(GL$G,dim=3)
+if (class(L1) != "try-error"){
+    GL$L <- cbind(L1[[1]],L2)
+}else{
+    GL$L <- L2
+}
+cat("saving output\n")
+save(GL, file=fout)
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/tarean/tarean.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/tarean/tarean.R Fri Dec 20 14:17:59 2019 +0000
[
@@ -0,0 +1,66 @@
+#!/usr/bin/env Rscript
+library(optparse, quiet = TRUE)
+library(parallel)
+if (interactive()){
+  ## define functions only and exit
+  ## assume that working directory was changes with source( chdir=TRUE)!!!
+  script.dir=normalizePath('.')
+  source('methods.R')
+  source('logo_methods.R')
+  source('htmlheader.R')
+  options(OGDF = paste0(script.dir,"/OGDF/runOGDFlayout"))
+  
+}else{
+  ## get options from command line
+  initial.options <- commandArgs(trailingOnly = FALSE)
+  file.arg.name <- "--file="
+  script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
+  script.dir <- normalizePath(dirname(script.name))
+  oridir=getwd()
+  ## parse arguments
+  option_list = list(
+    make_option(c('-i', '--input_sequences'),action='store',type='character',help='fasta file with input sequences',default=NA),
+    make_option(c('-o', '--output_dir'),action='store',type='character',help='output directory',default="./kmer_analysis"),
+    make_option(c('-m', '--min_kmer_length'),action='store',type='numeric',help='min kmer length',default=11),
+    make_option(c('-x', '--max_kmer_length'),action='store',type='numeric',help='min kmer length',default=27),
+    make_option(c('-n', '--cpu'),action='store',type='numeric',help='number of cpu to use',default=NULL),
+    make_option(c('-s', '--sample_size'),action='store',type='numeric',help='number of sequences to use for analysis, is set to 0 all sequences are used',default=10000),
+    make_option(c('-r', '--reorient_reads'),action='store_true',type='logical',help='number of cpu to use',default=FALSE),
+    make_option(c('-l', '--no_layout'),action='store_true',type='logical',help='do not calculate graph layout',default=FALSE),
+    make_option(c('-p', '--paired'),action='store_true',type='logical',help='reads are paired',default=FALSE),
+    make_option(c('-t', '--tRNA_database='), action='store',type='character',help='path to tRNA database, is set PBS detection is performed',default=NULL)
+    
+  )
+
+  description = paste (strwrap(" put decription here"), collapse ="\n")
+  epilogue = paste (strwrap(" put epilogue here"), collapse ="\n")
+  parser=OptionParser(
+    option_list=option_list,
+    epilogue=epilogue,
+    description=description,
+    )
+  opt = parse_args(parser, args=commandArgs(TRUE))
+  ## as Rscript
+  options(OGDF = paste0(script.dir,"/OGDF/runOGDFlayout"))
+  CPU = ifelse(is.null(opt$cpu), detectCores(), opt$cpu)
+  source(paste(script.dir,"/","methods.R", sep=''))
+  source(paste(script.dir,"/","logo_methods.R", sep=''))
+  source(paste(script.dir,"/","htmlheader.R", sep=''))
+  ## set number of CPU to use
+
+
+  
+  ## run tarean:
+  tarean(
+    opt$input_sequences,
+    opt$output_dir,
+    opt$min_kmer_length,
+    opt$max_kmer_length,
+    CPU,
+    opt$sample_size,
+    opt$reorient_reads,
+    opt$tRNA_database,
+    !opt$no_layout,
+    paired = opt$paired
+    )
+}
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/tarean/tarean_batch_mode.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/tarean/tarean_batch_mode.R Fri Dec 20 14:17:59 2019 +0000
[
@@ -0,0 +1,119 @@
+#!/usr/bin/env Rscript
+library(optparse, quiet = TRUE)
+library(parallel)
+initial.options <- commandArgs(trailingOnly = FALSE)
+file.arg.name <- "--file="
+script.name <- sub(file.arg.name,"",
+                   initial.options[grep(file.arg.name, initial.options)]
+)
+script.dir <- normalizePath(dirname(script.name))
+oridir=getwd()
+options(OGDF = paste0(script.dir,"/OGDF/runOGDFlayout2015.5"))
+CPU =  detectCores()
+source(paste(script.dir,"/","methods.R", sep=''))
+source(paste(script.dir,"/","logo_methods.R", sep=''))
+source(paste(script.dir,"/","htmlheader.R", sep=''))
+
+option_list = list(
+    make_option(c('-i', '--input_sequences_list'),
+                action='store',type='character',
+                help='list of fasta sequences file for tarean analysis'
+                ),
+    make_option(c('-o', '--output_dir'),
+                action='store',type='character',
+                help='output directory',
+                default="./kmer_analysis"),
+    make_option(c('-t', '--tRNA_database'),
+                action='store',type='character',
+                help='path to tRNA database',
+                default=NULL),
+    make_option(c('-p', '--parallel'),
+                action='store_true',
+                type='logical',
+                help='run in parallel (faster but can exhaust RAM)',
+                default=FALSE),
+    make_option(c('-N', '--not_paired'),
+                action='store_true',
+                type='logical',
+                help='reads are not paired',
+                default=FALSE)
+
+    )
+
+description = paste (strwrap(" put decription here"), collapse ="\n")
+epilogue = paste (strwrap(" put epilogue here"), collapse ="\n")
+parser=OptionParser(
+    option_list=option_list,
+    epilogue=epilogue,
+    description=description,
+    )
+
+opt = parse_args(parser, args=commandArgs(TRUE))
+paired = !opt$not_paired
+print(opt)
+dir.create(opt$output_dir)
+fl = readLines(opt$input_sequences_list)
+## reorder to avoid running large top graphs at once
+ord = sample(seq_along(fl), length(fl))
+
+
+index=0
+info=list()
+save.image(paste0(opt$output_dir,"/info.RData")) # for debugin purposes
+if (opt$parallel){
+    cat("processing in parallel")
+    info=mcmapply(
+        FUN=tarean,
+        input_sequences = fl[ord],
+        output_dir = paste0(opt$output_dir,"/",sprintf("%04d",ord)),
+        min_kmer_length = 11,
+        max_kmer_length = 27,
+        CPU = CPU,
+        sample_size = 30000,
+        reorient_reads = TRUE,
+        tRNA_database_path = opt$tRNA_database,
+        paired = paired,
+        include_layout=FALSE,
+        mc.cores=round(1+detectCores()/9),
+        mc.set.seed = TRUE,
+        mc.preschedule = FALSE,
+        SIMPLIFY = FALSE
+    )
+}else{
+    for (i in fl){
+        index = index + 1
+        dirout=paste0(opt$output_dir,"/",sprintf("%04d",index))
+        try({
+            info[[i]] = tarean(i, dirout, 11, 27, CPU, 30000, TRUE, opt$tRNA_database, include_layout=FALSE)
+            cat("-----------------------------------------------------\n")
+            print(info[[i]])
+        })
+    }
+}
+save(info, file = paste0(opt$output_dir,"/info.RData"))
+save.image("tmp.RData")
+## export as csv table
+## 'graph_info' is always include:
+
+tr_info = data.frame(do.call(rbind, info[sapply(info,length)>1]))
+if (nrow(tr_info)>0){
+    ## TR detected
+    graph_info = data.frame (do.call(rbind, lapply(info, "[[", "graph_info")))
+    graph_info$source=rownames(graph_info)
+    tr_info$graph_info=NULL
+    tr_info$source = rownames(tr_info)
+    graph_tr_info = merge(graph_info, tr_info, all=TRUE, by='source')
+    if (any(sapply(graph_tr_info,class)=='list')){
+        for (i in colnames(graph_tr_info)){
+            graph_tr_info[,i] = unname(unlist(graph_tr_info[,i]))
+        }
+    }
+    write.table(graph_tr_info, file=paste0(opt$output_dir,"/info.csv"), row.names=FALSE,sep="\t", quote= TRUE)
+}else{
+    ## TR not detected
+    graph_info = data.frame (do.call(rbind, lapply(info, function(x) unlist(x[['graph_info']]))))
+    graph_info$source=rownames(graph_info)
+    write.table(graph_info, file=paste0(opt$output_dir,"/info.csv"), row.names=FALSE,sep="\t", quote = FALSE)
+}
+
+
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/tarean_output_help.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/tarean_output_help.html Fri Dec 20 14:17:59 2019 +0000
[
b'@@ -0,0 +1,399 @@\n+<?xml version="1.0" encoding="utf-8"?>\n+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n+<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">\n+<head>\n+<!-- 2016-10-21 P\xc3\xa1 11:06 -->\n+<meta  http-equiv="Content-Type" content="text/html;charset=utf-8" />\n+<meta  name="viewport" content="width=device-width, initial-scale=1" />\n+<title>TAREAN output description</title>\n+<meta  name="generator" content="Org-mode" />\n+<meta  name="author" content="petr" />\n+<style type="text/css">\n+ <!--/*--><![CDATA[/*><!--*/\n+  .title  { text-align: center;\n+             margin-bottom: .2em; }\n+  .subtitle { text-align: center;\n+              font-size: medium;\n+              font-weight: bold;\n+              margin-top:0; }\n+  .todo   { font-family: monospace; color: red; }\n+  .done   { font-family: monospace; color: green; }\n+  .priority { font-family: monospace; color: orange; }\n+  .tag    { background-color: #eee; font-family: monospace;\n+            padding: 2px; font-size: 80%; font-weight: normal; }\n+  .timestamp { color: #bebebe; }\n+  .timestamp-kwd { color: #5f9ea0; }\n+  .org-right  { margin-left: auto; margin-right: 0px;  text-align: right; }\n+  .org-left   { margin-left: 0px;  margin-right: auto; text-align: left; }\n+  .org-center { margin-left: auto; margin-right: auto; text-align: center; }\n+  .underline { text-decoration: underline; }\n+  #postamble p, #preamble p { font-size: 90%; margin: .2em; }\n+  p.verse { margin-left: 3%; }\n+  pre {\n+    border: 1px solid #ccc;\n+    box-shadow: 3px 3px 3px #eee;\n+    padding: 8pt;\n+    font-family: monospace;\n+    overflow: auto;\n+    margin: 1.2em;\n+  }\n+  pre.src {\n+    position: relative;\n+    overflow: visible;\n+    padding-top: 1.2em;\n+  }\n+  pre.src:before {\n+    display: none;\n+    position: absolute;\n+    background-color: white;\n+    top: -10px;\n+    right: 10px;\n+    padding: 3px;\n+    border: 1px solid black;\n+  }\n+  pre.src:hover:before { display: inline;}\n+  pre.src-sh:before    { content: \'sh\'; }\n+  pre.src-bash:before  { content: \'sh\'; }\n+  pre.src-emacs-lisp:before { content: \'Emacs Lisp\'; }\n+  pre.src-R:before     { content: \'R\'; }\n+  pre.src-perl:before  { content: \'Perl\'; }\n+  pre.src-java:before  { content: \'Java\'; }\n+  pre.src-sql:before   { content: \'SQL\'; }\n+\n+  table { border-collapse:collapse; }\n+  caption.t-above { caption-side: top; }\n+  caption.t-bottom { caption-side: bottom; }\n+  td, th { vertical-align:top;  }\n+  th.org-right  { text-align: center;  }\n+  th.org-left   { text-align: center;   }\n+  th.org-center { text-align: center; }\n+  td.org-right  { text-align: right;  }\n+  td.org-left   { text-align: left;   }\n+  td.org-center { text-align: center; }\n+  dt { font-weight: bold; }\n+  .footpara { display: inline; }\n+  .footdef  { margin-bottom: 1em; }\n+  .figure { padding: 1em; }\n+  .figure p { text-align: center; }\n+  .inlinetask {\n+    padding: 10px;\n+    border: 2px solid gray;\n+    margin: 10px;\n+    background: #ffffcc;\n+  }\n+  #org-div-home-and-up\n+   { text-align: right; font-size: 70%; white-space: nowrap; }\n+  textarea { overflow-x: auto; }\n+  .linenr { font-size: smaller }\n+  .code-highlighted { background-color: #ffff00; }\n+  .org-info-js_info-navigation { border-style: none; }\n+  #org-info-js_console-label\n+    { font-size: 10px; font-weight: bold; white-space: nowrap; }\n+  .org-info-js_search-highlight\n+    
{ background-color: #ffff00; color: #000000; font-weight: bold; }\n+  /*]]>*/-->\n+</style>\n+<link rel="stylesheet" type="text/css" href="style1.css" />\n+<script type="text/javascript">\n+/*\n+@licstart  The following is the entire license notice for the\n+JavaScript code in this tag.\n+\n+Copyright (C) 2012-2013 Free Software Foundation, Inc.\n+\n+The JavaScript code in this tag is free software: you can\n+redistribute it and/or modify it under the terms of the GNU\n+General Public License (GNU GPL) as published by the Free Software\n+Foundation, either version 3 o'..b'4-1">\n+<p>\n+Detailed information for each cluster is stored is subdirectories:\n+</p>\n+\n+<div class="org-src-container">\n+\n+<pre class="src src-folder">dir_CL0011\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 blast.csv        &lt;------------tab delimited file, all-to-all comparison od reads within cluster            \n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 CL11_directed_graph.RData &lt;----directed graph representation of cluster saved as R igraph object\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 CL11.GL     &lt;-----------------undirected graph representation of cluster saved as R igraph object\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 CL11.png         &lt;-----------\xe2\x94\x90- images with graph visualization\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 CL11_tmb.png     &lt;-----------\xe2\x94\x98\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 dna_database_annotation.csv &lt;-- annotation of cluster reads based on the DNA database of repeats\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_all.fas   &lt;---------------- all reads included in the cluster in fasta format\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads.fas      &lt;---------------- subset of reads used for monomer reconstruction\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fas &lt;------------ subset of reads all in the same orientation\n+\xe2\x94\x94\xe2\x94\x80\xe2\x94\x80 tarean\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 consensus.fasta &lt;----------- fasta file with tandem repeat consensus variants\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ggmin.RData\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 img\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_11mer_1.png  &lt;-----\xe2\x94\x90  \n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_11mer_2.png  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_15mer_2.png  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_15mer_3.png  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_15mer_4.png  &lt;-----\xe2\x94\x82 images of kmer-based graphs used for reconstruction of\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_19mer_2.png  &lt;-----\xe2\x94\x82 monomer variants\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_19mer_4.png  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_19mer_5.png  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_23mer_2.png  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_27mer_3.png  &lt;-----\xe2\x94\x98\n+    \xe2\x94\x82   \xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_11mer_1.png  &lt;-----\xe2\x94\x90  \n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 
\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_11mer_2.png  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_15mer_2.png  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_15mer_3.png  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_15mer_4.png  &lt;-----\xe2\x94\x82 images with DNA logos representing consensus sequences\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_19mer_2.png  &lt;-----\xe2\x94\x82 of monomer variants\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_19mer_4.png  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_19mer_5.png  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_23mer_2.png  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x94\xe2\x94\x80\xe2\x94\x80 logo_27mer_3.png  &lt;-----\xe2\x94\x98\n+    \xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_11mer_1.csv  &lt;-----\xe2\x94\x90\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_11mer_2.csv  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_15mer_2.csv  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_15mer_3.csv  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_15mer_4.csv  &lt;-----\xe2\x94\x82 position probability matrices for individual monomer\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_19mer_2.csv  &lt;-----\xe2\x94\x82 variants derived from k-mer frequencies\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_19mer_4.csv  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_19mer_5.csv  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_23mer_2.csv  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_27mer_3.csv  &lt;-----\xe2\x94\x98\n+    \xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fas_11.kmers  &lt;-----\xe2\x94\x90\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fas_15.kmers  &lt;-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fas_19.kmers  &lt;-----\xe2\x94\x82 k-mer frequencies calculated on oriented reads\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fas_23.kmers  &lt;-----\xe2\x94\x82 for k-mer lengths 11 - 27\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fas_27.kmers  &lt;-----\xe2\x94\x98\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fasblast_out.cvs  &lt;---------\xe2\x94\x90results of blastn search against database of tRNA\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fasblast_out.cvs_L.csv &lt;----\xe2\x94\x82for purposes of LTR detection \n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fasblast_out.cvs_R.csv &lt;----\xe2\x94\x98 \n+    \xe2\x94\x94\xe2\x94\x80\xe2\x94\x80 report.html       &lt;--- cluster analysisHTML summary\n+</pre>\n+</div>\n+</div>\n+</div>\n+</div>\n+</div>\n+<div id="postamble" class="status">\n+<p class="author">Author: petr</p>\n+<p class="date">Created: 2016-10-21 P\xc3\xa1 11:06</p>\n+<p class="validation"><a href="http://validator.w3.org/check?uri=referer">Validate</a></p>\n+</div>\n+</body>\n+</html>\n'
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/tarean_output_help.org
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/tarean_output_help.org Fri Dec 20 14:17:59 2019 +0000
[
b'@@ -0,0 +1,174 @@\n+#+TITLE: TAREAN output description\n+#+HTML_HEAD_EXTRA: <link rel="stylesheet" type="text/css" href="style1.css" />\n+#+LANGUAGE: en\n+\n+* Introduction\n+TAREAN output includes *HTML report* with list of all analyzed clusters; the clusters are classified into five categories:\n++ high confidence satellites\n++ low confidence satellites\n++ potential LTR elements\n++ rDNA\n++ other clusters\n+Each cluster for which consensus sequences was reconstructed has also its own detailed report, linked to the main report.\n+\n+* Main HTML report\n+This report contains basic information about all clusters larger than specified threshold (default value is 0.01% of analyzed reads)\n+** Table legend\n++ Cluster ::  Cluster identifier\n++ Genome Proportion[%] :: /(Number of sequences in cluster/Number of sequences in clustering) x 100%/\n++ Size :: Number of reads in the cluster\n++ Satellite probability :: Empirical probability estimate that cluster sequences\n+     are derived from satellite repeat. This estimate is based on analysis of more\n+     than xxx clusters including yyy manually anotated and zzz experimentaly\n+     validated satellite repeats\n++ Consensus :: Consensus sequence is outcome of kmer-based\n+     analysis and represents the most probable satellite monomer\n+     sequence\n++ Kmer analysis ::\n+     link to analysis report for individual clusters\n++ Graph layout :: Graph-based visualization of similarities among sequence\n+     reads\n++ Connected component index :: Proportion of nodes of the graph which are part\n+     of the the largest strongly connected component\n++ Pair completeness index ::  Proportion of reads with available\n+     mate-pair within the same cluster\n++ Kmer coverage :: Sum of relative frequencies of all kmers used for consensus\n+     sequence reconstruction\n++ |V| :: Number of vertices of the graph\n++ |E| :: Number of edges of the graph\n++ PBS score :: Primer binding site detection score\n++ The longest ORF length :: Length of the longest open reading frame found in\n+     any of the possible six reading frames. Search was done on dimer of\n+     consensus so ORFs can be longer than \'monomer\' length\n++ Similarity-based annotation :: Annotation based on\n+     similarity search using blastn/blastx against database of known\n+     repeats.\n+* Detailed cluster report\n+Cluster report includes a list of major monomer sequence varinats reconstructed from the most frequent k-mers. The reconstructed consensus sequences are sorted based on their significance (that is, what proportion of k-mer they represent).\n+** Table legend\n+- kmer :: length of kmer used for consensus reconstruction.\n+- variant :: identifier of consensus variant.\n+- total score :: measure of significance of consensus variant. Score is calculated as a sum of weights of all k-mers used for consensus reconstruction.\n+- monomer length :: length of the consensus\n+- consensus :: consensus sequence without ambiguous bases. \n+- graph image :: part of de-Bruijn graph based on the abundant k-mers. Size of\n+     vertices corresponds to k-mer frequencies, Paths in the graph which was used\n+     for reconstruction of consensus sequences is gray colored.\n+- logo image :: consensus sequences shown as DNA logo. Height of letters corresponds to kmer frequencies. 
Logo images are linked to corresponding position probability matrices.\n+\n+* Structure of the output archive\n+Complete results from TAREAN analysis can by downloaded as zip archive which contains the following\n+files and directories:\n+\n+#+BEGIN_SRC files & directories\n+.\n+.\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 clusters_info.csv <------------ list of clusters in tab delimited format \n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 index.html        <------------ main html report\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 seqclust\n+\xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 assembly                  # not implemented yet\n+\xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 blastn        <------------ results of read comparison with DNA database\n+\xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 blastx        <------------ results of read comparison with protein database\n+\xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 clust'..b'05r  120735r 69527r  12235r  176778f 189307f 131952f 163507f 100038r 178475r \n+  :  >CL3    6\n+  :  99835r  222598f 29715r  102023f 99524r  30116f \n+  :  >CL4    6\n+  :  51723r  69073r  218774r 146425f 136314r 41744f \n+  :  >CL5    5\n+  :  70686f  65565f  234078r 50430r  68247r \n+\n+where =CL1 11= is the cluster ID followed by number of reads in the cluster;\n+next line contains list of all read names belonging to the cluster.\n+** structure of cluster directories\n+\n+Detailed information for each cluster is stored is subdirectories:\n+\n+#+BEGIN_SRC folder directories\n+dir_CL0011\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 blast.csv        <------------tab delimited file, all-to-all comparison od reads within cluster            \n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 CL11_directed_graph.RData <----directed graph representation of cluster saved as R igraph object\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 CL11.GL     <-----------------undirected graph representation of cluster saved as R igraph object\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 CL11.png         <-----------\xe2\x94\x90- images with graph visualization\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 CL11_tmb.png     <-----------\xe2\x94\x98\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 dna_database_annotation.csv <-- annotation of cluster reads based on the DNA database of repeats\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_all.fas   <---------------- all reads included in the cluster in fasta format\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads.fas      <---------------- subset of reads used for monomer reconstruction\n+\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fas <------------ subset of reads all in the same orientation\n+\xe2\x94\x94\xe2\x94\x80\xe2\x94\x80 tarean\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 consensus.fasta <----------- fasta file with tandem repeat consensus variants\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ggmin.RData\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 img\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_11mer_1.png  <-----\xe2\x94\x90  \n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_11mer_2.png  <-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_15mer_2.png  <-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_15mer_3.png  <-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_15mer_4.png  <-----\xe2\x94\x82 images of 
kmer-based graphs used for reconstruction of\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_19mer_2.png  <-----\xe2\x94\x82 monomer variants\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_19mer_4.png  <-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_19mer_5.png  <-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_23mer_2.png  <-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 graph_27mer_3.png  <-----\xe2\x94\x98\n+    \xe2\x94\x82   \xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_11mer_1.png  <-----\xe2\x94\x90  \n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_11mer_2.png  <-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_15mer_2.png  <-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_15mer_3.png  <-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_15mer_4.png  <-----\xe2\x94\x82 images with DNA logos representing consensus sequences\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_19mer_2.png  <-----\xe2\x94\x82 of monomer variants\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_19mer_4.png  <-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_19mer_5.png  <-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 logo_23mer_2.png  <-----\xe2\x94\x82\n+    \xe2\x94\x82\xc2\xa0\xc2\xa0 \xe2\x94\x94\xe2\x94\x80\xe2\x94\x80 logo_27mer_3.png  <-----\xe2\x94\x98\n+    \xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_11mer_1.csv  <-----\xe2\x94\x90\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_11mer_2.csv  <-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_15mer_2.csv  <-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_15mer_3.csv  <-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_15mer_4.csv  <-----\xe2\x94\x82 position probability matrices for individual monomer\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_19mer_2.csv  <-----\xe2\x94\x82 variants derived from k-mer frequencies\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_19mer_4.csv  <-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_19mer_5.csv  <-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_23mer_2.csv  <-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 ppm_27mer_3.csv  <-----\xe2\x94\x98\n+    \xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fas_11.kmers  <-----\xe2\x94\x90\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fas_15.kmers  <-----\xe2\x94\x82\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fas_19.kmers  <-----\xe2\x94\x82 k-mer frequencies calculated on oriented reads\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fas_23.kmers  <-----\xe2\x94\x82 for k-mer lengths 11 - 27\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fas_27.kmers  <-----\xe2\x94\x98\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fasblast_out.cvs  <---------\xe2\x94\x90results of blastn search against database of tRNA\n+    \xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fasblast_out.cvs_L.csv <----\xe2\x94\x82for purposes of LTR detection \n+    
\xe2\x94\x9c\xe2\x94\x80\xe2\x94\x80 reads_oriented.fasblast_out.cvs_R.csv <----\xe2\x94\x98 \n+    \xe2\x94\x94\xe2\x94\x80\xe2\x94\x80 report.html       <--- cluster analysisHTML summary\n+#+END_SRC\n+\n+\n+\n'
b
diff -r c56807be3b72 -r 3bc73f5dc785 lib/utils.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/utils.R Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,341 @@
+#!/usr/bin/env Rscript
+suppressPackageStartupMessages(library(DBI))
+suppressPackageStartupMessages(library(RSQLite))
+
+CONNECTED = FALSE
+if (FALSE) {
+    ## for testing
+    seqdb = "/mnt/raid/spolecny/petr/RE2/comparative_test/sequences.db"
+    hitsortdb = "/mnt/raid/spolecny/petr/RE2/comparative_test/hitsort.db"
+    class_file = "/mnt/raid/users/petr/workspace/repex_tarean/databases/classification_tree.rds"
+    ## connect to sqlite databases
+    SEQDB = dbConnect(RSQLite::SQLite(), seqdb)
+    HITSORTDB = dbConnect(RSQLite::SQLite(), hitsortdb)
+    CLS_TREE = readRDS(class_file)
+}
+
+connect_to_databases = function(seqdb, hitsortdb, classification_hierarchy_file = NULL){
+    if (!CONNECTED){
+        SEQDB <<- dbConnect(RSQLite::SQLite(), seqdb)
+        HITSORTDB <<- dbConnect(RSQLite::SQLite(), hitsortdb)
+        if (!is.null(classification_hierarchy_file)){
+            CLS_TREE <<- readRDS(classification_hierarchy_file)
+        }
+        CONNECTED <<- TRUE
+    }
+}
+
+disconnect_database = function(){
+    if (CONNECTED){
+        dbDisconnect(SEQDB)
+        dbDisconnect(HITSORTDB)
+        CONNECTED <<- FALSE
+    }
+}
+
+nested2named_list = function(x){
+    y = as.list(unlist(x[[1]]))
+    names(y) = unlist(x[[2]])
+    return(y)
+}
+
+is_comparative = function(){
+    prefix_codes = dbGetQuery(SEQDB,"SELECT * FROM prefix_codes")
+    if (nrow(prefix_codes) == 0){
+        return(FALSE)
+    }else{
+        return(TRUE)
+    }
+}
+
+get_comparative_codes = function(){
+    prefix_codes = dbGetQuery(SEQDB,"SELECT * FROM prefix_codes")
+    return(prefix_codes)
+}
+
+add_preamble = function(html_file, preamble){
+  html_content=readLines(html_file)
+  modified_html_content = gsub("<body>",
+       paste("<body>\n", preamble,"\n"),
+       html_content)
+  cat(modified_html_content, file = html_file, sep="\n")
+}
+
+
+df2html = function(df, header = NULL, sort_col = NULL, digits = 3, rounding_function=signif, decreasing = TRUE, scroling = FALSE, width = 300){
+    if (!is.null(sort_col)){
+        df = df[order(df[,sort_col], decreasing = decreasing),]
+    }
+    if (!is.null(digits)){
+        for (i in seq_along(df)){
+            if(is.numeric(df[,i])){
+                df[,i] = rounding_function(df[,i], digits)
+            }
+        }
+    }
+    if (is.null(header)){
+        h = ""
+    }else{
+        h = paste0("    <th>",header,"</th>\n", collapse="") %>%
+            paste0(" <tr>\n", .,"  </tr>\n")
+    }
+    x = apply(df,1,function(x)paste0("    <td>",x,"</td>\n", collapse="")) %>%
+        paste0("  <tr>\n", .,"  </tr>\n", collapse = "")
+    if (scroling){
+        cols = paste0('<col width="',rep(round(100/ncol(df)),ncol(df)),'%">\n',collapse ="")
+        height = min(200, 22 * nrow(df))
+        out = paste0(
+            '<table cellspacing="0" cellpadding="0" border="0" width="',width,'">\n',
+            '  <tr>\n',
+            '    <td>\n',
+            '      <table cellspacing="0" cellpadding="1" border="1" width="', width,'" >\n',
+            cols,
+            h,
+            '      </table>\n',
+            '   </td>\n',
+            ' </tr>\n',
+            ' <tr>\n',
+            '   <td>\n',
+            '     <div style="width:',width,'px; height:',height,'px; overflow:auto;">\n',
+            '       <table cellspacing="0" cellpadding="1" border="1" width="',width,'" >\n',
+            cols,
+            x,
+            '       </table>\n',
+            '     </div>\n',
+            '  </td>\n',
+            ' </tr>\n',
+            '</table>\n'
+        )
+
+    }else{
+        out = paste ("<table>\n", h,x, "</table>\n")
+    }
+    return(out)
+}
+
+start_html = function(filename, header){
+    cat(header, file = filename)
+    html_writer = function(content, fn=HTML, ...){
+        fn(content, append = TRUE, file = filename, ...)
+    }
+}
+
+preformatted = function(x){
+    ## make preformatted html text
+    return(
+        paste(
+        "
[... middle of the file truncated in this changeset view ...]
+
+    text(sum(supercluster_size) + singlets / 2,
+         max(supercluster_size) * 1.05,
+         labels = paste(singlets, "singlets"))
+
+    axis(1,at=seq(0,N_clustering,length.out=11),label=seq(0,100,by=10))
+  dev.off()
+  clustering_info = list(
+    Number_of_reads_in_clusters = sum(supercluster_size),
+    Number_of_clusters =  nrow(communities),
+    Number_of_superclusters = length(supercluster_size),
+    Number_of_singlets = singlets
+  )
+  return(clustering_info)
+}
+
+
+rectMap=function(x,scale.by='row',col=1,xlab="",ylab="",grid=TRUE,axis_pos=c(1,4),cexx=NULL,cexy=NULL){
+  if (scale.by=='row'){
+                                        #x=(x)/rowSums(x)
+    x=(x)/apply(x,1,max)
+  }
+  if (scale.by=='column'){
+    x=t(t(x)/apply(x,2,max))
+  }
+  nc=ncol(x)
+  nr=nrow(x)
+  coords=expand.grid(1:nr,1:nc)
+  plot(coords[,1],coords[,2],type='n',axes=F,xlim=range(coords[,1])+c(-.5,.5),ylim=range(coords[,2])+c(-.5,.5),xlab=xlab,ylab=ylab)
+  axis(axis_pos[1],at=1:nr,labels=rownames(x),lty=0,tick=FALSE,line=0,cex.axis=0.5/log10(nr))
+  axis(axis_pos[2],at=1:nc,labels=colnames(x),lty=0,tick=FALSE,las=2,line=0 ,hadj=0, cex.axis=0.7)
+  axis(2,at=1:nc,labels=colnames(x),lty=0,tick=FALSE,las=2,line=0 ,hadj=1, cex.axis=0.7)
+
+  mtext(side = 1, "Cluster id", las=1, line = 3, cex = 0.5)
+  line = 1.5 + log10(nr)
+  mtext(side = 2, "Proportions of individual samples", las =0, line = line, cex = 0.5)
+  s=c(x)/2  # to get it proportional
+  w = c(x)/2
+  rect(coords[,1]-0.5,coords[,2]-s,coords[,1]+0.5,coords[,2]+s,col=col,border=NA)
+  if (grid){
+    abline(v=0:(nr)+.5,h=0:(nc)+.5,lty=2,col="#60606030")
+  }
+  box(col="#60606030",lty=2)
+}
+
+plot_rect_map = function(read_counts,cluster_annotation, output_file,Xcoef=1,Ycoef=1){
+  counts = read.table(read_counts,header=TRUE,as.is=TRUE)
+  annot = read.table(cluster_annotation, sep="\t",header=FALSE,as.is=TRUE)
+  N = nrow(annot)
+  colnames(annot) = c("cluster", "Automatic.classification")
+  annot$number.of.reads = rowSums(counts[1 : nrow(annot) ,-1])
+  unique_repeats = names(sort(table(c(annot$Automatic.classification,rep('nd',N))),decreasing = TRUE))
+
+  M = as.matrix(counts[1:N,-(1:2)])
+  rownames(M) = paste0("CL",rownames(M))
+  Mn1=(M)/apply(M,1,max)
+  Mn2=M/max(M)
+  Mn2=M/apply(M,1,sum)
+
+  ord1 = hclust(dist(Mn1),method = "ward.D")$order
+  ord2 = hclust(dist(t(Mn2)))$order
+  wdth = (400 + N*10 ) * Xcoef
+  hgt = (600 + ncol(M)*50) * Ycoef
+  ptsize = round((wdth*hgt)^(1/4))
+  png(output_file, width=wdth,height=hgt, pointsize = ptsize)  # was 50
+  ploting_area_width = 3 + log10(N)*3
+  ploting_area_sides = 1
+  layout(matrix(c(4,2,3,4,1,3),ncol=3,byrow = TRUE),
+         width=c(ploting_area_sides,ploting_area_width,ploting_area_sides),
+         height=c(3,ncol(M)*0.5))
+  par(xaxs='i', yaxs = 'i')
+  par(las=2,mar=c(4,0,0,0),cex.axis=0.5)
+  rectMap(Mn2[ord1,ord2],scale.by='none',col=1, grid=TRUE)
+  par(las=2,mar=c(1,0,1,0), mgp = c(2,0.5,0))
+  barplot(annot$number.of.reads[ord1], col = 1)
+  mtext(side = 2, "Cluster size", las = 3, line = 2, cex = 0.5)
+  par(mar=c(0,0,10,0))
+  plot.new()
+  st = dev.off()
+  ## calculate coordinates of the boxes used to create hyperlinks
+  X0 = wdth/(ploting_area_sides * 2 + ploting_area_width)* ploting_area_sides
+  X1 = wdth/(ploting_area_sides * 2 + ploting_area_width)*(ploting_area_sides + ploting_area_width)
+  L = round(seq(X0,X1, length.out = N + 1)[1:N])
+  R = round(seq(X0,X1, length.out = N + 1)[2:(N + 1)])
+  cn = rownames(Mn2[ord1,ord2])
+  cluster_links = paste0(
+    "seqclust/clustering/clusters/dir_CL",
+    sprintf("%04d", as.integer(substring(cn,3 ))),
+    "/index.html")
+  coords = paste0(L, ",", 1, ",", R, ",", hgt)
+  clustermap = paste0(
+    '\n<map name="clustermap"> \n',
+    paste0(
+      '<area shape="rect"\n      coords="',coords, '"\n',
+      '      href="', cluster_links, '"\n',
+      '      title="', cn, '"/>\n',
+      collapse = ""),
+    "</map>\n")
+  return(clustermap)
+}
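The connect-once pattern used by connect_to_databases()/disconnect_database()
above maps directly onto Python's sqlite3; a minimal sketch under that
assumption (names and paths are illustrative, not part of the pipeline):

    import sqlite3

    CONNECTED = False
    SEQDB = HITSORTDB = None

    def connect_to_databases(seqdb, hitsortdb):
        global CONNECTED, SEQDB, HITSORTDB
        if not CONNECTED:
            SEQDB = sqlite3.connect(seqdb)        # sequence database
            HITSORTDB = sqlite3.connect(hitsortdb)  # hitsort database
            CONNECTED = True

    def disconnect_database():
        global CONNECTED
        if CONNECTED:
            SEQDB.close()
            HITSORTDB.close()
            CONNECTED = False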
diff -r c56807be3b72 -r 3bc73f5dc785 lib/utils.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/utils.py Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+import os
+import hashlib
+
+from itertools import chain
+
+
+
+def md5checksum(filename, fail_if_missing=True):
+    try:
+        md5 = hashlib.md5()
+        with open(filename, "rb") as f:
+            for i in iter(lambda: f.read(4096), b""):
+                md5.update(i)
+    except FileNotFoundError as e:
+        if not fail_if_missing:
+            return "Not calculated!!!!  File {} is missing".format(filename)
+        else:
+            raise e
+
+    return md5.hexdigest()
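+
+# usage sketch (file name illustrative): md5checksum("reads.fasta") returns
+# the hex digest of the file content; with fail_if_missing=False a missing
+# file yields a warning string instead of raising FileNotFoundError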
+
+
+class FilePath(str):
+    '''
+    Extension of str - it just carries an additional attribute marking the string as a path to a file
+    '''
+
+    def __new__(cls, string):
+        obj = super(FilePath, cls).__new__(cls, string)
+        obj.filepath = True
+        return obj
+
+    def relative(self, start):
+        ''' return path relative to start'''
+        return os.path.relpath(self, start)
+
+
+def save_as_table(d, path, header=None, relative=True):
+    ''' take a list of dictionaries and save them as a tab-separated table;
+    pass header to fix a specific column order, otherwise the sorted union of all keys is used
+    '''
+    pathdir = os.path.dirname(path)
+    if not header:
+        all_keys = [i.keys() for i in d]
+        # sort so that the column order is deterministic
+        header = sorted(set(chain(*all_keys)))
+    with open(path, 'w') as f:
+        f.write("\t".join(header))
+        f.write("\n")
+        for i in d:
+            istr = []
+            for key in header:
+                if isinstance(i[key], FilePath):
+                    if relative:
+                        istr.append('"' + str(i[key].relative(pathdir)) + '"')
+                    else:
+                        istr.append('"' + str(i[key]) + '"')
+                else:
+                    if isinstance(i[key], str):
+                        istr.append('"' + str(i[key] + '"'))
+                    else:
+                        istr.append(str(i[key]))
+
+            f.write("\t".join(istr))
+            f.write("\n")
+
+
+def export_tandem_consensus(clusters_info, path, rank=1, n=1):
+    ''' export tandem repeat (TR) consensus sequences of the given rank to a fasta file '''
+    print("exporting fasta files")
+    s = None
+    with open(path, 'w') as f:
+        for cl in clusters_info:
+            if cl.TR_consensus and rank == cl.tandem_rank:
+                s = ">CL{index}_TR_{n}_x_{L}nt\n{sequence}\n".format(
+                    index=cl.index,
+                    n=n,
+                    L=cl.TR_monomer_length,
+                    sequence=n * cl.TR_consensus.replace('<pre>', ''))
+                f.write(s)
+    if s:
+        return path
+    else:
+        return None
+
+
+def file_len(filename):
+    '''count number of lines in file'''
+    with open(filename) as f:
+        n = 0
+        for _ in f:
+            n += 1
+    return n
+
+def go2line(f, L):
+    ''' position file object f so that the next read starts at line L (1-based);
+    len(line) is used as an offset, so a single-byte encoding is assumed '''
+    f.seek(0)
+    if L == 0:
+        return
+    i = 0
+    pos = f.tell()
+    for line in f:
+        i += 1
+        if i == L:
+            f.seek(pos)
+            return
+        else:
+            pos = pos + len(line)
+
+def format_query(x):
+    '''
+    format values as a quoted, parenthesised list for an SQL query: ("x","y",...)
+    '''
+    out = '("'+ '","'.join(
+        map(str, x)
+    ) + '")'
+    return out
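These helpers are small enough to exercise directly; a minimal usage sketch,
assuming the repository root is on sys.path (paths and values are made up):

    from lib.utils import FilePath, save_as_table, format_query

    rows = [{"cluster": 1, "html": FilePath("/tmp/report/cl1/index.html")},
            {"cluster": 2, "html": FilePath("/tmp/report/cl2/index.html")}]
    # FilePath cells are written relative to the table's directory,
    # e.g. "cl1/index.html"
    save_as_table(rows, "/tmp/report/summary.tsv", header=["cluster", "html"])
    sql = "SELECT * FROM communities WHERE cluster IN " + format_query([1, 2])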
diff -r c56807be3b72 -r 3bc73f5dc785 licence/Artistic_License
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/licence/Artistic_License Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,123 @@
+                  The Artistic License
+
+                       Preamble
+
+The intent of this document is to state the conditions under
+which a Package may be copied, such that the Copyright Holder
+maintains some semblance of artistic control over the
+development of the package, while giving the users of the
+package the right to use and distribute the Package in a
+more-or-less customary fashion, plus the right to make
+reasonable modifications.
+
+Definitions:
+
+    "Package" refers to the collection of files distributed by
+    the Copyright Holder, and derivatives of that collection of
+    files created through textual modification.
+    
+    "Standard Version" refers to such a Package if it has not
+    been modified, or has been modified in accordance with the
+    wishes of the Copyright Holder. 
+    
+    "Copyright Holder" is whoever is named in the copyright or
+    copyrights for the package.
+    
+    "You" is you, if you're thinking about copying or
+    distributing this Package.
+    
+    "Reasonable copying fee" is whatever you can justify on the
+    basis of media cost, duplication charges, time of people
+    involved, and so on. (You will not be required to justify it
+    to the Copyright Holder, but only to the computing community
+    at large as a market that must bear the fee.)
+    
+    "Freely Available" means that no fee is charged for the item
+    itself, though there may be fees involved in handling the
+    item. It also means that recipients of the item may
+    redistribute it under the same conditions they received it.
+
+1. You may make and give away verbatim copies of the source form
+of the Standard Version of this Package without restriction,
+provided that you duplicate all of the original copyright
+notices and associated disclaimers.
+
+2. You may apply bug fixes, portability fixes and other
+modifications derived from the Public Domain or from the
+Copyright Holder. A Package modified in such a way shall still
+be considered the Standard Version.
+
+3. You may otherwise modify your copy of this Package in any
+way, provided that you insert a prominent notice in each changed
+file stating how and when you changed that file, and provided
+that you do at least ONE of the following:
+
+    a) place your modifications in the Public Domain or
+    otherwise make them Freely Available, such as by posting
+    said modifications to Usenet or an equivalent medium, or
+    placing the modifications on a major archive site such
+    as ftp.uu.net, or by allowing the Copyright Holder to
+    include your modifications in the Standard Version of
+    the Package.
+
+    b) use the modified Package only within your corporation
+    or organization.
+
+    c) rename any non-standard executables so the names do
+    not conflict with standard executables, which must also
+    be provided, and provide a separate manual page for each
+    non-standard executable that clearly documents how it
+    differs from the Standard Version.
+
+    d) make other distribution arrangements with the
+    Copyright Holder.
+
+4. You may distribute the programs of this Package in object
+code or executable form, provided that you do at least ONE of
+the following:
+
+    a) distribute a Standard Version of the executables and
+    library files, together with instructions (in the manual
+    page or equivalent) on where to get the Standard
+    Version.
+
+    b) accompany the distribution with the machine-readable
+    source of the Package with your modifications.
+
+    c) accompany any non-standard executables with their
+    corresponding Standard Version executables, giving the
+    non-standard executables non-standard names, and clearly
+    documenting the differences in manual pages (or
+    equivalent), together with instructions on where to get
+    the Standard Version.
+
+    d) make other distribution arrangements with the
+    Copyright Holder.
+
+5. You may charge a reasonable copying fee for any distribution
+of this Package. You may charge any fee you choose for support
+of this Package. You may not charge a fee for this Package
+itself. However, you may distribute this Package in aggregate
+with other (possibly commercial) programs as part of a larger
+(possibly commercial) software distribution provided that you do
+not advertise this Package as a product of your own.
+
+6. The scripts and library files supplied as input to or
+produced as output from the programs of this Package do not
+automatically fall under the copyright of this Package, but
+belong to whomever generated them, and may be sold commercially,
+and may be aggregated with this Package.
+
+7. C or perl subroutines supplied by you and linked into this
+Package shall not be considered part of this Package.
+
+8. The name of the Copyright Holder may not be used to endorse
+or promote products derived from this software without specific
+prior written permission.
+
+9. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.
+
+                           The End
diff -r c56807be3b72 -r 3bc73f5dc785 licence/README.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/licence/README.md Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,44 @@
+RepeatExplorer includes programs distributed under other licenses:
+
+formatdb binary from legacy blast
+=================================
+license for blast:
+PUBLIC DOMAIN NOTICE
+
+National Center for Biotechnology Information
+
+This software/database is a "United States Government Work" under the terms of the United States Copyright Act. It was written as part of the author's official duties as a United States Government employee and thus cannot be copyrighted. This software/database is freely available to the public for use. The National Library of Medicine and the U.S. Government have not placed any restriction on its use or reproduction. Although all reasonable efforts have been taken to ensure the accuracy and reliability of the software and data, the NLM and the U.S. Government do not and cannot warrant the performance or results that may be obtained by using this software or data. The NLM and the U.S. Government disclaim all warranties, express or implied, including warranties of performance, merchantability or fitness for any particular purpose. Please cite the author in any work or product based on this material.
+
+
+
+
+mgblast and cap3 binaries from TGICL package
+============================================
+
+Copyright
+-----------
+Copyright (c) 2002-2003, The Institute for Genomic Research, All Rights Reserved
+This software is OSI Certified Open Source Software.
+OSI Certified is a certification mark of the Open Source Initiative.
+
+The following programs are distributed as binaries:
+
+cap3 
+----
+Contig Assembly Program version 3
+Huang, X. and Madan, A. (1999) 
+ CAP3: A DNA Sequence Assembly Program. 
+ Genome Research, 9: 868-877. 
+
+mgblast 
+-------
+ Modified version of the megablast program from the NCBI toolkit.
+ Zhang, Schwartz, Wagner, and Miller, 
+  A Greedy Algorithm for Aligning DNA Sequences, 
+  J. Comp. Biol. 2000, Feb-Apr;7(1-2):203-14 
+
+licence
+--------
+Artistic license
+
+  
\ No newline at end of file
diff -r c56807be3b72 -r 3bc73f5dc785 louvain/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/louvain/Makefile Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,32 @@
+# Makefile for the bundled Louvain community detection programs
+
+CC=g++
+CFLAGS= -ansi -O5 -Wall
+LDFLAGS= -ansi -lm -Wall
+EXEC=louvain_community louvain_convert louvain_hierarchy
+OBJ1= graph_binary.o community.o
+OBJ2= graph.o
+
+all: $(EXEC)
+
+louvain_community : $(OBJ1) main_community.o
+	$(CC) -o $@ $^ $(LDFLAGS)
+
+louvain_convert : $(OBJ2) main_convert.o
+	$(CC) -o $@ $^ $(LDFLAGS)
+
+louvain_hierarchy : main_hierarchy.o
+	$(CC) -o $@ $^ $(LDFLAGS)
+
+##########################################
+# Generic rules
+##########################################
+
+%.o: %.cpp %.h
+	$(CC) -o $@ -c $< $(CFLAGS)
+
+%.o: %.cpp
+	$(CC) -o $@ -c $< $(CFLAGS)
+
+clean:
+	rm -f *.o *~ $(EXEC)
diff -r c56807be3b72 -r 3bc73f5dc785 louvain/community.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/louvain/community.cpp Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,341 @@
+// File: community.cpp
+// -- community detection source file
+//-----------------------------------------------------------------------------
+// Community detection
+// Based on the article "Fast unfolding of community hierarchies in large networks"
+// Copyright (C) 2008 V. Blondel, J.-L. Guillaume, R. Lambiotte, E. Lefebvre
+//
+// This program must not be distributed without agreement of the above mentioned authors.
+//-----------------------------------------------------------------------------
+// Author   : E. Lefebvre, adapted by J.-L. Guillaume
+// Email    : jean-loup.guillaume@lip6.fr
+// Location : Paris, France
+// Time     : February 2008
+//-----------------------------------------------------------------------------
+// see readme.txt for more details
+
+#include "community.h"
+
+using namespace std;
+
+Community::Community(char * filename, char * filename_w, int type, int nbp, double minm) {
+  g = Graph(filename, filename_w, type);
+  size = g.nb_nodes;
+
+  neigh_weight.resize(size,-1);
+  neigh_pos.resize(size);
+  neigh_last=0;
+
+  n2c.resize(size);
+  in.resize(size);
+  tot.resize(size);
+
+  for (int i=0 ; i<size ; i++) {
+    n2c[i] = i;
+    tot[i] = g.weighted_degree(i);
+    in[i]  = g.nb_selfloops(i);
+  }
+
+  nb_pass = nbp;
+  min_modularity = minm;
+}
+
+Community::Community(Graph gc, int nbp, double minm) {
+  g = gc;
+  size = g.nb_nodes;
+
+  neigh_weight.resize(size,-1);
+  neigh_pos.resize(size);
+  neigh_last=0;
+
+  n2c.resize(size);
+  in.resize(size);
+  tot.resize(size);
+
+  for (int i=0 ; i<size ; i++) {
+    n2c[i] = i;
+    in[i]  = g.nb_selfloops(i);
+    tot[i] = g.weighted_degree(i);
+  }
+
+  nb_pass = nbp;
+  min_modularity = minm;
+}
+
+void
+Community::init_partition(char * filename) {
+  ifstream finput;
+  finput.open(filename,fstream::in);
+
+  // read partition
+  while (!finput.eof()) {
+    unsigned int node, comm;
+    finput >> node >> comm;
+
+    if (finput) {
+      int old_comm = n2c[node];
+      neigh_comm(node);
+
+      remove(node, old_comm, neigh_weight[old_comm]);
+
+      unsigned int i=0;
+      for ( i=0 ; i<neigh_last ; i++) {
+        unsigned int best_comm     = neigh_pos[i];
+        float best_nblinks  = neigh_weight[neigh_pos[i]];
+        if (best_comm==comm) {
+          insert(node, best_comm, best_nblinks);
+          break;
+        }
+      }
+      if (i==neigh_last)
+        insert(node, comm, 0);
+    }
+  }
+  finput.close();
+}
+
+// inline void
+// Community::remove(int node, int comm, double dnodecomm) {
+//   assert(node>=0 && node<size);
+
+//   tot[comm] -= g.weighted_degree(node);
+//   in[comm]  -= 2*dnodecomm + g.nb_selfloops(node);
+//   n2c[node]  = -1;
+// }
+
+// inline void
+// Community::insert(int node, int comm, double dnodecomm) {
+//   assert(node>=0 && node<size);
+
+//   tot[comm] += g.weighted_degree(node);
+//   in[comm]  += 2*dnodecomm + g.nb_selfloops(node);
+//   n2c[node]=comm;
+// }
+
+void
+Community::display() {
+  for (int i=0 ; i<size ; i++)
+    cerr << " " << i << "/" << n2c[i] << "/" << in[i] << "/" << tot[i] ;
+  cerr << endl;
+}
+
+
+double
+Community::modularity() {
+  double q  = 0.;
+  double m2 = (double)g.total_weight;
+
+  for (int i=0 ; i<size ; i++) {
+    if (tot[i]>0)
+      q += (double)in[i]/m2 - ((double)tot[i]/m2)*((double)tot[i]/m2);
+  }
+
+  return q;
+}
+
+void
+Community::neigh_comm(unsigned int node) {
+  for (unsigned int i=0 ; i<neigh_last ; i++)
+    neigh_weight[neigh_pos[i]]=-1;
+  neigh_last=0;
+
+  pair<vector<unsigned int>::iterator, vector<float>::iterator> p = g.neighbors(node);
+
+  unsigned int deg = g.nb_neighbors(node);
+
+  neigh_pos[0]=n2c[node];
+  neigh_weight[neigh_pos[0]]=0;
+  neigh_last=1;
+
+  for (unsigned int i=0 ; i<deg ; i++) {
+    unsigned int neigh        = *(p.first+i);
+    unsigned int neigh_comm   = n2c[neigh];
+    double
[... middle of the file truncated in this changeset view ...]
+  vector<int> renumber(size, -1);
+  for (int node=0 ; node<size ; node++) {
+    renumber[n2c[node]]++;
+  }
+
+  int final=0;
+  for (int i=0 ; i<size ; i++)
+    if (renumber[i]!=-1)
+      renumber[i]=final++;
+
+  // Compute communities
+  vector<vector<int> > comm_nodes(final);
+  for (int node=0 ; node<size ; node++) {
+    comm_nodes[renumber[n2c[node]]].push_back(node);
+  }
+
+  // Compute weighted graph
+  Graph g2;
+  g2.nb_nodes = comm_nodes.size();
+  g2.degrees.resize(comm_nodes.size());
+
+  int comm_deg = comm_nodes.size();
+  for (int comm=0 ; comm<comm_deg ; comm++) {
+    map<int,float> m;
+    map<int,float>::iterator it;
+
+    int comm_size = comm_nodes[comm].size();
+    for (int node=0 ; node<comm_size ; node++) {
+      pair<vector<unsigned int>::iterator, vector<float>::iterator> p = g.neighbors(comm_nodes[comm][node]);
+      int deg = g.nb_neighbors(comm_nodes[comm][node]);
+      for (int i=0 ; i<deg ; i++) {
+        int neigh        = *(p.first+i);
+        int neigh_comm   = renumber[n2c[neigh]];
+        double neigh_weight = (g.weights.size()==0)?1.:*(p.second+i);
+
+        it = m.find(neigh_comm);
+        if (it==m.end())
+          m.insert(make_pair(neigh_comm, neigh_weight));
+        else
+          it->second+=neigh_weight;
+      }
+    }
+    g2.degrees[comm]=(comm==0)?m.size():g2.degrees[comm-1]+m.size();
+    g2.nb_links+=m.size();
+
+    for (it = m.begin() ; it!=m.end() ; it++) {
+      g2.total_weight  += it->second;
+      g2.links.push_back(it->first);
+      g2.weights.push_back(it->second);
+    }
+  }
+
+  return g2;
+}
+
+
+bool
+Community::one_level() {
+  bool improvement=false ;
+  int nb_moves;
+  int nb_pass_done = 0;
+  double new_mod   = modularity();
+  double cur_mod   = new_mod;
+
+  vector<int> random_order(size);
+  for (int i=0 ; i<size ; i++)
+    random_order[i]=i;
+  for (int i=0 ; i<size-1 ; i++) {
+    int rand_pos = rand()%(size-i)+i;
+    int tmp      = random_order[i];
+    random_order[i] = random_order[rand_pos];
+    random_order[rand_pos] = tmp;
+  }
+
+  // repeat until no node moves in a pass or the modularity gain of a pass
+  // drops below min_modularity
+  do {
+    cur_mod = new_mod;
+    nb_moves = 0;
+    nb_pass_done++;
+
+    // for each node: remove the node from its community and insert it in the best community
+    for (int node_tmp=0 ; node_tmp<size ; node_tmp++) {
+//      int node = node_tmp;
+      int node = random_order[node_tmp];
+      int node_comm     = n2c[node];
+      double w_degree = g.weighted_degree(node);
+
+      // computation of all neighboring communities of current node
+      neigh_comm(node);
+      // remove node from its current community
+      remove(node, node_comm, neigh_weight[node_comm]);
+
+      // compute the nearest community for node
+      // default choice for future insertion is the former community
+      int best_comm        = node_comm;
+      double best_nblinks  = 0.;
+      double best_increase = 0.;
+      for (unsigned int i=0 ; i<neigh_last ; i++) {
+        double increase = modularity_gain(node, neigh_pos[i], neigh_weight[neigh_pos[i]], w_degree);
+        if (increase>best_increase) {
+          best_comm     = neigh_pos[i];
+          best_nblinks  = neigh_weight[neigh_pos[i]];
+          best_increase = increase;
+        }
+      }
+
+      // insert node in the nearest community
+      insert(node, best_comm, best_nblinks);
+
+      if (best_comm!=node_comm)
+        nb_moves++;
+    }
+
+    double total_tot=0;
+    double total_in=0;
+    for (unsigned int i=0 ; i<tot.size() ;i++) {
+      total_tot+=tot[i];
+      total_in+=in[i];
+    }
+
+    new_mod = modularity();
+    if (nb_moves>0)
+      improvement=true;
+
+  } while (nb_moves>0 && new_mod-cur_mod>min_modularity);
+
+  return improvement;
+}
diff -r c56807be3b72 -r 3bc73f5dc785 louvain/community.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/louvain/community.h Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,133 @@
+// File: community.h
+// -- community detection header file
+//-----------------------------------------------------------------------------
+// Community detection
+// Based on the article "Fast unfolding of community hierarchies in large networks"
+// Copyright (C) 2008 V. Blondel, J.-L. Guillaume, R. Lambiotte, E. Lefebvre
+//
+// This program must not be distributed without agreement of the above mentioned authors.
+//-----------------------------------------------------------------------------
+// Author   : E. Lefebvre, adapted by J.-L. Guillaume
+// Email    : jean-loup.guillaume@lip6.fr
+// Location : Paris, France
+// Time     : February 2008
+//-----------------------------------------------------------------------------
+// see readme.txt for more details
+
+#ifndef COMMUNITY_H
+#define COMMUNITY_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <vector>
+#include <map>
+
+#include "graph_binary.h"
+
+using namespace std;
+
+class Community {
+ public:
+  vector<double> neigh_weight;
+  vector<unsigned int> neigh_pos;
+  unsigned int neigh_last;
+
+  Graph g; // network to compute communities for
+  int size; // number of nodes in the network and size of all vectors
+  vector<int> n2c; // community to which each node belongs
+  vector<double> in,tot; // used to compute the modularity participation of each community
+
+  // number of pass for one level computation
+  // if -1, compute as many pass as needed to increase modularity
+  int nb_pass;
+
+  // a new pass is computed if the last one has generated an increase 
+  // greater than min_modularity
+  // if 0. even a minor increase is enough to go for one more pass
+  double min_modularity;
+
+  // constructors:
+  // reads graph from file using graph constructor
+  // type defines the weighted/unweighted status of the graph file
+  Community (char *filename, char *filename_w, int type, int nb_pass, double min_modularity);
+  // copy graph
+  Community (Graph g, int nb_pass, double min_modularity);
+
+  // initializes the partition with something other than all nodes alone
+  void init_partition(char *filename_part);
+
+  // display the community of each node
+  void display();
+
+  // remove the node from its current community with which it has dnodecomm links
+  inline void remove(int node, int comm, double dnodecomm);
+
+  // insert the node in comm with which it shares dnodecomm links
+  inline void insert(int node, int comm, double dnodecomm);
+
+  // compute the gain of modularity if node were inserted in comm
+  // given that node has dnodecomm links to comm.  The formula is:
+  // [(In(comm)+2d(node,comm))/2m - ((tot(comm)+deg(node))/2m)^2]-
+  // [In(comm)/2m - (tot(comm)/2m)^2 - (deg(node)/2m)^2]
+  // where In(comm)    = number of half-links strictly inside comm
+  //       Tot(comm)   = number of half-links inside or outside comm (sum(degrees))
+  //       d(node,com) = number of links from node to comm
+  //       deg(node)   = node degree
+  //       m           = number of links
+  inline double modularity_gain(int node, int comm, double dnodecomm, double w_degree);
+
+  // compute the set of neighboring communities of node
+  // for each community, gives the number of links from node to comm
+  void neigh_comm(unsigned int node);
+
+  // compute the modularity of the current partition
+  double modularity();
+
+  // displays the graph of communities as computed by one_level
+  void partition2graph();
+  // displays the current partition (with communities renumbered from 0 to k-1)
+  void display_partition();
+
+  // generates the binary graph of communities as computed by one_level
+  Graph partition2graph_binary();
+
+  // compute communities of the graph for one level
+  // return true if some nodes have been moved
+  bool one_level();
+};
+
+inline void
+Community::remove(int node, int comm, double dnodecomm) {
+  assert(node>=0 && node<size);
+
+  tot[comm] -= g.weighted_degree(node);
+  in[comm]  -= 2*dnodecomm + g.nb_selfloops(node);
+  n2c[node]  = -1;
+}
+
+inline void
+Community::insert(int node, int comm, double dnodecomm) {
+  assert(node>=0 && node<size);
+
+  tot[comm] += g.weighted_degree(node);
+  in[comm]  += 2*dnodecomm + g.nb_selfloops(node);
+  n2c[node]=comm;
+}
+
+inline double
+Community::modularity_gain(int node, int comm, double dnodecomm, double w_degree) {
+  assert(node>=0 && node<size);
+
+  double totc = (double)tot[comm];
+  double degc = (double)w_degree;
+  double m2   = (double)g.total_weight;
+  double dnc  = (double)dnodecomm;
+  
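+  // this equals the gain from the formula above multiplied by the constant
+  // positive factor m2/2 (the In and deg^2 terms cancel exactly), so the
+  // community maximizing this expression also maximizes the true gain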
+  return (dnc - totc*degc/m2);
+}
+
+
+#endif // COMMUNITY_H
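The in/tot vectors make the modularity of the current partition cheap to
evaluate. A toy re-computation of what Community::modularity() does, in
Python (values illustrative: two triangles joined by one edge, one community
per triangle, so m2 = 14 half-links):

    def modularity(in_, tot, m2):
        # q = sum over communities of in_c/m2 - (tot_c/m2)^2
        return sum(i / m2 - (t / m2) ** 2 for i, t in zip(in_, tot) if t > 0)

    print(modularity([6.0, 6.0], [7.0, 7.0], 14.0))  # ~0.357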
diff -r c56807be3b72 -r 3bc73f5dc785 louvain/graph.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/louvain/graph.cpp Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,157 @@
+// File: graph.cpp
+// -- simple graph handling source file
+//-----------------------------------------------------------------------------
+// Community detection
+// Based on the article "Fast unfolding of community hierarchies in large networks"
+// Copyright (C) 2008 V. Blondel, J.-L. Guillaume, R. Lambiotte, E. Lefebvre
+//
+// This program must not be distributed without agreement of the above mentioned authors.
+//-----------------------------------------------------------------------------
+// Author   : E. Lefebvre, adapted by J.-L. Guillaume
+// Email    : jean-loup.guillaume@lip6.fr
+// Location : Paris, France
+// Time     : February 2008
+//-----------------------------------------------------------------------------
+// see readme.txt for more details
+
+#include "graph.h"
+
+using namespace std;
+
+Graph::Graph(char *filename, int type) {
+  ifstream finput;
+  finput.open(filename,fstream::in);
+
+  int nb_links=0;
+
+  while (!finput.eof()) {
+    unsigned int src, dest;
+    double weight=1.;
+
+    if (type==WEIGHTED) {
+      finput >> src >> dest >> weight;
+    } else {
+      finput >> src >> dest;
+    }
+    
+    if (finput) {
+      if (links.size()<=max(src,dest)+1) {
+        links.resize(max(src,dest)+1);
+      }
+      
+      links[src].push_back(make_pair(dest,weight));
+      if (src!=dest)
+        links[dest].push_back(make_pair(src,weight));
+
+      nb_links++;
+    }
+  }
+
+  finput.close();
+}
+
+void
+Graph::renumber(int type) {
+  vector<int> linked(links.size(),-1);
+  vector<int> renum(links.size(),-1);
+  int nb=0;
+  
+  for (unsigned int i=0 ; i<links.size() ; i++) {
+    for (unsigned int j=0 ; j<links[i].size() ; j++) {
+      linked[i]=1;
+      linked[links[i][j].first]=1;
+    }
+  }
+  
+  for (unsigned int i=0 ; i<links.size() ; i++) {
+    if (linked[i]==1)
+      renum[i]=nb++;
+  }
+
+  for (unsigned int i=0 ; i<links.size() ; i++) {
+    if (linked[i]==1) {
+      for (unsigned int j=0 ; j<links[i].size() ; j++) {
+        links[i][j].first = renum[links[i][j].first];
+      }
+      links[renum[i]]=links[i];
+    }
+  }
+  links.resize(nb);
+}
+
+void
+Graph::clean(int type) {
+  for (unsigned int i=0 ; i<links.size() ; i++) {
+    map<int, float> m;
+    map<int, float>::iterator it;
+
+    for (unsigned int j=0 ; j<links[i].size() ; j++) {
+      it = m.find(links[i][j].first);
+      if (it==m.end())
+        m.insert(make_pair(links[i][j].first, links[i][j].second));
+      else if (type==WEIGHTED)
+        it->second+=links[i][j].second;
+    }
+    
+    vector<pair<int,float> > v;
+    for (it = m.begin() ; it!=m.end() ; it++)
+      v.push_back(*it);
+    links[i].clear();
+    links[i]=v;
+  }
+}
+
+void
+Graph::display(int type) {
+  for (unsigned int i=0 ; i<links.size() ; i++) {
+    for (unsigned int j=0 ; j<links[i].size() ; j++) {
+      int dest   = links[i][j].first;
+      float weight = links[i][j].second;
+      if (type==WEIGHTED)
+        cout << i << " " << dest << " " << weight << endl;
+      else
+        cout << i << " " << dest << endl;
+    }
+  }
+}
+
+void
+Graph::display_binary(char *filename, char *filename_w, int type) {
+  ofstream foutput;
+  foutput.open(filename, fstream::out | fstream::binary);
+
+  unsigned int s = links.size();
+
+  // outputs number of nodes
+  foutput.write((char *)(&s),4);
+
+  // outputs cumulative degree sequence
+  long tot=0;
+  for (unsigned int i=0 ; i<s ; i++) {
+    tot+=(long)links[i].size();
+    foutput.write((char *)(&tot),8);
+  }
+
+  // outputs links
+  for (unsigned int i=0 ; i<s ; i++) {
+    for (unsigned int j=0 ; j<links[i].size() ; j++) {
+      int dest = links[i][j].first;
+      foutput.write((char *)(&dest),4);
+    }
+  }
+  foutput.close();
+
+  // outputs weights in a separate file
+  if (type==WEIGHTED) {
+    ofstream foutput_w;
+    foutput_w.open(filename_w,fstream::out | fstream::binary);
+    for (unsigned int i=0 ; i<s ; i++) {
+      for (unsigned int j=0 ; j<links[i].size() ; j++) {
+        float weight = links[i][j].second;
+        foutput_w.write((char *)(&weight),4);
+      }
+    }
+    foutput_w.close();
+  }
+}
+
diff -r c56807be3b72 -r 3bc73f5dc785 louvain/graph.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/louvain/graph.h Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,47 @@
+// File: graph.h
+// -- simple graph handling header file
+//-----------------------------------------------------------------------------
+// Community detection
+// Based on the article "Fast unfolding of community hierarchies in large networks"
+// Copyright (C) 2008 V. Blondel, J.-L. Guillaume, R. Lambiotte, E. Lefebvre
+//
+// This program must not be distributed without agreement of the above mentioned authors.
+//-----------------------------------------------------------------------------
+// Author   : E. Lefebvre, adapted by J.-L. Guillaume
+// Email    : jean-loup.guillaume@lip6.fr
+// Location : Paris, France
+// Time     : February 2008
+//-----------------------------------------------------------------------------
+// see readme.txt for more details
+
+#ifndef GRAPH_H
+#define GRAPH_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <vector>
+#include <map>
+#include <set>
+#include <algorithm>
+
+#define WEIGHTED   0
+#define UNWEIGHTED 1
+
+using namespace std;
+
+class Graph {
+ public:
+  vector<vector<pair<int,float> > > links;
+  
+  Graph (char *filename, int type);
+  
+  void clean(int type);
+  void renumber(int type);
+  void display(int type);
+  void display_binary(char *filename, char *filename_w, int type);
+};
+
+#endif // GRAPH_H
diff -r c56807be3b72 -r 3bc73f5dc785 louvain/graph_binary.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/louvain/graph_binary.cpp Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,150 @@
+// File: graph_binary.cpp
+// -- graph handling source
+//-----------------------------------------------------------------------------
+// Community detection 
+// Based on the article "Fast unfolding of community hierarchies in large networks"
+// Copyright (C) 2008 V. Blondel, J.-L. Guillaume, R. Lambiotte, E. Lefebvre
+//
+// This program must not be distributed without agreement of the above mentioned authors.
+//-----------------------------------------------------------------------------
+// Author   : E. Lefebvre, adapted by J.-L. Guillaume
+// Email    : jean-loup.guillaume@lip6.fr
+// Location : Paris, France
+// Time     : February 2008
+//-----------------------------------------------------------------------------
+// see readme.txt for more details
+
+#include <sys/mman.h>
+#include <fstream>
+#include "graph_binary.h"
+#include "math.h"
+
+Graph::Graph() {
+  nb_nodes     = 0;
+  nb_links     = 0;
+  total_weight = 0;
+}
+
+Graph::Graph(char *filename, char *filename_w, int type) {
+  ifstream finput;
+  finput.open(filename,fstream::in | fstream::binary);
+
+  // Read number of nodes on 4 bytes
+  finput.read((char *)&nb_nodes, 4);
+  assert(finput.rdstate() == ios::goodbit);
+
+  // Read cumulative degree sequence: 8 bytes for each node
+  // cum_degree[0]=degree(0); cum_degree[1]=degree(0)+degree(1), etc.
+  degrees.resize(nb_nodes);
+  finput.read((char *)&degrees[0], nb_nodes*8);
+
+  // Read links: 4 bytes for each link (each link is counted twice)
+  nb_links=degrees[nb_nodes-1];
+  links.resize(nb_links);
+  finput.read((char *)(&links[0]), (long)nb_links*4);  
+
+  // IF WEIGHTED : read weights: 4 bytes for each link (each link is counted twice)
+  weights.resize(0);
+  total_weight=0;
+  if (type==WEIGHTED) {
+    ifstream finput_w;
+    finput_w.open(filename_w,fstream::in | fstream::binary);
+    weights.resize(nb_links);
+    finput_w.read((char *)&weights[0], (long)nb_links*4);  
+  }    
+
+  // Compute total weight
+  for (unsigned int i=0 ; i<nb_nodes ; i++) {
+    total_weight += (double)weighted_degree(i);
+  }
+}
+
+Graph::Graph(int n, int m, double t, int *d, int *l, float *w) {
+/*  nb_nodes     = n;
+  nb_links     = m;
+  total_weight = t;
+  degrees      = d;
+  links        = l;
+  weights      = w;*/
+}
+
+
+void
+Graph::display() {
+/*  for (unsigned int node=0 ; node<nb_nodes ; node++) {
+    pair<vector<unsigned int>::iterator, vector<float>::iterator > p = neighbors(node);
+    for (unsigned int i=0 ; i<nb_neighbors(node) ; i++) {
+      if (node<=*(p.first+i)) {
+ if (weights.size()!=0)
+   cout << node << " " << *(p.first+i) << " " << *(p.second+i) << endl;
+ else
+   cout << node << " " << *(p.first+i) << endl;
+      }
+    }   
+  }*/
+  for (unsigned int node=0 ; node<nb_nodes ; node++) {
+    pair<vector<unsigned int>::iterator, vector<float>::iterator > p = neighbors(node);
+    cout << node << ":" ;
+    for (unsigned int i=0 ; i<nb_neighbors(node) ; i++) {
+      if (true) {
+        if (weights.size()!=0)
+          cout << " (" << *(p.first+i) << " " << *(p.second+i) << ")";
+        else
+          cout << " " << *(p.first+i);
+      }
+    }
+    cout << endl;
+  }
+}
+
+void
+Graph::display_reverse() {
+  for (unsigned int node=0 ; node<nb_nodes ; node++) {
+    pair<vector<unsigned int>::iterator, vector<float>::iterator > p = neighbors(node);
+    for (unsigned int i=0 ; i<nb_neighbors(node) ; i++) {
+      if (node>*(p.first+i)) {
+        if (weights.size()!=0)
+          cout << *(p.first+i) << " " << node << " " << *(p.second+i) << endl;
+        else
+          cout << *(p.first+i) << " " << node << endl;
+      }
+    }   
+  }
+}
+
+
+bool
+Graph::check_symmetry() {
+  int error=0;
+  for (unsigned int node=0 ; node<nb_nodes ; node++) {
+    pair<vector<unsigned int>::iterator, vector<float>::iterator > p = neighbors(node);
+    for (unsigned int i=0 ; i<nb_neighbors(node) ; i++) {
+      unsigned int neigh = *(p.first+i);
+      float weight = *(p.second+i);
+      
+      pair<vector<unsigned int>::iterator, vector<float>::iterator > p_neigh = neighbors(neigh);
+      for (unsigned int j=0 ; j<nb_neighbors(neigh) ; j++) {
+        unsigned int neigh_neigh = *(p_neigh.first+j);
+        float neigh_weight = *(p_neigh.second+j);
+
+        if (node==neigh_neigh && weight!=neigh_weight) {
+          cout << node << " " << neigh << " " << weight << " " << neigh_weight << endl;
+          if (error++==10)
+            exit(0);
+        }
+      }
+    }
+  }
+  return (error==0);
+}
+
+
+void
+Graph::display_binary(char *outfile) {
+  ofstream foutput;
+  foutput.open(outfile ,fstream::out | fstream::binary);
+
+  foutput.write((char *)(&nb_nodes),4);
+  foutput.write((char *)(&degrees[0]),8*nb_nodes);  // degrees are 8-byte cumulative values
+  foutput.write((char *)(&links[0]),4*nb_links);    // links are 4-byte node ids
+}
diff -r c56807be3b72 -r 3bc73f5dc785 louvain/graph_binary.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/louvain/graph_binary.h Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,134 @@
+// File: graph_binary.h
+// -- graph handling header file
+//-----------------------------------------------------------------------------
+// Community detection 
+// Based on the article "Fast unfolding of community hierarchies in large networks"
+// Copyright (C) 2008 V. Blondel, J.-L. Guillaume, R. Lambiotte, E. Lefebvre
+//
+// This program must not be distributed without agreement of the above mentioned authors.
+//-----------------------------------------------------------------------------
+// Author   : E. Lefebvre, adapted by J.-L. Guillaume
+// Email    : jean-loup.guillaume@lip6.fr
+// Location : Paris, France
+// Time     : February 2008
+//-----------------------------------------------------------------------------
+// see readme.txt for more details
+
+#ifndef GRAPH_BINARY_H
+#define GRAPH_BINARY_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <malloc.h>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <vector>
+#include <map>
+#include <algorithm>
+
+#define WEIGHTED   0
+#define UNWEIGHTED 1
+
+using namespace std;
+
+class Graph {
+ public:
+  unsigned int nb_nodes;
+  unsigned long nb_links;
+  double total_weight;  
+
+  vector<unsigned long> degrees;
+  vector<unsigned int> links;
+  vector<float> weights;
+
+  Graph();
+
+  // binary file format is
+  // 4 bytes for the number of nodes in the graph
+  // 8*(nb_nodes) bytes for the cumulative degree for each node:
+  //    deg(0)=degrees[0]
+  //    deg(k)=degrees[k]-degrees[k-1]
+  // 4*(sum_degrees) bytes for the links
+  // IF WEIGHTED 4*(sum_degrees) bytes for the weights in a separate file
+  Graph(char *filename, char *filename_w, int type);
+  
+  Graph(int nb_nodes, int nb_links, double total_weight, int *degrees, int *links, float *weights);
+
+  void display(void);
+  void display_reverse(void);
+  void display_binary(char *outfile);
+  bool check_symmetry();
+
+
+  // return the number of neighbors (degree) of the node
+  inline unsigned int nb_neighbors(unsigned int node);
+
+  // return the number of self loops of the node
+  inline double nb_selfloops(unsigned int node);
+
+  // return the weighted degree of the node
+  inline double weighted_degree(unsigned int node);
+
+  // return pointers to the first neighbor and first weight of the node
+  inline pair<vector<unsigned int>::iterator, vector<float>::iterator > neighbors(unsigned int node);
+};
+
+
+inline unsigned int
+Graph::nb_neighbors(unsigned int node) {
+  assert(node>=0 && node<nb_nodes);
+
+  if (node==0)
+    return degrees[0];
+  else
+    return degrees[node]-degrees[node-1];
+}
+
+inline double
+Graph::nb_selfloops(unsigned int node) {
+  assert(node>=0 && node<nb_nodes);
+
+  pair<vector<unsigned int>::iterator, vector<float>::iterator > p = neighbors(node);
+  for (unsigned int i=0 ; i<nb_neighbors(node) ; i++) {
+    if (*(p.first+i)==node) {
+      if (weights.size()!=0)
+        return (double)*(p.second+i);
+      else
+        return 1.;
+    }
+  }
+  return 0.;
+}
+
+inline double
+Graph::weighted_degree(unsigned int node) {
+  assert(node>=0 && node<nb_nodes);
+
+  if (weights.size()==0)
+    return (double)nb_neighbors(node);
+  else {
+    pair<vector<unsigned int>::iterator, vector<float>::iterator > p = neighbors(node);
+    double res = 0;
+    for (unsigned int i=0 ; i<nb_neighbors(node) ; i++) {
+      res += (double)*(p.second+i);
+    }
+    return res;
+  }
+}
+
+inline pair<vector<unsigned int>::iterator, vector<float>::iterator >
+Graph::neighbors(unsigned int node) {
+  assert(node>=0 && node<nb_nodes);
+
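+  // degrees[] holds cumulative degrees, so the links of a node start at
+  // offset degrees[node-1] (offset 0 for node 0)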
+  if (node==0)
+    return make_pair(links.begin(), weights.begin());
+  else if (weights.size()!=0)
+    return make_pair(links.begin()+degrees[node-1], weights.begin()+degrees[node-1]);
+  else
+    return make_pair(links.begin()+degrees[node-1], weights.begin());
+}
+
+
+#endif // GRAPH_BINARY_H
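The binary layout documented above (4-byte node count, 8-byte cumulative
degrees, 4-byte link targets, weights in a separate file) is simple to
produce from other tools. A little-endian Python writer sketch (function and
file names are hypothetical, not part of this package):

    import struct

    def write_binary_graph(adj, path, weights_path=None):
        # adj: list of lists of (neighbor, weight); every link appears twice
        with open(path, "wb") as out:
            out.write(struct.pack("<I", len(adj)))        # number of nodes
            cumulative = 0
            for neighbors in adj:                         # cumulative degrees
                cumulative += len(neighbors)
                out.write(struct.pack("<q", cumulative))
            for neighbors in adj:                         # link targets
                for dest, _ in neighbors:
                    out.write(struct.pack("<I", dest))
        if weights_path is not None:                      # optional weights
            with open(weights_path, "wb") as out_w:
                for neighbors in adj:
                    for _, w in neighbors:
                        out_w.write(struct.pack("<f", w))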
diff -r c56807be3b72 -r 3bc73f5dc785 louvain/main_community.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/louvain/main_community.cpp Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,169 @@
+// File: main_community.cpp
+// -- community detection, sample main file
+//-----------------------------------------------------------------------------
+// Community detection 
+// Based on the article "Fast unfolding of community hierarchies in large networks"
+// Copyright (C) 2008 V. Blondel, J.-L. Guillaume, R. Lambiotte, E. Lefebvre
+//
+// This program must not be distributed without agreement of the above mentioned authors.
+//-----------------------------------------------------------------------------
+// Author   : E. Lefebvre, adapted by J.-L. Guillaume
+// Email    : jean-loup.guillaume@lip6.fr
+// Location : Paris, France
+// Time     : February 2008
+//-----------------------------------------------------------------------------
+// see readme.txt for more details
+
+#include <stdlib.h>
+#include <math.h>
+#include <string>
+#include <iostream> 
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <algorithm>
+
+#include "graph_binary.h"
+#include "community.h"
+
+using namespace std;
+
+char *filename = NULL;
+char *filename_w = NULL;
+char *filename_part = NULL;
+int type       = UNWEIGHTED;
+int nb_pass    = 0;
+double precision = 0.000001;
+int display_level = -2;
+int k1 = 16;
+int seed = 123;
+
+bool verbose = false;
+
+void
+usage(char *prog_name, const char *more) {
+  cerr << more;
+  cerr << "usage: " << prog_name << " input_file [-w weight_file] [-p part_file] [-q epsilon] [-l display_level] [-v] [-h]" << endl << endl;
+  cerr << "input_file: file containing the graph to decompose in communities." << endl;
+  cerr << "-w file\tread the graph as a weighted one (weights are set to 1 otherwise)." << endl;
+  cerr << "-p file\tstart the computation with a given partition instead of the trivial partition." << endl;
+  cerr << "\tfile must contain lines \"node community\"." << endl;
+  cerr << "-q eps\ta given pass stops when the modularity is increased by less than epsilon." << endl;
+  cerr << "-l k\tdisplays the graph of level k rather than the hierachical structure." << endl;
+  cerr << "\tif k=-1 then displays the hierarchical structure rather than the graph at a given level." << endl;
+  cerr << "-v\tverbose mode: gives computation time, information about the hierarchy and modularity." << endl;
+  cerr << "-s\tseed for rundom number generator setting, integer(123 deafault)" << endl;
+  cerr << "-h\tshow this usage message." << endl;
+  exit(0);
+}
+
+void
+parse_args(int argc, char **argv) {
+  if (argc<2)
+    usage(argv[0], "Bad arguments number\n");
+
+  for (int i = 1; i < argc; i++) {
+    if(argv[i][0] == '-') {
+      switch(argv[i][1]) {
+      case 'w':
+        type = WEIGHTED;
+        filename_w = argv[i+1];
+        i++;
+        break;
+      case 'p':
+        filename_part = argv[i+1];
+        i++;
+        break;
+      case 'q':
+        precision = atof(argv[i+1]);
+        i++;
+        break;
+      case 'l':
+        display_level = atoi(argv[i+1]);
+        i++;
+        break;
+      case 's':
+        seed = atoi(argv[i+1]);
+        i++;
+        break;
+      case 'k':
+        k1 = atoi(argv[i+1]);
+        i++;
+        break;
+      case 'v':
+        verbose=true;
+        break;
+      default:
+        usage(argv[0], "Unknown option\n");
+      }
+    } else {
+      if (filename==NULL)
+        filename = argv[i];
+      else
+        usage(argv[0], "More than one filename\n");
+    }
+  }
+}
+
+void
+display_time(const char *str) {
+  time_t rawtime;
+  time ( &rawtime );
+  cerr << str << ": " << ctime (&rawtime);
+}
+
+int
+main(int argc, char **argv) {
+  parse_args(argc, argv);
+  srand(seed);
+  time_t time_begin, time_end;
+  time(&time_begin);
+  if (verbose)
+    display_time("Begin");
+
+  Community c(filename, filename_w, type, -1, precision);
+  if (filename_part!=NULL)
+    c.init_partition(filename_part);
+  Graph g;
+  bool improvement=true;
+  double mod=c.modularity(), new_mod;
+  int level=0;
+
+  do {
+    if (verbose) {
+      cerr << "level " << level << ":\n";
+      display_time("  start computation");
+      cerr << "  network size: " 
+    << c.g.nb_nodes << " nodes, " 
+    << c.g.nb_links << " links, "
+    << c.g.total_weight << " weight." << endl;
+    }
+
+    improvement = c.one_level();
+    new_mod = c.modularity();
+    if (++level==display_level)
+      g.display();
+    if (display_level==-1)
+      c.display_partition();
+    g = c.partition2graph_binary();
+    c = Community(g, -1, precision);
+
+    if (verbose)
+      cerr << "  modularity increased from " << mod << " to " << new_mod << endl;
+
+    mod=new_mod;
+    if (verbose)
+      display_time("  end computation");
+
+    if (filename_part!=NULL && level==1) // do at least one more computation if partition is provided
+      improvement=true;
+  } while(improvement);
+
+  time(&time_end);
+  if (verbose) {
+    display_time("End");
+    cerr << "Total duration: " << (time_end-time_begin) << " sec." << endl;
+  }
+  cerr << new_mod << endl;
+}
+
diff -r c56807be3b72 -r 3bc73f5dc785 louvain/main_convert.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/louvain/main_convert.cpp Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,87 @@
+// File: main_convert.cpp
+// -- conversion of a graph from ascii to binary, sample main file
+//-----------------------------------------------------------------------------
+// Community detection 
+// Based on the article "Fast unfolding of community hierarchies in large networks"
+// Copyright (C) 2008 V. Blondel, J.-L. Guillaume, R. Lambiotte, E. Lefebvre
+//
+// This program must not be distributed without agreement of the above mentioned authors.
+//-----------------------------------------------------------------------------
+// Author   : E. Lefebvre, adapted by J.-L. Guillaume
+// Email    : jean-loup.guillaume@lip6.fr
+// Location : Paris, France
+// Time     : February 2008
+//-----------------------------------------------------------------------------
+// see readme.txt for more details
+
+#include "graph.h"
+
+using namespace std;
+
+char *infile   = NULL;
+char *outfile  = NULL;
+char *outfile_w  = NULL;
+int type       = UNWEIGHTED;
+bool do_renumber = false;
+
+void
+usage(char *prog_name, const char *more) {
+  cerr << more;
+  cerr << "usage: " << prog_name << " -i input_file -o outfile [-r] [-w outfile_weight]" << endl << endl;
+  cerr << "read the graph and convert it to binary format." << endl;
+  cerr << "-r\tnodes are renumbered from 0 to nb_nodes-1 (the order is kept)." << endl;
+  cerr << "-w filename\tread the graph as a weighted one and writes the weights in a separate file." << endl;
+  cerr << "-h\tshow this usage message." << endl;
+  exit(0);
+}
+
+void
+parse_args(int argc, char **argv) {
+  for (int i = 1; i < argc; i++) {
+    if(argv[i][0] == '-') {
+      switch(argv[i][1]) {
+      case 'i':
+        if (i==argc-1)
+          usage(argv[0], "Infile missing\n");
+        infile = argv[i+1];
+        i++;
+        break;
+      case 'o':
+        if (i==argc-1)
+          usage(argv[0], "Outfile missing\n");
+        outfile = argv[i+1];
+        i++;
+        break;
+      case 'w' :
+        type = WEIGHTED;
+        outfile_w = argv[i+1];
+        i++;
+        break;
+      case 'r' :
+        do_renumber=true;
+        break;
+      default:
+        usage(argv[0], "Unknown option\n");
+      }
+    } else {
+      usage(argv[0], "More than one filename\n");
+    }
+  }
+  if (infile==NULL || outfile==NULL)
+    usage(argv[0], "In or outfile missing\n");
+}
+
+int
+main(int argc, char **argv) {
+  parse_args(argc, argv);
+
+  Graph g(infile, type);
+
+  g.clean(type);
+
+  if (do_renumber)
+    g.renumber(type);
+
+  g.display_binary(outfile, outfile_w, type);
+
+}
diff -r c56807be3b72 -r 3bc73f5dc785 louvain/main_hierarchy.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/louvain/main_hierarchy.cpp Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,116 @@
+// File: main_hierarchy.cpp
+// -- output community structure handling (number of levels, communities of one level)
+//-----------------------------------------------------------------------------
+// Community detection 
+// Based on the article "Fast unfolding of community hierarchies in large networks"
+// Copyright (C) 2008 V. Blondel, J.-L. Guillaume, R. Lambiotte, E. Lefebvre
+//
+// This program must not be distributed without agreement of the above mentioned authors.
+//-----------------------------------------------------------------------------
+// Author   : E. Lefebvre, adapted by J.-L. Guillaume
+// Email    : jean-loup.guillaume@lip6.fr
+// Location : Paris, France
+// Time     : February 2008
+//-----------------------------------------------------------------------------
+// see readme.txt for more details
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <vector>
+#include <map>
+#include <set>
+#include <algorithm>
+
+using namespace std;
+
+int display_level = -1;
+char *filename = NULL;
+
+void
+usage(char *prog_name, const char *more) {
+  cerr << more;
+  cerr << "usage: " << prog_name << " input_file [options]" << endl << endl;
+  cerr << "input_file: read the community tree from this file." << endl;
+  cerr << "-l xx\t display the community structure for the level xx." << endl;
+  cerr << "\t outputs the community for each node." << endl;
+  cerr << "\t xx must belong to [-1,N] if N is the number of levels." << endl;
+  cerr << "-n\t displays the number of levels and the size of each level." << endl;
+  cerr << "\t equivalent to -l -1." << endl;
+  cerr << "-h\tshow this usage message." << endl;
+  exit(0);
+}
+
+void
+parse_args(int argc, char **argv) {
+  if (argc<2)
+    usage(argv[0], "Bad arguments number\n");
+
+  for (int i = 1; i < argc; i++) {
+    if(argv[i][0] == '-') {
+      switch(argv[i][1]) {
+      case 'l':
+        display_level = atoi(argv[i+1]);
+        i++;
+        break;
+      case 'n':
+        display_level = -1;
+        break;
+      default:
+        usage(argv[0], "Unknown option\n");
+      }
+    } else {
+      if (filename==NULL)
+        filename = argv[i];
+      else
+        usage(argv[0], "More than one filename\n");
+    }
+  }
+  if (filename==NULL)
+    usage(argv[0], "No input file has been provided.\n");
+}
+int
+main(int argc, char **argv) {
+  parse_args(argc, argv);
+
+  vector<vector<int> >levels;
+
+  ifstream finput;
+  finput.open(filename,fstream::in);
+
+  int l=-1;
+  while (!finput.eof()) {
+    int node, nodecomm;
+    finput >> node >> nodecomm;
+
+    if (finput) {
+      if (node==0) {
+        l++;
+        levels.resize(l+1);
+      }
+      levels[l].push_back(nodecomm);
+    }
+  }
+
+  if (display_level==-1) {
+    cout << "Number of levels: " << levels.size() << endl;
+    for (unsigned int i=0 ; i<levels.size();i++)
+      cout << "level " << i << ": " << levels[i].size() << " nodes" << endl;
+  } else if (display_level<0 || (unsigned)display_level>=levels.size()) {
+    cerr << "Incorrect level\n";
+  } else {
+    vector<int> n2c(levels[0].size());
+
+    for (unsigned int i=0 ; i<levels[0].size() ; i++)
+      n2c[i]=i;
+    
+    for (l=0 ; l<display_level ; l++)
+      for (unsigned int node=0 ; node<levels[0].size() ; node++)
+        n2c[node] = levels[l][n2c[node]];
+    
+    for (unsigned int node=0 ; node<levels[0].size() ; node++)
+      cout << node << " " << n2c[node] << endl;
+  }
+}
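The level files compose: the community of a node at level k is found by
chaining the per-level assignments, exactly as in the loop above. A toy
Python illustration (data made up):

    levels = [[0, 0, 1, 1],   # level 0: 4 nodes -> 2 communities
              [0, 0]]         # level 1: 2 communities -> 1 community

    def community_at_level(node, k, levels):
        comm = node
        for l in range(k):
            comm = levels[l][comm]   # follow the community upward
        return comm

    print([community_at_level(n, 2, levels) for n in range(4)])  # [0, 0, 0, 0]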
diff -r c56807be3b72 -r 3bc73f5dc785 louvain/main_random.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/louvain/main_random.cpp Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,19 @@
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#include <iostream>
+
+using namespace std;
+
+char *outfile  = NULL;
+
+int
+main(int argc, char **argv) {
+  srand(time(NULL)+getpid());
+
+  int n = atoi(argv[1]);
+  int degree = atoi(argv[2]);
+
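+  // each ordered pair (i,j) is printed with probability degree/n, so the
+  // expected number of generated edges is n*degree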
+  for (int i=0 ; i<n ; i++) {
+    for (int j=0 ; j<n ; j++) {
+      int r  = rand()%n;
+      if (r<degree)
+        cout << i << " " << j << endl;
+    }
+  }
+}
diff -r c56807be3b72 -r 3bc73f5dc785 louvain/readme.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/louvain/readme.txt Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,127 @@
+-----------------------------------------------------------------------------
+
+Community detection
+Version 0.2 - not compatible with the previous version, see below.
+
+Based on the article "Fast unfolding of community hierarchies in large networks"
+Copyright (C) 2008 V. Blondel, J.-L. Guillaume, R. Lambiotte, E. Lefebvre
+
+
+This file is part of Louvain algorithm.
+
+Louvain algorithm is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Louvain algorithm is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Louvain algorithm.  If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------------
+
+Author   : E. Lefebvre, adapted by J.-L. Guillaume
+Email    : jean-loup.guillaume@lip6.fr
+Location : Paris, France
+Time     : February 2008
+
+-----------------------------------------------------------------------------
+
+Disclaimer:
+If you find a bug, please send a bug report to jean-loup.guillaume@lip6.fr
+including if necessary the input file and the parameters that caused the bug.
+You can also send me any comment or suggestion about the program.
+
+Note that the program expects friendly use and therefore performs only
+minimal validation of its arguments.
+
+-----------------------------------------------------------------------------
+
+
+This package offers a set of programs to compute communities on
+weighted or unweighted graphs. A typical sequence of
+actions is:
+
+1. Conversion from a text format (each line contains a pair "src dest"):
+./convert -i graph.txt -o graph.bin
+This program can also be used to convert weighted graphs (each line contains
+a triple "src dest w") using the -w option:
+./convert -i graph.txt -o graph.bin -w graph.weights
+Finally, nodes can be renumbered from 0 to nb_nodes - 1 using the -r option
+(less space is wasted in some cases):
+./convert -i graph.txt -o graph.bin -r
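+
+As a small illustration (node ids and weights here are arbitrary), an
+unweighted graph.txt with three edges could simply contain:
+0 1
+0 2
+1 2
+and a weighted graph.txt would contain triples such as "0 1 2.5".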
+
+
+2. Compute communities and display the hierarchical tree:
+./community graph.bin -l -1 -v > graph.tree
+
+To speed up computation (at the cost of some quality), one can use
+the -q option to specify that the program must stop if the increase in
+modularity is below epsilon for a given iteration or pass:
+./community graph.bin -l -1 -q 0.0001 > graph.tree
+
+The program can deal with weighted networks using -w option:
+./community graph.bin -l -1 -w graph.weights > graph.tree
+In this specific case, the conversion step must also use the -w option.
+
+The program can also start from any given partition using the -p option:
+./community graph.bin -p graph.part -v
+
+
+3. Display information on the tree structure (number of hierarchical
+levels and nodes per level):
+./hierarchy graph.tree
+
+Display the community membership of each node for a given level of
+the tree:
+./hierarchy graph.tree -l 2 > graph_node2comm_level2
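+
+The graph.tree file itself is a plain-text list of "node community" pairs;
+a new level starts each time node 0 appears again. A toy example with two
+levels (values are illustrative only):
+0 0
+1 0
+2 1
+3 1
+0 0
+1 0
+Here "./hierarchy graph.tree -l 1" would print the level-1 community of
+each original node, i.e. "0 0", "1 0", "2 1" and "3 1".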
+
+-----------------------------------------------------------------------------
+
+Known bugs or restrictions:
+- the number of nodes is stored on 4 bytes and the number of links on 8 bytes.
+
+-----------------------------------------------------------------------------
+
+Version history:
+The following modifications have been made from version 0.1:
+- weights are now stored using floats (integer in V0.1)
+- degrees are stored on 8 bytes allowing large graphs to be decomposed
+- weights are stored in a separate file, which allows disk usage reduction if
+  different weights are to be used on the same topology
+- any given partition can be used as a seed for the algorithm rather than just
+  the trivial partition where each node belongs to its own community
+- the initial network can contain loops if the network is considered weighted
+- graph is not renumbered by default in the convert program
+- an optional verbose mode has been added and the program is silent by default
+- some portions of the code have been rewritten in more idiomatic C++ (type* -> vector<type>)
+These modifications imply that any binary graph file created with the previous
+version of the code is not compatible with this version. You must therefore
+regenerate all the binary files.
+
+Version 0.1:
+- initial community detection algorithm
+
diff -r c56807be3b72 -r 3bc73f5dc785 pylintrc
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pylintrc Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,380 @@
+[MASTER]
+
+# Specify a configuration file.
+#rcfile=
+
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+
+init-hook='import sys; sys.path.append(".")'
+
+# Add files or directories to the blacklist. They should be base names, not
+# paths.
+ignore=CVS
+
+# Pickle collected data for later comparisons.
+persistent=yes
+
+# List of plugins (as comma separated values of python modules names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Use multiple processes to speed up Pylint.
+jobs=1
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code
+extension-pkg-whitelist=
+
+# Allow optimization of some AST trees. This will activate a peephole AST
+# optimizer, which will apply various small optimizations. For instance, it can
+# be used to obtain the result of joining multiple strings with the addition
+# operator. Joining a lot of strings can lead to a maximum recursion error in
+# Pylint and this flag can prevent that. It has one side effect, the resulting
+# AST will be different than the one from reality.
+optimize-ast=no
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
+confidence=
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+#enable=
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once).You can also use "--disable=all" to
+# disable everything first and then reenable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use"--disable=all --enable=classes
+# --disable=W"
+disable=parameter-unpacking,suppressed-message,intern-builtin,hex-method,next-method-called,useless-suppression,no-absolute-import,execfile-builtin,metaclass-assignment,setslice-method,unichr-builtin,round-builtin,getslice-method,coerce-method,apply-builtin,print-statement,old-ne-operator,cmp-method,long-builtin,cmp-builtin,reload-builtin,filter-builtin-not-iterating,standarderror-builtin,file-builtin,buffer-builtin,indexing-exception,raising-string,old-octal-literal,range-builtin-not-iterating,oct-method,old-raise-syntax,xrange-builtin,zip-builtin-not-iterating,reduce-builtin,unicode-builtin,raw_input-builtin,coerce-builtin,dict-iter-method,basestring-builtin,long-suffix,delslice-method,dict-view-method,old-division,input-builtin,unpacking-in-except,map-builtin-not-iterating,nonzero-method,import-star-module-level,using-cmp-argument,backtick,W1202
+
+
+[REPORTS]
+
+# Set the output format. Available formats are text, parseable, colorized, msvs
+# (visual studio) and html. You can also give a reporter class, eg
+# mypackage.mymodule.MyReporterClass.
+output-format=text
+
+# Put messages in a separate file for each module / package specified on the
+# command line instead of printing them on stdout. Reports (if any) will be
+# written in a file name "pylint_global.[txt|html]".
+files-output=no
+
+# Tells whether to display a full report or only the messages
+reports=yes
+
+# Python expression which should ret
[... diff truncated in source ...]
=100
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+
+# List of optional constructs for which whitespace checking is disabled. `dict-
+# separator` is used to allow tabulation in dicts, etc.: {1  : 1,\n222: 2}.
+# `trailing-comma` allows a space between comma and closing bracket: (a, ).
+# `empty-line` allows space-only lines.
+no-space-check=trailing-comma,dict-separator
+
+# Maximum number of lines in a module
+max-module-lines=1000
+
+# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
+# tab).
+indent-string='    '
+
+# Number of spaces of indent required inside a hanging  or continued line.
+indent-after-paren=4
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+
+[LOGGING]
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format
+logging-modules=logging
+
+
+[SIMILARITIES]
+
+# Minimum lines number of a similarity.
+min-similarity-lines=4
+
+# Ignore comments when computing similarities.
+ignore-comments=yes
+
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+
+# Ignore imports when computing similarities.
+ignore-imports=no
+
+
+[VARIABLES]
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# A regular expression matching the name of dummy variables (i.e. expectedly
+# not used).
+dummy-variables-rgx=_$|dummy
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid to define new builtins when possible.
+additional-builtins=
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,_cb
+
+
+[IMPORTS]
+
+# Deprecated modules which should not be used, separated by a comma
+deprecated-modules=optparse
+
+# Create a graph of every (i.e. internal and external) dependencies in the
+# given file (report RP0402 must not be disabled)
+import-graph=
+
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled)
+ext-import-graph=
+
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled)
+int-import-graph=
+
+
+[DESIGN]
+
+# Maximum number of arguments for function / method
+max-args=20
+
+# Argument names that match this expression will be ignored. Default to name
+# with leading underscore
+ignored-argument-names=_.*
+
+# Maximum number of locals for function / method body
+max-locals=20
+
+# Maximum number of return / yield for function / method body
+max-returns=6
+
+# Maximum number of branch for function / method body
+max-branches=12
+
+# Maximum number of statements in function / method body
+max-statements=50
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=30
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=0
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+# Maximum number of boolean expressions in a if statement
+max-bool-expr=5
+
+
+[CLASSES]
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,__new__,setUp
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=mcs
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,_fields,_replace,_source,_make
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when being caught. Defaults to
+# "Exception"
+overgeneral-exceptions=Exception
diff -r c56807be3b72 -r 3bc73f5dc785 repex_full_clustering.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/repex_full_clustering.xml Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,290 @@
+<tool id="repeatexplorer2" name="RepeatExplorer2 clustering: " version="2.3.6" >
+    <stdio>
+      <regex match="lastdb: can't open file: NEAR" source="stderr" level="fatal" description="Version of last is too old, use ver 956 or higher\n" />
+      <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
+      <regex match="error" source="stderr" level="fatal" description="Unknown error" />
+      <regex match="Warning" source="stderr" level="warning" description="Unknown error" />
+      <exit_code range="1:" level="fatal" description="Error" />
+    </stdio>
+    <description>Improved version of repeat discovery and characterization using graph-based sequence clustering</description>
+   <requirements>
+     <requirement type="package" version="3.7">python</requirement>
+     <requirement type="package" version="0.9.1" >pyrserve</requirement>
+     <requirement type="package">last</requirement>
+     <requirement type="package">mafft</requirement>
+     <requirement type="package">imagemagick</requirement>
+     <requirement type="package">blast</requirement>
+     <requirement type="package">diamond</requirement>
+     <requirement type="package">blast-legacy</requirement>
+     <requirement type="package">r-igraph</requirement>
+     <requirement type="package">r-data.tree</requirement>
+     <requirement type="package">r-stringr</requirement>
+     <requirement type="package">r-r2html</requirement>
+     <requirement type="package">r-hwriter</requirement>
+     <requirement type="package">r-dt</requirement>
+     <requirement type="package">r-scales</requirement>
+     <requirement type="package">r-plotrix</requirement>
+     <requirement type="package">r-png</requirement>
+     <requirement type="package">r-plyr</requirement>
+     <requirement type="package">r-dplyr</requirement>
+     <requirement type="package">r-optparse</requirement>
+     <requirement type="package">r-dbi</requirement>
+     <requirement type="package">r-rsqlite</requirement>
+     <requirement type="package">r-rserve</requirement>
+     <requirement type="package">bioconductor-biostrings</requirement>
+   </requirements>
+    <command >
+      make -C ${__tool_directory__};
+      export PYTHONHASHSEED=0;
+      ${__tool_directory__}/seqclust --sample ${sample} --output_dir=tarean_output --logfile=${log} --cleanup $paired --taxon $taxon
+
+      #if $advanced_options.advanced:
+      --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering  -D $advanced_options.blastx.options_blastx
+      --assembly_min $advanced_options.assembly_min_cluster_size
+
+        #if $advanced_options.comparative.options_comparative:
+          --prefix_length $advanced_options.comparative.prefix_length
+        #end if
+
+        #if $advanced_options.custom_library.options_custom_library:
+          -d $advanced_options.custom_library.library extra_database
+        #end if
+
+        #if $advanced_options.options.options:
+         -opt $advanced_options.options.options
+        #end if
+      #end if
+      ${FastaFile}  >stdout.log 2> stderr.log ;
+      echo "STDOUT CONTENT:" >> ${log} ;
+      cat stdout.log >> ${log} ;
+      echo "STDERR CONTENT:" >> ${log};
+      cat stderr.log >> ${log} &amp;&amp;
+      cd tarean_output &amp;&amp;
+      zip -r  ${ReportArchive}.zip * &amp;&amp;
+      mv ${ReportArchive}.zip ${ReportArchive} &amp;&amp;
+      cp index.html ${ReportFile} &amp;&amp;
+      mkdir ${ReportFile.files_path} &amp;&amp;
+      cp -r --parents libdir ${ReportFile.files_path} &amp;&amp;
+      cp -r --parents seqclust/clustering/superclusters ${ReportFile.files_path} &amp;&amp;
+      cp -r --parents seqclust/clustering/clusters ${ReportFile.files_path} &amp;&amp;
+      cp seqclust/clustering/hitsort.cls ${ReportFile.files_path}/seqclust/clustering/hitsort.cls &amp;&amp;
+      cp *.png ${ReportFile.files_path}/ &amp;&amp;
+      
[... diff truncated in source ...]
AAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
+
+
+      To prepare a quality-filtered and interlaced input fasta file from fastq
+      files, use the `Preprocessing of paired-reads`__  tool.
+
+      .. __: tool_runner?tool_id=paired_fastq_filtering
+
+
+      **Additional parameters**
+
+      **Sample size** defines how many reads should be used in the calculation.
+      The default setting of 500,000 reads will enable detection of high copy
+      repeats within several hours of computation time. For higher
+      sensitivity the sample size can be set higher. Since the sample size affects
+      memory usage, this parameter may be automatically adjusted to a lower
+      value during the run. The maximum sample size which can be processed depends on
+      the repetitiveness of the analyzed genome.
+
+
+      **Select taxon and protein domain database version (REXdb)**. Classification
+      of transposable elements is based on similarity to our reference database
+      of transposable element protein domains (**REXdb**). A standalone database for Viridiplantae species
+      can be obtained on `repeatexplorer.org`__. The classification
+      system used in REXdb is described in the article `Systematic survey of plant
+      LTR-retrotransposons elucidates phylogenetic relationships of their
+      polyprotein domains and provides a reference for element classification`__.
+      The database for Metazoa species is still under development, so use it with caution.
+
+      .. __: http://repeatexplorer.org
+      .. __: https://doi.org/10.1186/s13100-018-0144-1
+
+      **Select parameters for protein domain search**. REXdb is compared with
+      sequence clusters using either the blastx or the diamond aligner. The diamond
+      program is about three times faster than blastx with word size 3.
+
+      **Similarity search options**. By default, sequence reads are compared using
+      the mgblast program. The default threshold is explicitly set to 90% sequence
+      similarity spanning at least 55% of the read length (in the case of reads
+      differing in length it applies to the longer one). Additionally, the sequence
+      overlap must be at least 55 nt. If you select the option for reads shorter
+      than 100 nt, the minimum overlap of 55 nt is not required.
+
+      By default, the
+      mgblast search uses the DUST program to filter out
+      low-complexity sequences. If you want
+      to increase the sensitivity of detection of satellites with a shorter monomer,
+      use the option with '*no masking of low complexity repeats*'. Note that omitting
+      DUST filtering will significantly increase running times.
+
+      **Automatic filtering of abundant satellite repeats** performs clustering on
+      a smaller dataset of sequence reads to detect abundant high-confidence
+      satellite repeats. If such satellites are detected, sequence reads derived
+      from these satellites are depleted from the input dataset. This step enables more
+      sensitive detection of less abundant repeats, as more reads can be used
+      in the clustering step.
+
+      **Use custom repeat database**. This option allows users to perform a similarity
+      comparison of identified repeats to their custom databases. The repeat class must
+      be encoded in the FASTA headers of database entries in order to allow correct
+      parsing of similarity hits. The required format for a custom database sequence name is: ::
+
+        >repeatname#class/subclass
+
+
+      **Output**
+
+      A list of clusters identified as putative satellite repeats, their genomic
+      abundance and various cluster characteristics is provided.
+
+      The output includes an **HTML summary** with a table listing all analyzed
+      clusters. More detailed information about clusters is provided in
+      additional files and directories. All results are also provided as a
+      downloadable **zip archive**. Additionally, a **log file** reporting
+      the progress of the computational pipeline is provided.
+
+    </help>
+
+</tool>
diff -r c56807be3b72 -r 3bc73f5dc785 repex_tarean.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/repex_tarean.xml Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,235 @@
+<tool id="tarean" name="Tandem Repeat Analyzer"  version="2.3.6" >
+    <stdio>
+      <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
+      <regex match="error" source="stderr" level="fatal" description="Unknown error" />
+      <regex match="warning" source="stderr" level="warning" description="Unknown warning" />
+      <exit_code range="1:" level="fatal" description="Error" />
+    </stdio>
+    <description>Identification of genomic tandem repeats from NGS data</description>
+    <requirements>
+      <requirement type="package" version="3.7">python</requirement>
+      <requirement type="package" version="0.9.1" >pyrserve</requirement>
+      <requirement type="package" >last</requirement>
+      <requirement type="package">mafft</requirement>
+      <requirement type="package">imagemagick</requirement>
+      <requirement type="package">blast</requirement>
+      <requirement type="package">diamond</requirement>
+      <requirement type="package">blast-legacy</requirement>
+      <requirement type="package">r-igraph</requirement>
+      <requirement type="package">r-data.tree</requirement>
+      <requirement type="package">r-stringr</requirement>
+      <requirement type="package">r-r2html</requirement>
+      <requirement type="package">r-hwriter</requirement>
+      <requirement type="package">r-dt</requirement>
+      <requirement type="package">r-scales</requirement>
+      <requirement type="package">r-plotrix</requirement>
+      <requirement type="package">r-png</requirement>
+      <requirement type="package">r-plyr</requirement>
+      <requirement type="package">r-dplyr</requirement>
+      <requirement type="package">r-optparse</requirement>
+      <requirement type="package">r-dbi</requirement>
+      <requirement type="package">r-rsqlite</requirement>
+      <requirement type="package">r-rserve</requirement>
+      <requirement type="package">bioconductor-biostrings</requirement>
+    </requirements>
+
+  <command detect_errors="exit_code">
+    make -C ${__tool_directory__};
+    export PYTHONHASHSEED=0;
+    ${__tool_directory__}/seqclust --paired --sample ${sample} --output_dir=tarean_output --logfile=${log} --cleanup --tarean_mode
+    #if $advanced_options.advanced:
+      --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering -M $advanced_options.merging
+      #if $advanced_options.custom_library.options_custom_library :
+          -d $advanced_options.custom_library.library extra_database
+      #end if
+      #if $advanced_options.options.options:
+        -opt $advanced_options.options.options
+      #end if
+    #else:
+      -M 0.2
+
+    #end if
+    ${FastaFile} >stdout.log 2> stderr.log ;
+    echo "STDOUT CONTENT:" >> ${log} ;
+    cat stdout.log >> ${log} ;
+    echo "STDERR CONTENT:" >> ${log} ;
+    cat stderr.log >> ${log} &amp;&amp;
+    ${__tool_directory__}/stderr_filter.py stderr.log &amp;&amp;
+    cd tarean_output &amp;&amp;
+    zip -r  ${ReportArchive}.zip * &amp;&amp;
+    mv ${ReportArchive}.zip ${ReportArchive} &amp;&amp;
+    cp index.html ${ReportFile} &amp;&amp;
+    mkdir ${ReportFile.files_path} &amp;&amp;
+    cp -r --parents libdir ${ReportFile.files_path} &amp;&amp;
+    cp -r --parents seqclust/clustering/superclusters ${ReportFile.files_path} &amp;&amp;
+    cp -r --parents seqclust/clustering/clusters ${ReportFile.files_path} &amp;&amp;
+    cp seqclust/clustering/hitsort.cls ${ReportFile.files_path}/seqclust/clustering/hitsort.cls &amp;&amp;
+    cp *.png ${ReportFile.files_path}/ &amp;&amp;
+    cp *.csv ${ReportFile.files_path}/ &amp;&amp;
+    cp *.html ${ReportFile.files_path}/  &amp;&amp;
+    cp *.css ${ReportFile.files_path}/  &amp;&amp;
+    cp *.fasta ${ReportFile.files_path}/ 2>>$log  &amp;&amp; rm -r ../tarean_output || :
+
+  </command>
+
+  <inputs>
+      <param name="FastaFile" label="paired-end NGS reads" type="data" format="fasta"
+
[... diff truncated in source ...]
AAACTTAAAAAGGTTTCTGCACATGAATCG
+      >0002_r
+      TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
+      >0003_f
+      TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
+      >0003_r
+      TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
+      ...
+
+
+    To perform the quality filtering on your fastQ formatted data as described
+    above, and to interlace your paired-end sequence reads,
+    please use the `Preprocessing of paired-reads`__  tool.
+
+    .. __: tool_runner?tool_id=paired_fastq_filtering
+
+
+    **Additional parameters**
+
+    **Sample size** defines how many reads will be used during the computation.
+    The default setting of 500,000 reads will enable detection of high copy
+    number satellites within several hours. For higher
+    sensitivity the sample size can be increased. Since the sample size affects
+    memory usage, this parameter may be automatically adjusted to a lower value
+    during the run. The maximum sample size which can be processed depends on the
+    repetitiveness of the analyzed genome. This significantly limits the number of reads
+    that can be analyzed with the TAREAN pipeline.
+
+    **Perform cluster merging**. Families of repetitive elements are
+    frequently split into multiple clusters rather than being represented as a
+    single one. If you do not want to merge clusters based on the presence
+    of broken read pairs, disable this option.
+
+    **Use custom repeat database**. This option allows users to perform a similarity
+    comparison of identified repeats to their custom databases. The repeat class should
+    be encoded in the FASTA headers of database entries in order to allow correct
+    parsing of similarity hits.
+
+    **Similarity search options**. By default, sequence reads are compared using
+    the mgblast program. The default threshold is explicitly set to 90% sequence
+    similarity spanning at least 55% of the read length (in the case of reads
+    differing in length it applies to the longer one). Additionally, the sequence
+    overlap must be at least 55 nt. If you select the option for reads shorter
+    than 100 nt, the minimum overlap of 55 nt is not required.
+
+    By default, the
+    mgblast search uses the DUST program to filter out
+    low-complexity sequences. If you want
+    to increase the sensitivity of detection of satellites with a shorter monomer,
+    use the option with '*no masking of low complexity repeats*'. Note that omitting
+    DUST filtering will significantly increase running times.
+
+    **Output**
+
+    A list of clusters identified as putative satellite repeats, their genomic
+    abundance and various cluster characteristics are provided. Length and
+    consensus sequences of reconstructed monomers are also shown and
+    accompanied by a detailed output from kmer-based reconstruction including
+    sequences and sequence logos of alternative variants of monomer sequences.
+
+    The output includes an **HTML summary** with a table listing all analyzed
+    clusters. More detailed information about clusters is provided in
+    additional files and directories. All results are also provided as a
+    downloadable **zip archive**. Since read clustering results in
+    thousands of clusters, the search for satellite repeats is limited to
+    a subset of the largest ones corresponding to the most abundant genomic
+    repeats. The default setting of the pipeline is to analyze all clusters containing at least
+    0.01% of the input reads. Besides the satellite repeats, three other
+    groups of clusters are reported in the output: (1) LTR-retrotransposons,
+    (2) 45S and 5S rDNA and (3) all remaining clusters passing the size
+    threshold. As (1) and (2) contain sequences with circular
+    graphs, their consensus is calculated in the same way as for satellite
+    repeats. Additionally a **log file** reporting the progress of the
+    computational pipeline is provided.
+
+  </help>
+
+</tool>
diff -r c56807be3b72 -r 3bc73f5dc785 seqclust
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/seqclust Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,800 @@
+#!/usr/bin/env python3
+''' TAndem REpeat ANalyzer  '''
+import os
+import sys
+import shutil
+import subprocess
+import argparse
+from argparse import RawTextHelpFormatter
+import logging
+import shlex
+import multiprocessing
+# config must be loaded before seqtools,...
+import config
+import re
+from lib import seqtools, graphtools, utils, assembly_tools
+from lib import r2py
+
+REQUIRED_VERSION = (3, 4)
+if sys.version_info < REQUIRED_VERSION:
+    raise Exception("\n\npython 3.4 or higher is required!\n")
+
+# append path to louvain clustering and other binaries
+os.environ['PATH'] = "{}:{}:{}".format(config.BINARIES, config.LOUVAIN,
+                                       os.environ['PATH'])
+
+LOGGER = logging.getLogger(__name__)
+
+
+def get_version(path, tarean_mode):
+    # get git version
+    branch = "?"
+    shorthash = "?"
+    revcount = "?"
+    tag = "?"
+    try:
+        branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
+                                         shell=True,
+                                         cwd=path).decode('ascii').strip()
+        shorthash = subprocess.check_output(
+            "git log --pretty=format:'%h' -n 1  ",
+            shell=True,
+            cwd=path).decode('ascii').strip()
+        revcount = len(subprocess.check_output(
+            "git log --oneline", shell=True,
+            cwd=path).decode('ascii').split())
+        tag = subprocess.check_output("git describe --tags --abbrev=0",
+                                          cwd=path,
+                                          shell=True).decode('ascii').strip()
+        version_info = "{branch}-{tag}-{revcount}({shorthash})".format(
+            branch=branch,
+            shorthash=shorthash,
+            tag=tag,
+            revcount=revcount
+        )
+    except:
+        # alternatively - read it from file
+        try:
+            with open(path + "/version_info.txt", 'r') as f:
+                version_info = f.read()
+        except FileNotFoundError:
+            version_info = "version of pipeline not available!"
+
+    ## get database versions:
+    PD = "?"
+    PDmd5 = "?"
+    DD = "?"
+    DDmd5 = "?"
+    try:
+        PD = os.path.basename(config.PROTEIN_DATABASE)
+        PDmd5 = utils.md5checksum(config.PROTEIN_DATABASE + ".psq",
+                                  fail_if_missing=not tarean_mode)
+        DD = os.path.basename(config.DNA_DATABASE)
+        DDmd5 = utils.md5checksum(config.DNA_DATABASE + ".nsq")
+    except:
+        ## some problem with databases
+        pass
+    version_string = (
+        "-------------------------------------"
+        "-------------------------------------\n"
+        "PIPELINE VERSION         : "
+        "{version_info}\n\n"
+        "PROTEIN DATABASE VERSION : {PD}\n"
+        "            md5 checksum : {PDmd5}\n\n"
+        "DNA DATABASE VERSION     : {DD}\n"
+        "            md5 checksum : {DDmd5}\n"
+        "-------------------------------------"
+        "-------------------------------------\n").format(
+
+            version_info=version_info,
+            PD=PD,
+            PDmd5=PDmd5,
+            DD=DD,
+            DDmd5=DDmd5
+        )
+
+    LOGGER.info(version_string)
+    return version_string
+
+
+def valid_database(database_file):
+    with open(database_file, 'r', encoding='ascii') as f:
+        for i in f:
+            if i[0] == ">":
+                if not re.match(">.+#.+/*", i):
+                    # TODO - make edits to correct formatting of custom database???
+                    return False
+    return True
+
+
+def add_databases(databases, custom_databases_dir, dbtype='nucl'):
+    '''custom databases are copied to directory tree and blast
+    database is created using makeblastdb
+    '''
+
+    databases_ok = []
+    print(databases)
+    for db_path, db_name in databases:
+        db_destination = "{}/{}".format(custom_databases_dir, db_name)
+        shutil.copyfile(db_p
[... diff truncated in source ...]
ssembly_tools.assembly(sequences,
+                                    hitsort,
+                                    clusters_info,
+                                    assembly_dir=paths.assembly,
+                                    contigs_file=paths.contigs,
+                                    min_size_of_cluster_for_assembly=args.assembly_min)
+
+            LOGGER.info("detecting LTR in assembly..")
+            for i in clusters_info:
+                i.detect_ltr(config.TRNA_DATABASE)
+
+        run_info.max_annotated_clusters = max([i.index for i in clusters_info])
+        run_info.max_annotated_superclusters = max([i.supercluster
+                                                    for i in clusters_info])
+        # make reports
+        cluster_listing = [i.listing() for i in clusters_info]
+        # make path relative to paths.cluster_info
+        utils.save_as_table(cluster_listing, paths.clusters_info)
+        # creates table cluster_info in hitsort database
+        graphtools.Cluster.add_cluster_table_to_database(cluster_listing,
+                                                         paths.hitsort_db)
+        # export files for consensus sequences, one for each rank
+        consensus_files = []
+        for i in config.TANDEM_RANKS:
+            consensus_files.append(utils.export_tandem_consensus(
+                clusters_info,
+                path=paths.TR_consensus_fasta.format(i),
+                rank=i))
+
+        if not args.tarean_mode:
+            LOGGER.info("Creating report for superclusters")
+            create_annotation.create_all_superclusters_report(
+                max_supercluster=run_info.max_annotated_superclusters,
+                paths=paths.as_list(),
+                libdir=paths.libdir,
+                superclusters_dir=paths.superclusters,
+                seqdb=paths.sequences_db,
+                hitsortdb=paths.hitsort_db,
+                classification_hierarchy_file=config.CLASSIFICATION_HIERARCHY,
+                HTML_LINKS=dict2lists(config.HTML_LINKS))
+
+            LOGGER.info("Creating report for individual clusters")
+            for cluster in clusters_info:
+                create_annotation.create_cluster_report(
+                    cluster.index,
+                    seqdb=paths.sequences_db,
+                    hitsortdb=paths.hitsort_db,
+                    classification_hierarchy_file=
+                    config.CLASSIFICATION_HIERARCHY,
+                    HTML_LINKS=dict2lists(config.HTML_LINKS))
+
+        LOGGER.info("Creating main html report")
+        reporting.create_main_reports(
+            paths=paths.as_list(),
+            N_clustering=run_info.number_of_reads_for_clustering,
+            N_input=run_info.number_of_input_sequences,
+            N_omit=run_info.number_of_omitted_reads,
+            merge_threshold=args.merge_threshold,
+            paired=run_info.paired,
+            consensus_files=consensus_files,
+            custom_db=bool(config.CUSTOM_DNA_DATABASE),
+            tarean_mode=args.tarean_mode,
+            HTML_LINKS=dict2lists(config.HTML_LINKS),
+            pipeline_version_info=pipeline_version_info,
+            max_memory=run_info.max_memory,
+            max_number_reads_for_clustering=run_info.max_number_reads_for_clustering,
+            mincln=run_info.mincln
+        )
+
+        LOGGER.info("Html reports created")
+
+    except:
+        r2py.shutdown(config.RSERVE_PORT)
+        raise
+    finally:
+        if args.cleanup:
+            paths.cleanup(config.FILES_TO_DISCARD_AT_CLEANUP)
+        else:
+            LOGGER.info("copy databases to working directory")
+            shutil.copy(paths.sequences_db, paths.working_dir)
+            shutil.copy(paths.hitsort_db, paths.working_dir)
+        # copy log file inside working directory
+        if logfile:
+            shutil.copyfile(logfile, paths.logfile)
+
+
+if __name__ == "__main__":
+    main()
+    # some error handling here:
diff -r c56807be3b72 -r 3bc73f5dc785 stderr_filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/stderr_filter.py Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+'''
+The purpose of this script is to rewrite certain benign messages found on
+stderr so that Galaxy does not treat them as errors.
+'''
+import sys
+
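+# substrings marking benign messages that should be neutralized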
+string_to_detect = [
+    'Karlin-Altschul parameters',
+    'slippage may introduce errors',
+    'Examining 5 or more matches is recommended',
+    'DeprecationWarning: The binary mode of fromstring is deprecated',
+]
+
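+# replacements applied to detected lines so that Galaxy's fatal
+# "error"/"warning" regex checks no longer match them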
+string_to_remove = [
+    ('error', 'errour'),
+    ('warning', 'alert')
+]
+input_file = sys.argv[1]
+
+with open(input_file) as f:
+    for line in f:
+        for s in string_to_detect:
+            if s in line:
+                new_line = "--" + line.lower()
+                for r in string_to_remove:
+                    new_line = new_line.replace(r[0], r[1])
+                line = new_line
+        print("parsed line:", line, file=sys.stderr)
diff -r c56807be3b72 -r 3bc73f5dc785 test_repex_pipeline.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test_repex_pipeline.py Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,229 @@
+#!/usr/bin/env python3
+'''
+Basic Tarean and RepeatExplorer tests
+'''
+import subprocess
+import tempfile
+import unittest
+import os
+import shutil
+
+def check_for_missing_files(directory, file_list):
+    ''' check if files exist in the directory '''
+    missing_files = []
+    for f in file_list:
+        path = os.path.join(directory, f)
+        if os.path.exists(path):
+            continue
+        else:
+            missing_files.append(f)
+    return missing_files
+
+
+class TestBasic(unittest.TestCase):
+    ''' basic repex-tarean testcase '''
+    EXECUTABLE = "./seqclust"
+
+    # file lists to check
+    FILE_LIST_BASIC = [
+        "./seqclust/clustering/clusters/dir_CL0001/hitsort_part.csv",
+        "./seqclust/clustering/clusters/dir_CL0001/reads.fasta",
+        "./seqclust/clustering/clusters/dir_CL0001/reads_selection.fasta",
+        "./seqclust/clustering/clusters/dir_CL0001/dna_database_annotation.csv",
+        "./seqclust/clustering/clusters/dir_CL0001/graph_layout.GL",
+        "./seqclust/clustering/clusters/dir_CL0001/graph_layout.png",
+        "./seqclust/clustering/clusters/dir_CL0001/graph_layout_tmb.png",
+        "./seqclust/clustering/clusters/dir_CL0001/graph_layout_directed.RData",
+        "./logfile.txt", "./style1.css", "./documentation.html",
+        "./tarean_report.html", "./cluster_report.html",
+        "./summary_histogram.png", "./index.html", "./sequences.db",
+        "./hitsort.db", "./TAREAN_consensus_rank_1.fasta",
+        "./TAREAN_consensus_rank_2.fasta", "./TAREAN_consensus_rank_3.fasta",
+        "./TAREAN_consensus_rank_4.fasta", "./seqclust/clustering/hitsort",
+        "./seqclust/clustering/hitsort.cls"
+    ]
+    FILE_LIST_ASSEMBLY = [
+        "./seqclust/small_clusters_assembly/small_clusters.aln",
+        "./seqclust/small_clusters_assembly/small_clusters.ace",
+        "./seqclust/small_clusters_assembly/small_clusters.fasta"
+    ]
+    FILE_LIST_FILTERING = ["./seqclust/prerun/filter_sequences.fasta"]
+    FILE_LIST_COMPARATIVE = ["COMPARATIVE_ANALYSIS_COUNTS.csv"]
+    FILE_LIST_CUSTOM_DATABASE = [
+        "./seqclust/custom_databases/extra_database",
+        "./seqclust/clustering/clusters/dir_CL0001/custom_db_extra_database_annotation.csv"
+    ]
+    def setUp(self):
+        pass
+
+    # helper function
+    def tarean_run(self, cmd_options, file_list):
+        ''' Basic tarean run '''
+        # output goes to tmp directory
+        tmpdir = tempfile.mkdtemp()
+        logfile = tempfile.NamedTemporaryFile(delete=False)
+        print("\n------------------------------------------------------")
+        print("Temp files:")
+        print("   tmpdir : ", tmpdir)
+        print("  logfile : ", logfile.name)
+        print("------------------------------------------------------")
+        print([self.EXECUTABLE] + ['-l', logfile.name, '-v', tmpdir] + cmd_options)
+        p = subprocess.Popen(
+            args=[self.EXECUTABLE] + ['-l', logfile.name, '-v', tmpdir
+                                     ] + cmd_options)
+        p.wait()
+        status = p.returncode
+        missing_files = check_for_missing_files(directory=tmpdir,
+                                                file_list=file_list)
+        if status:
+            # print log file
+            print("Non zero exit status!")
+            with open(logfile.name) as f:
+                print(f.read())
+
+        self.assertEqual(status, 0)
+        self.assertEqual(
+            len(missing_files),
+            0,
+            msg="\n missing files: \n" + "\n".join(missing_files))
+        shutil.rmtree(tmpdir)
+        os.remove(logfile.name)
+
+
+    def test_help(self):
+        '''Test if help option works '''
+        p = subprocess.Popen(args=[self.EXECUTABLE, "-h"],
+                             stdout=subprocess.PIPE)
+        output = str(p.stdout.readlines())
+        p.stdout.close()
+        p.wait()
+        status = p.returncode
+        self.asse
[... diff truncated in source ...]
erging_re_diamond(self):
+        ''' Basic tarean run '''
+        cmd_options = ['-p', '-s', '6000','-D','DIAMOND', 'test_data/LAS_paired_10k.fas']
+        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_ASSEMBLY)
+
+
+
+    def test_basic_with_merging_re(self):
+        ''' Basic tarean run '''
+        cmd_options = ['-p', '-M', '0.2', '-s', '6000',
+                       'test_data/LAS_paired_10k.fas']
+        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_ASSEMBLY)
+
+    def test_long_with_merging_re(self):
+        '''Using more data with tarean'''
+        cmd_options = ['-p', '-M', '0.1', '-m', '0.01',
+                       'test_data/LAS_paired_25k.fas']
+        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_ASSEMBLY)
+
+    def test_long_with_merging_re_diamond(self):
+        '''Using more data with tarean and using diamond'''
+        cmd_options = ['-p', '-M', '0.1', '-m', '0.01','-D','DIAMOND',
+                       'test_data/LAS_paired_25k.fas']
+        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_ASSEMBLY)
+
+    def test_long_with_merging2_re(self):
+        '''Using more data with tarean 300k reads'''
+        cmd_options = ['-p', '-M', '0.1', '-m', '0.01',
+                       'test_data/LAS_paired_300k.fas']
+        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_ASSEMBLY)
+
+    def test_long_with_merging_and_filtering_re(self):
+        '''Using more data with tarean, test of automatic filtering'''
+        cmd_options = ['-A', '-p', '-M', '0.2', '-m', '0.01',
+                       'test_data/ceu_200k.fasta']
+        self.tarean_run(
+            cmd_options,
+            file_list=self.FILE_LIST_BASIC + self.FILE_LIST_FILTERING + self.FILE_LIST_ASSEMBLY)
+
+    def test_custom_database_re(self):
+        ''' Basic tarean run '''
+        cmd_options = ['-p', '-d', 'test_data/extra_database', 'extra_database', 'test_data/LAS_paired_10k.fas']
+        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_CUSTOM_DATABASE)
+
+    def tearDown(self):
+        pass
+
+
+SHORT_TASK_NAME_LIST_TAREAN = ['test_help', 'test_basic_no_merging_tarean',
+                               'test_basic_with_merging_tarean',
+                               'test_basic_with_merging_tarean_dust_off']
+LONG_TASK_NAME_LIST_TAREAN = ['test_long_with_merging_tarean',
+                              'test_long_with_merging2_tarean']
+SHORT_TASK_NAME_LIST_RE = ['test_basic_no_merging_re',
+                           'test_basic_with_merging_re',
+                           'test_basic_no_merging_re_diamond']
+LONG_TASK_NAME_LIST_RE = ['test_long_with_merging_re',
+                          'test_long_with_merging2_re',
+                          'test_long_with_merging_and_filtering_re',
+                          'test_long_with_merging_re_diamond']
+
+COMPARATIVE_LIST = ['test_short_comparative_re']
+CUSTOM_DATABASE_LIST = ['test_short_custom_database']
+
+# Test suites:
+SHORT_TAREAN_SUITE = unittest.TestSuite([TestBasic(i)
+                                   for i in SHORT_TASK_NAME_LIST_TAREAN])
+LONG_TAREAN_SUITE = unittest.TestSuite([TestBasic(i)
+                                  for i in LONG_TASK_NAME_LIST_TAREAN])
+COMPARATIVE_SUITE = unittest.TestSuite([TestBasic(i) for i in COMPARATIVE_LIST])
+CUSTOM_DB_SUITE = unittest.TestSuite([TestBasic('test_custom_database_re')])
+
+SHORT_RE_SUITE = unittest.TestSuite([TestBasic(i) for i in SHORT_TASK_NAME_LIST_RE])
+LONG_RE_SUITE = unittest.TestSuite([TestBasic(i) for i in LONG_TASK_NAME_LIST_RE])
+
+SHORT_SUITE = unittest.TestSuite([SHORT_RE_SUITE, SHORT_TAREAN_SUITE,
+                                  COMPARATIVE_SUITE, CUSTOM_DB_SUITE])
+
+LONG_LONG = unittest.TestSuite([LONG_RE_SUITE, LONG_TAREAN_SUITE])
+
+# for single test testing
+if __name__ == '__main__':
+    unittest.main(verbosity=2)
diff -r c56807be3b72 -r 3bc73f5dc785 tool_dependencies.xml
--- a/tool_dependencies.xml Fri Dec 20 14:12:11 2019 +0000
+++ b/tool_dependencies.xml Fri Dec 20 14:17:59 2019 +0000
@@ -1,20 +1,9 @@
-<?xml version="1.0"?>
+<?xml version="1.0" ?>
 <tool_dependency>
-    <package name="profrep_databases" version="1.0">
-        <install version="1.0">
-          <actions>
-            <action type="download_by_url">https://bitbucket.org/petrnovak/re_databases/get/8cfaac841fc2.zip</action>
-             <action type="move_directory_files">
-              <source_directory>$TMP_WORK_DIR/petrnovak-re_databases-8cfaac841fc2</source_directory>
-              <destination_directory>$INSTALL_DIR</destination_directory>
-             </action>
-             <action type="set_environment">
-               <environment_variable action="set_to" name="PROFREP_DATABASES">$INSTALL_DIR</environment_variable>
-             </action>
-          </actions>
-        </install>
+    <package name="repex_tarean" version="1.0">
+        <repository changeset_revision="78ad965c1721" name="package_repex_tarean_1_0" owner="petrn" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu"/>
         <readme>
-          Profrep databases
-        </readme>
+          Prepare repex database and scripts
+        </readme>
     </package>
-</tool_dependency>
+</tool_dependency>
\ No newline at end of file
diff -r c56807be3b72 -r 3bc73f5dc785 version_info.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/version_info.txt Fri Dec 20 14:17:59 2019 +0000
@@ -0,0 +1,1 @@
+version: 0.3.6-431(e059d93) branch: conda