Mercurial > repos > petrn > repeatexplorer
comparison config.py @ 0:f6ebec6e235e draft
Uploaded
| author | petrn |
|---|---|
| date | Thu, 19 Dec 2019 13:46:43 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f6ebec6e235e |
|---|---|
| 1 ''' | |
| 2 All configuration for clustering | |
| 3 ''' | |
| 4 import os | |
| 5 import tempfile | |
| 6 from math import exp | |
| 7 from collections import namedtuple | |
# Directory that contains this configuration module (symlinks resolved).
MAIN_DIR = os.path.dirname(os.path.realpath(__file__))


def add_base_path(base):
    '''Return a one-argument function that joins its argument onto `base`.

    The returned callable takes a path fragment `p` and gives back
    os.path.join(base, p).
    '''
    return lambda p: os.path.join(base, p)


# Shortcut for building paths located under MAIN_DIR.
PATH = add_base_path(MAIN_DIR)
| 17 | |
# clustering general settings
# Logical name -> directory path for each part of the working tree.
DIRECTORY_TREE = {
    'libdir': 'libdir',
    'seqclust': 'seqclust',
    'assembly': 'seqclust/small_clusters_assembly',
    'blastx': 'seqclust/blastx',
    'clustering': 'seqclust/clustering',
    'clusters': 'seqclust/clustering/clusters',
    'superclusters': 'seqclust/clustering/superclusters',
    'mgblast': 'seqclust/mgblast',
    'blastn': 'seqclust/blastn',
    'prerun': 'seqclust/prerun',
    'prerun_clusters': 'seqclust/prerun/clusters',
    'sequences': 'seqclust/reads',
    'custom_databases': 'seqclust/custom_databases',
}


def _reserve_temp_path():
    '''Return a unique temporary directory path; the directory itself is removed.

    Previously the path was taken from an unreferenced
    tempfile.TemporaryDirectory() object, which left the removal of the
    directory to the garbage collector (and triggered a ResourceWarning).
    Leaving the context manager explicitly gives the same end result - a
    unique path whose directory no longer exists - deterministically.
    '''
    with tempfile.TemporaryDirectory() as path:
        return path


# Scratch location: use $TEMP when it is set, otherwise reserve a unique path.
# NOTE(review): in the fallback case the directory does not exist on disk, so
# whoever uses DIRECTORY_TREE['TEMP'] presumably has to create it - confirm.
if "TEMP" in os.environ:
    DIRECTORY_TREE['TEMP'] = os.environ["TEMP"]
else:
    DIRECTORY_TREE['TEMP'] = _reserve_temp_path()
| 37 | |
# File names and paths keyed by logical name.  Entries built from a
# DIRECTORY_TREE value are located inside that directory; all other
# entries are bare file names.
FILES = {
    "sample_db": DIRECTORY_TREE["TEMP"] + "/sample.db",
    "sample_fasta": DIRECTORY_TREE["prerun"] + "/sample.fasta",
    "prerun_cls_file": DIRECTORY_TREE["prerun"] + "/sample_hitsort.cls",
    "filter_sequences_file": DIRECTORY_TREE["prerun"] + "/filter_sequences.fasta",
    "sequences_db": DIRECTORY_TREE["TEMP"] + "/sequences.db",
    "sequences_fasta": DIRECTORY_TREE["sequences"] + "/reads.fasta",
    "hitsort": DIRECTORY_TREE["clustering"] + "/hitsort",
    "hitsort_db": DIRECTORY_TREE["TEMP"] + "/hitsort.db",
    "cls_file": DIRECTORY_TREE["clustering"] + "/hitsort.cls",
    # tables
    "clusters_summary_csv": "CLUSTER_TABLE.csv",
    "profrep_classification_csv": "PROFREP_CLASSIFICATION_TEMPLATE.csv",
    "superclusters_csv_summary": "SUPERCLUSTER_TABLE.csv",
    "comparative_analysis_counts_csv": "COMPARATIVE_ANALYSIS_COUNTS.csv",
    "clusters_info": ".clusters_info.csv",
    # html reports and their data
    "tarean_report_html": "tarean_report.html",
    "cluster_report_html": "cluster_report.html",
    "supercluster_report_html": "supercluster_report.html",
    "repeat_annotation_summary_rds": "repeat_annotation_summary.rds",
    "summarized_annotation_html": "summarized_annotation.html",
    "main_report_html": "index.html",
    # "{}" is a str.format placeholder
    "TR_consensus_fasta": "TAREAN_consensus_rank_{}.fasta",
    "summary_histogram": "summary_histogram.png",
    "comparative_summary_map": "comparative_summary.png",
    "how_to_cite": "HOW_TO_CITE.html",
    "logfile": "logfile.txt",
    "contigs": "contigs.fasta",
    "filter_omitted": DIRECTORY_TREE["sequences"] + "/removed_filtering_positive_reads.fasta",
    "filter_kept": DIRECTORY_TREE["sequences"] + "/kept_filtering_positive_reads.fasta",
}
| 67 | |
| 68 | |
# Files to include in the output, each given as a [source, destination] pair.
INCLUDE = [
    [PATH("HOW_TO_CITE.html"), FILES["how_to_cite"]],
]
| 73 | |
# Items to discard at cleanup.  Each entry is the name of a path attribute,
# not a file name!
FILES_TO_DISCARD_AT_CLEANUP = [
    "prerun",
    "mgblast",
    "blastn",
    "blastx",
    "hitsort",
    "repeat_annotation_summary_rds",
]
| 79 | |
# Relative links used between the html files.  Templates containing "%04d"
# expect one integer, which is zero-padded to four digits by %-formatting.
HTML_LINKS = {
    "CLUSTER_TO_SUPERCLUSTER": "../../superclusters/dir_SC%04d/index.html",
    "SUPERCLUSTER_TO_CLUSTER": "../../clusters/dir_CL%04d/index.html",
    "CLUSTER_TO_CLUSTER": "../dir_CL%04d/index.html",
    "SUPERCLUSTER_TO_SUPERCLUSTER": "../dir_SC%04d/index.html",
    "CLUSTER_TO_CLUSTER_TABLE": "../../../../cluster_report.html",
    # The spelling of this key is kept as is; it is looked up by name.
    "SEPERCLUSTER_TO_CLUSTER_TABLE": "../../../../cluster_report.html",
    "ROOT_TO_CLUSTER": "seqclust/clustering/clusters/dir_CL%04d/index.html",
    "ROOT_TO_SUPERCLUSTER": "seqclust/clustering/superclusters/dir_SC%04d/index.html",
    "ROOT_TO_TAREAN": "seqclust/clustering/clusters/dir_CL%04d/tarean/report.html",
    "CLUSTER_TO_KMER_REPORT": "tarean/report.html",
    "INDEX_TO_TAREAN": "tarean_report.html",
    "INDEX_TO_CLUSTER_REPORT": "cluster_report.html",
    "INDEX_TO_SUPERCLUSTER_REPORT": "supercluster_report.html",
    "INDEX_TO_SUMMARIZED_ANNOTATION": "summarized_annotation.html",
}
| 97 | |
| 98 | |
# Defines how many graph edges can be processed in 1 kB of RAM.
EMAX = 42.6

# NOTE(review): 1000 is marked as a value "for testing"; the setting it
# replaced was 5000 - confirm which one is intended.
MINIMUM_NUMBER_OF_INPUT_SEQUENCES = 1000
# Smaller clusters are not analyzed.
MINIMUM_NUMBER_OF_READS_IN_CLUSTER = 20
# Smaller clusters will not be merged.
MINIMUM_NUMBER_OF_READS_FOR_MERGING = 20
# Minimal size of the W parameter.
MINIMUM_NUMBER_OF_SHARED_PAIRS_FOR_MERGING = 20

NUMBER_OF_SEQUENCES_FOR_PRERUN_WITH_FILTERING = 50000
NUMBER_OF_SEQUENCES_FOR_PRERUN_WITHOUT_FILTERING = 20000
# Default prerun size: the "without filtering" value.
NUMBER_OF_SEQUENCES_FOR_PRERUN = NUMBER_OF_SEQUENCES_FOR_PRERUN_WITHOUT_FILTERING
CHUNK_SIZE = 20000

# This parameter highly affects memory usage!
CLUSTER_EMAX = 2e7
CLUSTER_VMAX = 40000
SUPERCLUSTER_THRESHOLD = 0.1

# Number of processors to use - it will be set at runtime.
PROC = None
RSERVE_PORT = 6311

# Some settings related to repeat annotation.
ORF_THRESHOLD = 1200
PBS_THRESHOLD = 2
# Threshold for rDNA detection - percentage of similarity hits.
RDNA_THRESHOLD = 20
# Threshold for contamination detection.
CONTAMINATION_THRESHOLD = 10

# Tandem rank codes:
#   1 : putative tandem repeats - high confidence
#   2 : putative tandem repeats - low confidence
#   3 : potential LTR element
#   4 : rDNA
TANDEM_RANKS = [1, 2, 3, 4]
SKIP_CAP3_ASSEMBLY_TANDEM_RANKS = [1]

# This is the minimal proportion of graph edges!
FILTER_MIN_PROP_THRESHOLD = 0.03
# Minimal size of a cluster to be considered for filtering.
FILTER_MIN_SIZE_THRESHOLD = 1000
FILTER_PROPORTION_OF_KEPT = 0.1
| 139 | |
R = "lib"

# External scripts, resolved through PATH.
RSOURCE_tarean = PATH("lib/tarean/tarean.R")
RSOURCE_reporting = PATH("lib/reporting.R")
RSOURCE_create_annotation = PATH("lib/create_annotation.R")
LTR_DETECTION = PATH("lib/detect_LTR_insertion_sites.pl")

# Paths to the bundled databases.
DNA_DATABASE = PATH("databases/dna_database_masked.fasta")
TRNA_DATABASE = PATH("databases/tRNA_database.fasta")
SATELLITE_MODEL = PATH("databases/satellite_model.rds")
LASTAL_PARAMS = PATH("databases/lastal_params")

# For testing: these start out unset.
PROTEIN_DATABASE = None
CLASSIFICATION_HIERARCHY = None
CUSTOM_DNA_DATABASE = None
| 156 | |
# When modifying this section, check that the makefile has the most recent
# target for the protein database.
PROTEIN_DATABASE_DEFAULT = "VIRIDIPLANTAE3.0"

# Option name -> (protein database fasta, classification scheme).
# The classification scheme is a data.tree object stored as .rds.
# Change the fasta path accordingly if you use a custom protein database.
PROTEIN_DATABASE_OPTIONS = {
    "VIRIDIPLANTAE3.0": (
        PATH("databases/protein_database_viridiplantae_v3.0.fasta"),
        PATH("databases/classification_tree_viridiplantae_v3.0.rds"),
    ),
    "VIRIDIPLANTAE2.2": (
        PATH("databases/protein_database_viridiplantae_v2.2.fasta"),
        PATH("databases/classification_viridiplantae_tree.rds"),
    ),
    # NOTE(review): METAZOA2.0 points at the same v3 files as METAZOA3.0 -
    # confirm that this is intentional.
    "METAZOA2.0": (
        PATH("databases/protein_database_metazoa_v3.fasta"),
        PATH("databases/classification_tree_metazoa_v3.rds"),
    ),
    "METAZOA3.0": (
        PATH("databases/protein_database_metazoa_v3.fasta"),
        PATH("databases/classification_tree_metazoa_v3.rds"),
    ),
}
# If you change PROTEIN_DATABASE_OPTIONS, do not forget to use 'makeblastdb'
# to build the blast database and 'diamond makedb' to build the diamond
# database.
| 174 | |
# Paths to binaries, resolved through PATH.
LOUVAIN = PATH("louvain")
BINARIES = PATH("bin")
| 178 | |
# All CAP3_* tables share the same keys: file name templates with two
# str.format placeholders.

# Template -> [text to find, replacement template], or None when the file
# has no replacement pair.
CAP3_PATTERNS_REPLACE = {
    "{}.{}.contigs": [">Contig", ">{}Contig"],
    "{}.{}.aln": ["* Contig", "* {}Contig"],
    "{}.{}.contigs.qual": [">Contig", ">{}Contig"],
    "{}.{}.ace": ["CO Contig", "CO {}Contig"],
    "{}.{}.singlets": None,
    "{}.{}.info": None,
    "{}.{}.contigs.links": None,
}

# Template -> "small_clusters.*" file name, or None when there is none.
CAP3_FILES_MAPPING = {
    "{}.{}.contigs": "small_clusters.fasta",
    "{}.{}.contigs.qual": "small_clusters.contigs.qual",
    "{}.{}.aln": "small_clusters.aln",
    "{}.{}.ace": "small_clusters.ace",
    "{}.{}.singlets": None,
    "{}.{}.info": None,
    "{}.{}.contigs.links": None,
}

# The templates, in the key order of CAP3_PATTERNS_REPLACE.
CAP3_FILENAMES = list(CAP3_PATTERNS_REPLACE)

# Template -> plain file name.
CAP3_FILES_GOODNAMES = {
    "{}.{}.contigs": "contigs.fasta",
    "{}.{}.contigs.qual": "contigs.qual",
    "{}.{}.aln": "contigs.aln",
    "{}.{}.ace": "contigs.ace",
    "{}.{}.singlets": "singlets.fasta",
    "{}.{}.info": "assembly.info",
    "{}.{}.contigs.links": "contigs.links",
}

# Not implemented yet.
CAP3_PARAMS = " -p 80 -o 40 "
| 209 | |
# File names used for LTR detection, keyed by short code.
LTR_DETECTION_FILES = {
    "ADJ": "LTR_info.ADJ",
    "LTR": "LTR_info.LTR",
    "PBS_BLAST": "LTR_info.with_PBS_blast.csv",
    "BASE": "LTR_info",
}
| 214 | |
# options for all-2-all search and annotations

# Thresholds used to filter hits of the all-to-all search.
FilteringThreshold = namedtuple(
    "FilteringThreshold",
    ["min_lcov", "min_pid", "min_ovl", "min_scov", "evalue"],
)
# The original, misspelled name is kept as an alias so that existing
# references to config.FilteringThreshod keep working.
FilteringThreshod = FilteringThreshold

# One search configuration per annotation step.
AnnotationParams = namedtuple(
    "AnnotationParams",
    ["blastn", "blastx", "blastn_trna"],
)

# Complete description of one analysis profile.  The type name stays
# 'Options' so that the repr of existing instances does not change.
Option = namedtuple(
    "Options",
    [
        "name",
        "database",
        "all2all_search_params",
        "filtering_threshold",
        "filter_self_hits",
        "legacy_database",
        "lastdb",
        "annotation_search_params",
    ],
)
| 222 | |
# Protein domain search options for the 'diamond blastx' program.
# 'args' contains a {max_proc} str.format placeholder; 'column_types' holds
# one converter per name listed in 'output_columns'.
DIAMOND = {
    "args": " -p {max_proc} --max-target-seqs 1 --min-score 30 --freq-sd 1000 --more-sensitive",
    "output_columns": "qseqid sseqid qlen slen length ppos bitscore",
    "column_types": [str, str] + [float] * 5,
    "program": "diamond blastx",
    # keep hits whose bitscore is at least 30
    "filter_function": lambda x: x.bitscore >= 30,
    "parallelize": False,
}
# blastx search options with word size 3.
#
# Previously BLASTX_W2 was bound to the very same dict object as BLASTX_W3
# and then had its 'args' overwritten, which silently changed BLASTX_W3 as
# well: both names ended up with '-word_size 3'.  The two settings are now
# independent dicts.  BLASTX_W3 keeps the word_size 3 arguments it
# effectively had, so users of BLASTX_W3 see no change.
BLASTX_W3 = {
    'args': ' -num_alignments 1 -word_size 3 -evalue 0.01 ',
    'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
    'column_types': [str, str, float, float, float, float, float],
    'program': 'blastx',
    # keep hits whose bitscore is at least 33
    'filter_function': lambda x: x.bitscore >= 33,
}

# Same search with word size 2, as a separate dict with its own
# 'column_types' list so that changing one setting cannot affect the other.
# NOTE(review): the word_size 2 arguments are the ones the original literal
# carried before it was overwritten - confirm this is the intended value.
BLASTX_W2 = dict(
    BLASTX_W3,
    args=' -num_alignments 1 -word_size 2 -evalue 0.01 ',
    column_types=list(BLASTX_W3['column_types']),
)
| 241 | |
| 242 | |
# Nothing is assigned here.
# NOTE(review): presumably filled in at runtime by the caller - confirm.
ARGS = None

# Profile "illumina": mgblast all-to-all search against a legacy blast
# database.  {query} and {blastdb} are str.format placeholders.
ILLUMINA = Option(
    name="illumina",
    database="blastdb_legacy",
    all2all_search_params=(
        'mgblast -p 75 -W18 -UT -X40 -KT -JF -F '
        '"m D" -v100000000 -b100000000'
        ' -D4 -C 30 -H 30 -i {query} -d {blastdb}'
    ),
    filtering_threshold=FilteringThreshod(
        min_lcov=55, min_pid=90, min_ovl=0, min_scov=0, evalue=1
    ),
    filter_self_hits=False,
    legacy_database=True,
    lastdb=False,
    annotation_search_params=AnnotationParams(
        blastn={
            "args": " -task blastn -num_alignments 1 -evalue 0.01 ",
            "output_columns": "qseqid sseqid qlen slen length ppos bitscore",
            "column_types": [str, str] + [float] * 5,
            "program": "blastn",
            # keep hits longer than 30 with bitscore above 60
            "filter_function": lambda x: x.length > 30 and x.bitscore > 60,
        },
        blastx=BLASTX_W3,
        blastn_trna={
            "args": " -task blastn -num_alignments 1 -word_size 7",
            "output_columns": "qseqid sseqid qlen slen length ppos bitscore",
            "column_types": [str, str] + [float] * 5,
            "program": "blastn",
            # keep hits longer than 18 with bitscore above 60
            "filter_function": lambda x: x.length > 18 and x.bitscore > 60,
        },
    ),
)

# Same as ILLUMINA except that the mgblast -F option is 'F' instead of "m D".
ILLUMINA_DUST_OFF = ILLUMINA._replace(
    all2all_search_params=(
        'mgblast -p 75 -W18 -UT -X40 -KT -JF -F '
        'F -v100000000 -b100000000'
        ' -D4 -C 30 -H 30 -i {query} -d {blastdb}'
    ),
)

# Variant named "illumina_short": mgblast is run with -C 20 and the
# filtering thresholds differ (min_lcov 40, evalue 0.1).
ILLUMINA_SHORT = ILLUMINA._replace(
    name="illumina_short",
    all2all_search_params=(
        'mgblast -p 75 -W18 -UT -X40 -KT -JF -F '
        '"m D" -v100000000 -b100000000'
        ' -D4 -C 20 -H 30 -i {query} -d {blastdb}'
    ),
    filtering_threshold=FilteringThreshod(
        min_lcov=40, min_pid=90, min_ovl=0, min_scov=0, evalue=0.1
    ),
)
| 288 | |
| 289 | |
# Profile "oxford_nanopore": all-to-all search through last_wrapper.py
# against a last database.  The path stored in LASTAL_PARAMS is substituted
# into the command right here; {blastdb} and {query} remain as str.format
# placeholders (written with doubled braces so they survive this .format call).
OXFORD_NANOPORE = Option(
    name="oxford_nanopore",
    database='lastdb',
    all2all_search_params=('last_wrapper.py -f blasttab+ -P1 '
                           ' -m 700 -p {} '
                           ' {{blastdb}} {{query}} ').format(LASTAL_PARAMS),
    filtering_threshold=FilteringThreshod(40, 50, 0, 0, 0.01),
    filter_self_hits=True,
    legacy_database=False,
    lastdb=True,
    annotation_search_params=AnnotationParams(
        blastn={
            'args': ' -task blastn -num_alignments 1 -evalue 0.01 -word_size 11',
            'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
            'column_types': [str, str, float, float, float, float, float],
            'program': 'blastn',
            # keep hits longer than 30 with bitscore above 50
            'filter_function': lambda x: x.length > 30 and x.bitscore > 50
        },
        blastx={
            'args': ' -num_alignments 1 -word_size 2 -evalue 0.1',
            'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
            'column_types': [str, str, float, float, float, float, float],
            'program': 'blastx',
            # keep hits whose bitscore is at least 30
            'filter_function': lambda x: x.bitscore >= 30
        },
        blastn_trna={
            'args': ' -task blastn -num_alignments 1 -word_size 7',
            'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
            'column_types': [str, str, float, float, float, float, float],
            'program': 'blastn',
            # Fixed: the second condition used to test x.length again
            # ("x.length > 18 and x.length > 60"), which made the first
            # test redundant and never looked at the score.  It now checks
            # the bitscore, like the tRNA filter of the ILLUMINA profile.
            'filter_function': lambda x: x.length > 18 and x.bitscore > 60
        }
    )
)
