Mercurial > repos > petrn > repeatexplorer
comparison config.py @ 0:f6ebec6e235e draft
Uploaded
author | petrn |
---|---|
date | Thu, 19 Dec 2019 13:46:43 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f6ebec6e235e |
---|---|
1 ''' | |
2 All configuration for clustering | |
3 ''' | |
4 import os | |
5 import tempfile | |
6 from math import exp | |
7 from collections import namedtuple | |
# Directory that holds this configuration module (symlinks resolved).
MAIN_DIR = os.path.split(os.path.realpath(__file__))[0]


def add_base_path(base):
    '''Build a helper that turns relative paths into paths under ``base``.

    The returned function takes a single path ``p`` and returns
    ``os.path.join(base, p)``.
    '''
    def joined_path(p):
        '''Return ``p`` joined onto the captured base directory.'''
        full_path = os.path.join(base, p)
        return full_path

    return joined_path


# PATH("x") -> path of "x" inside the directory of this module
PATH = add_base_path(MAIN_DIR)
17 | |
# clustering general settings
# Logical directory name -> directory path.  All entries except 'TEMP' are
# relative paths (presumably relative to the analysis output directory -
# verify against the code that creates them).
DIRECTORY_TREE = {
    'libdir': 'libdir',
    'seqclust': 'seqclust',
    'assembly': 'seqclust/small_clusters_assembly',
    'blastx': 'seqclust/blastx',
    'clustering': 'seqclust/clustering',
    'clusters': 'seqclust/clustering/clusters',
    'superclusters': 'seqclust/clustering/superclusters',
    'mgblast': 'seqclust/mgblast',
    'blastn': 'seqclust/blastn',
    'prerun': 'seqclust/prerun',
    'prerun_clusters': 'seqclust/prerun/clusters',
    'sequences': 'seqclust/reads',
    'custom_databases': 'seqclust/custom_databases',
}

# Scratch directory: use $TEMP when the environment defines it, otherwise
# create a private temporary directory.
if "TEMP" in os.environ:
    DIRECTORY_TREE['TEMP'] = os.environ["TEMP"]
else:
    # mkdtemp() creates the directory and leaves it in place.  Taking
    # `.name` of a throw-away tempfile.TemporaryDirectory() would not work:
    # the unreferenced object is finalized immediately, which deletes the
    # directory again and emits a ResourceWarning.
    DIRECTORY_TREE['TEMP'] = tempfile.mkdtemp()
37 | |
def _files_for(tree):
    '''Return the logical-name -> file-path mapping for a directory tree.

    ``tree`` must provide the 'TEMP', 'prerun', 'sequences' and 'clustering'
    entries.  Entries without a directory prefix are bare file names.
    '''
    tmp = tree['TEMP']
    prerun = tree['prerun']
    reads = tree['sequences']
    clustering = tree['clustering']
    return {
        # databases and intermediate files
        'sample_db': tmp + "/sample.db",
        'sample_fasta': prerun + "/sample.fasta",
        'prerun_cls_file': prerun + "/sample_hitsort.cls",
        'filter_sequences_file': prerun + "/filter_sequences.fasta",
        'sequences_db': tmp + "/sequences.db",
        'sequences_fasta': reads + "/reads.fasta",
        'hitsort': clustering + "/hitsort",
        'hitsort_db': tmp + "/hitsort.db",
        'cls_file': clustering + "/hitsort.cls",
        # tables
        'clusters_summary_csv': "CLUSTER_TABLE.csv",
        'profrep_classification_csv': "PROFREP_CLASSIFICATION_TEMPLATE.csv",
        'superclusters_csv_summary': "SUPERCLUSTER_TABLE.csv",
        'comparative_analysis_counts_csv': "COMPARATIVE_ANALYSIS_COUNTS.csv",
        'clusters_info': ".clusters_info.csv",
        # reports
        'tarean_report_html': "tarean_report.html",
        'cluster_report_html': "cluster_report.html",
        'supercluster_report_html': 'supercluster_report.html',
        'repeat_annotation_summary_rds': 'repeat_annotation_summary.rds',
        'summarized_annotation_html': 'summarized_annotation.html',
        'main_report_html': 'index.html',
        # "{}" is a placeholder to be filled in with str.format()
        'TR_consensus_fasta': "TAREAN_consensus_rank_{}.fasta",
        'summary_histogram': 'summary_histogram.png',
        'comparative_summary_map': 'comparative_summary.png',
        "how_to_cite": "HOW_TO_CITE.html",
        'logfile': "logfile.txt",
        'contigs': "contigs.fasta",
        # reads separated by the filtering step
        'filter_omitted': reads + "/removed_filtering_positive_reads.fasta",
        'filter_kept': reads + "/kept_filtering_positive_reads.fasta",
    }


FILES = _files_for(DIRECTORY_TREE)
67 | |
68 | |
# Files to include in the output; every entry is a [source, destination]
# pair.
_HOW_TO_CITE_ENTRY = [PATH("HOW_TO_CITE.html"), FILES["how_to_cite"]]
INCLUDE = [_HOW_TO_CITE_ENTRY]
73 | |
# Entries are lookup keys (of DIRECTORY_TREE / FILES), not file names!
FILES_TO_DISCARD_AT_CLEANUP = (
    'prerun mgblast blastn blastx '
    'hitsort repeat_annotation_summary_rds'
).split()
79 | |
# Relative links used in the HTML files.  "%04d" is a placeholder for a
# zero-padded cluster / supercluster number.
HTML_LINKS = dict(
    # links between cluster and supercluster pages
    CLUSTER_TO_SUPERCLUSTER="../../superclusters/dir_SC%04d/index.html",
    SUPERCLUSTER_TO_CLUSTER="../../clusters/dir_CL%04d/index.html",
    CLUSTER_TO_CLUSTER="../dir_CL%04d/index.html",
    SUPERCLUSTER_TO_SUPERCLUSTER="../dir_SC%04d/index.html",
    CLUSTER_TO_CLUSTER_TABLE="../../../../cluster_report.html",
    # NOTE(review): this key is misspelled ("SEPER..."); it is kept as is
    # because code outside this file may look it up by this exact spelling.
    SEPERCLUSTER_TO_CLUSTER_TABLE="../../../../cluster_report.html",
    # links starting at the root directory
    ROOT_TO_CLUSTER="seqclust/clustering/clusters/dir_CL%04d/index.html",
    ROOT_TO_SUPERCLUSTER="seqclust/clustering/superclusters/dir_SC%04d/index.html",
    ROOT_TO_TAREAN="seqclust/clustering/clusters/dir_CL%04d/tarean/report.html",
    CLUSTER_TO_KMER_REPORT="tarean/report.html",
    # links starting at the index page
    INDEX_TO_TAREAN="tarean_report.html",
    INDEX_TO_CLUSTER_REPORT="cluster_report.html",
    INDEX_TO_SUPERCLUSTER_REPORT="supercluster_report.html",
    INDEX_TO_SUMMARIZED_ANNOTATION="summarized_annotation.html",
)
97 | |
98 | |
# defines how many graph edges can be processed in 1Kb RAM
EMAX = 42.6

# The value below is marked "FOR TESTING"; the alternative value that is
# commented out in this file is 5000.
MINIMUM_NUMBER_OF_INPUT_SEQUENCES = 1000

# smaller clusters are not analyzed
MINIMUM_NUMBER_OF_READS_IN_CLUSTER = 20
# smaller clusters will not be merged
MINIMUM_NUMBER_OF_READS_FOR_MERGING = 20
# min size of W param
MINIMUM_NUMBER_OF_SHARED_PAIRS_FOR_MERGING = 20

# Number of sequences used for the prerun, with and without filtering;
# the default is the "without filtering" value.
NUMBER_OF_SEQUENCES_FOR_PRERUN_WITHOUT_FILTERING = 20000
NUMBER_OF_SEQUENCES_FOR_PRERUN_WITH_FILTERING = 50000
NUMBER_OF_SEQUENCES_FOR_PRERUN = (
    NUMBER_OF_SEQUENCES_FOR_PRERUN_WITHOUT_FILTERING)

CHUNK_SIZE = 20000
112 | |
113 | |
# this parameter highly affects memory usage!
CLUSTER_EMAX = 20000000.0

CLUSTER_VMAX = 40000
SUPERCLUSTER_THRESHOLD = 0.1

# Number of processors to use - it will be set at runtime
PROC = None

RSERVE_PORT = 6311
121 | |
# some settings related to repeats annotation
ORF_THRESHOLD = 1200
PBS_THRESHOLD = 2
# threshold for rDNA detection - percentage of similarity hits
RDNA_THRESHOLD = 20
# threshold for contamination detection
CONTAMINATION_THRESHOLD = 10

# tandem ranks codes:
#   1 : putative tandem repeats - high confidence
#   2 : putative tandem repeats - low confidence
#   3 : potential LTR element
#   4 : rDNA
TANDEM_RANKS = list(range(1, 5))
SKIP_CAP3_ASSEMBLY_TANDEM_RANKS = [1]

# this is minimal proportion of graph edges!
FILTER_MIN_PROP_THRESHOLD = 0.03
# minimal size of the cluster to be considered for filtering
FILTER_MIN_SIZE_THRESHOLD = 1000
FILTER_PROPORTION_OF_KEPT = 0.1
139 | |
# NOTE(review): presumably the name of the directory with the R/perl
# library scripts - confirm against the code that reads `R`.
R = 'lib'

# external scripts, located relative to the directory of this module
RSOURCE_tarean, RSOURCE_reporting, RSOURCE_create_annotation = (
    PATH(script) for script in (
        'lib/tarean/tarean.R',
        'lib/reporting.R',
        'lib/create_annotation.R',
    )
)
LTR_DETECTION = PATH("lib/detect_LTR_insertion_sites.pl")
146 | |
# PATH to DATABASES (located relative to the directory of this module):
DNA_DATABASE, TRNA_DATABASE, SATELLITE_MODEL, LASTAL_PARAMS = (
    PATH(resource) for resource in (
        "databases/dna_database_masked.fasta",
        "databases/tRNA_database.fasta",
        "databases/satellite_model.rds",
        "databases/lastal_params",
    )
)

# for testing
# NOTE(review): these start out unset; presumably they are assigned at
# runtime - confirm against the code that uses them.
PROTEIN_DATABASE = CLASSIFICATION_HIERARCHY = CUSTOM_DNA_DATABASE = None
156 | |
# when modifying this section check if makefile has most recent target for
# protein database
PROTEIN_DATABASE_DEFAULT = "VIRIDIPLANTAE3.0"

# Option name -> (protein database fasta, classification scheme).
# The fasta path must be adjusted accordingly if you use a custom protein
# database; the classification scheme is a data.tree object stored as .rds.
# NOTE(review): 'METAZOA2.0' points to the same v3 files as 'METAZOA3.0' -
# confirm that this is intended.
PROTEIN_DATABASE_OPTIONS = {
    option: (PATH(fasta), PATH(classification))
    for option, fasta, classification in (
        ('VIRIDIPLANTAE3.0',
         "databases/protein_database_viridiplantae_v3.0.fasta",
         "databases/classification_tree_viridiplantae_v3.0.rds"),
        ('VIRIDIPLANTAE2.2',
         "databases/protein_database_viridiplantae_v2.2.fasta",
         "databases/classification_viridiplantae_tree.rds"),
        ('METAZOA2.0',
         "databases/protein_database_metazoa_v3.fasta",
         "databases/classification_tree_metazoa_v3.rds"),
        ('METAZOA3.0',
         "databases/protein_database_metazoa_v3.fasta",
         "databases/classification_tree_metazoa_v3.rds"),
    )
}
# if you change PROTEIN_DATABASE_OPTIONS, do not forget to use 'makeblastdb'
# to build the blast database and 'diamond makedb' to build the diamond
# database
174 | |
# PATH to binaries (located relative to the directory of this module)
LOUVAIN, BINARIES = PATH("louvain"), PATH("bin")
178 | |
# CAP3 output file name pattern -> [text to find, replacement template],
# or None when nothing is to be replaced in that file.  The "{}" fields in
# the keys and templates are str.format() placeholders.
CAP3_PATTERNS_REPLACE = {
    "{}.{}." + suffix: (
        None if marker is None
        else [marker + "Contig", marker + "{}Contig"])
    for suffix, marker in (
        ("contigs", ">"),
        ("aln", "* "),
        ("contigs.qual", ">"),
        ("ace", "CO "),
        ("singlets", None),
        ("info", None),
        ("contigs.links", None),
    )
}

# CAP3 output file name pattern -> name of the corresponding
# "small_clusters" file, or None when there is none.
CAP3_FILES_MAPPING = dict([
    ("{}.{}.contigs", "small_clusters.fasta"),
    ("{}.{}.contigs.qual", "small_clusters.contigs.qual"),
    ("{}.{}.aln", "small_clusters.aln"),
    ("{}.{}.ace", "small_clusters.ace"),
    ("{}.{}.singlets", None),
    ("{}.{}.info", None),
    ("{}.{}.contigs.links", None),
])

# all CAP3 file name patterns, in the order of CAP3_PATTERNS_REPLACE
CAP3_FILENAMES = list(CAP3_PATTERNS_REPLACE)
196 | |
# CAP3 output file name pattern -> descriptive file name
CAP3_FILES_GOODNAMES = {
    "{}.{}." + suffix: goodname
    for suffix, goodname in (
        ("contigs", "contigs.fasta"),
        ("contigs.qual", "contigs.qual"),
        ("aln", "contigs.aln"),
        ("ace", "contigs.ace"),
        ("singlets", "singlets.fasta"),
        ("info", "assembly.info"),
        ("contigs.links", "contigs.links"),
    )
}

# not implemented yet
CAP3_PARAMS = " -p 80 -o 40 "
209 | |
# File names used by the LTR detection step, keyed by a short label.
LTR_DETECTION_FILES = dict(
    ADJ='LTR_info.ADJ',
    LTR='LTR_info.LTR',
    PBS_BLAST='LTR_info.with_PBS_blast.csv',
    BASE='LTR_info',
)
214 | |
# options for all-2-all search and annotations

# Thresholds applied when filtering similarity hits.
# NOTE(review): the variable name is misspelled ("Threshod"); it is kept
# because other code may refer to it by this name.
FilteringThreshod = namedtuple(
    "FilteringThreshold",
    ["min_lcov", "min_pid", "min_ovl", "min_scov", "evalue"],
)

# One search configuration per annotation search.
AnnotationParams = namedtuple(
    "AnnotationParams",
    ["blastn", "blastx", "blastn_trna"],
)

# Complete set of options for one type of input data.
Option = namedtuple(
    'Options',
    [
        "name",
        "database",
        "all2all_search_params",
        "filtering_threshold",
        "filter_self_hits",
        "legacy_database",
        "lastdb",
        "annotation_search_params",
    ],
)
222 | |
# protein domain search options:
# Settings for the search done with 'diamond blastx'.  "{max_proc}" in
# 'args' is a str.format() placeholder; 'column_types' holds one converter
# per name listed in 'output_columns'; 'filter_function' keeps hits with a
# bitscore of at least 30.
DIAMOND = dict(
    args=(' -p {max_proc} --max-target-seqs 1 --min-score 30'
          ' --freq-sd 1000 --more-sensitive'),
    output_columns="qseqid sseqid qlen slen length ppos bitscore",
    column_types=[str] * 2 + [float] * 5,
    program='diamond blastx',
    filter_function=lambda x: x.bitscore >= 30,
    parallelize=False,
)
# blastx settings with word size 3.  'column_types' holds one converter
# per name listed in 'output_columns'; 'filter_function' keeps hits with a
# bitscore of at least 33.
BLASTX_W3 = {
    'args': ' -num_alignments 1 -word_size 3 -evalue 0.01 ',
    'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
    'column_types': [str, str, float, float, float, float, float],
    'program': 'blastx',
    'filter_function': lambda x: x.bitscore >= 33
}

# Same settings with word size 2.  This is deliberately an independent
# dictionary: if both names were bound to one object, assigning 'args' for
# one preset would silently change the other one as well.
BLASTX_W2 = dict(BLASTX_W3,
                 args=' -num_alignments 1 -word_size 2 -evalue 0.01 ')
# do not share the mutable list between the two presets either
BLASTX_W2['column_types'] = list(BLASTX_W3['column_types'])


# NOTE(review): unset here; presumably assigned at runtime - confirm
# against the code that uses it.
ARGS = None
244 | |
# Annotation searches for the "illumina" options.
_ILLUMINA_ANNOTATION = AnnotationParams(
    blastn={
        'args': ' -task blastn -num_alignments 1 -evalue 0.01 ',
        'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
        'column_types': [str, str, float, float, float, float, float],
        'program': 'blastn',
        # keep hits longer than 30 with a bitscore above 60
        'filter_function': lambda x: x.length > 30 and x.bitscore > 60
    },
    blastx=BLASTX_W3,
    blastn_trna={
        'args': ' -task blastn -num_alignments 1 -word_size 7',
        'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
        'column_types': [str, str, float, float, float, float, float],
        'program': 'blastn',
        # keep hits longer than 18 with a bitscore above 60
        'filter_function': lambda x: x.length > 18 and x.bitscore > 60
    }
)

# Options named "illumina": all-to-all search with mgblast against a legacy
# blast database.  "{query}" and "{blastdb}" are str.format() placeholders.
ILLUMINA = Option(
    name="illumina",
    database='blastdb_legacy',
    all2all_search_params=" ".join([
        'mgblast -p 75 -W18 -UT -X40 -KT -JF -F',
        '"m D" -v100000000 -b100000000',
        '-D4 -C 30 -H 30 -i {query} -d {blastdb}',
    ]),
    filtering_threshold=FilteringThreshod(
        min_lcov=55, min_pid=90, min_ovl=0, min_scov=0, evalue=1),
    filter_self_hits=False,
    legacy_database=True,
    lastdb=False,
    annotation_search_params=_ILLUMINA_ANNOTATION
)
274 | |
# Copy of ILLUMINA whose mgblast command passes "-F F" instead of
# '-F "m D"'.  Only the search command is replaced, so `name` stays
# "illumina".
ILLUMINA_DUST_OFF = ILLUMINA._replace(
    all2all_search_params=" ".join([
        'mgblast -p 75 -W18 -UT -X40 -KT -JF -F',
        'F -v100000000 -b100000000',
        '-D4 -C 30 -H 30 -i {query} -d {blastdb}',
    ]),
)

# Copy of ILLUMINA named "illumina_short": the mgblast command uses "-C 20"
# and the filtering thresholds differ (min_lcov 40, evalue 0.1).
ILLUMINA_SHORT = ILLUMINA._replace(
    name="illumina_short",
    all2all_search_params=" ".join([
        'mgblast -p 75 -W18 -UT -X40 -KT -JF -F',
        '"m D" -v100000000 -b100000000',
        '-D4 -C 20 -H 30 -i {query} -d {blastdb}',
    ]),
    filtering_threshold=FilteringThreshod(
        min_lcov=40, min_pid=90, min_ovl=0, min_scov=0, evalue=0.1)
)
288 | |
289 | |
# Options named "oxford_nanopore": all-to-all search with last_wrapper.py
# against a lastdb database, self hits are filtered out.
# In the search command the path LASTAL_PARAMS is filled in right here;
# the doubled braces leave "{blastdb}" and "{query}" behind as placeholders
# for a later str.format() call.
OXFORD_NANOPORE = Option(
    name="oxford_nanopore",
    database='lastdb',
    all2all_search_params=('last_wrapper.py -f blasttab+ -P1 '
                           ' -m 700 -p {} '
                           ' {{blastdb}} {{query}} ').format(LASTAL_PARAMS),
    filtering_threshold=FilteringThreshod(40, 50, 0, 0, 0.01),
    filter_self_hits=True,
    legacy_database=False,
    lastdb=True,
    annotation_search_params=AnnotationParams(
        blastn={
            'args': ' -task blastn -num_alignments 1 -evalue 0.01 -word_size 11',
            'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
            'column_types': [str, str, float, float, float, float, float],
            'program': 'blastn',
            # keep hits longer than 30 with a bitscore above 50
            'filter_function': lambda x: x.length > 30 and x.bitscore > 50
        },
        blastx={
            'args': ' -num_alignments 1 -word_size 2 -evalue 0.1',
            'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
            'column_types': [str, str, float, float, float, float, float],
            'program': 'blastx',
            # keep hits with a bitscore of at least 30
            'filter_function': lambda x: x.bitscore >= 30
        },
        blastn_trna={
            'args': ' -task blastn -num_alignments 1 -word_size 7',
            'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
            'column_types': [str, str, float, float, float, float, float],
            'program': 'blastn',
            # Keep hits longer than 18 with a bitscore above 60, the same
            # rule as the ILLUMINA tRNA filter.  The second test must be on
            # the bitscore: testing the length twice makes the first
            # condition redundant and ignores the score altogether.
            'filter_function': lambda x: x.length > 18 and x.bitscore > 60
        }
    )
)