annotate config.py @ 0:f6ebec6e235e draft

Uploaded
author petrn
date Thu, 19 Dec 2019 13:46:43 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
2 All configuration for clustering
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
3 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
4 import os
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
5 import tempfile
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
6 from math import exp
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
7 from collections import namedtuple
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# Absolute directory that contains this configuration module.
MAIN_DIR = os.path.split(os.path.realpath(__file__))[0]


def add_base_path(base):
    '''Build a resolver that anchors relative paths at *base*.

    The returned callable takes one path fragment ``p`` and returns
    ``os.path.join(base, p)``.
    '''
    return lambda p: os.path.join(base, p)


# Resolver for paths given relative to the directory of this module.
PATH = add_base_path(MAIN_DIR)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
17
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# clustering general settings
# Logical directory name -> relative path of that directory
# (presumably resolved against the output directory of a run - TODO confirm).
DIRECTORY_TREE = {
    'libdir': 'libdir',
    'seqclust': 'seqclust',
    'assembly': 'seqclust/small_clusters_assembly',
    'blastx': 'seqclust/blastx',
    'clustering': 'seqclust/clustering',
    'clusters': 'seqclust/clustering/clusters',
    'superclusters': 'seqclust/clustering/superclusters',
    'mgblast': 'seqclust/mgblast',
    'blastn': 'seqclust/blastn',
    'prerun': 'seqclust/prerun',
    'prerun_clusters': 'seqclust/prerun/clusters',
    'sequences': 'seqclust/reads',
    'custom_databases': 'seqclust/custom_databases',
}

# Scratch directory.  An explicit TEMP environment variable wins; otherwise a
# fresh private directory is created.
# tempfile.mkdtemp() is used on purpose: a TemporaryDirectory object that is
# not kept alive removes its directory again as soon as it is finalized, so
# `TemporaryDirectory().name` only yields a dangling path (plus a
# ResourceWarning).  mkdtemp() leaves the directory in place; removing it is
# the responsibility of the caller.
if "TEMP" in os.environ:
    DIRECTORY_TREE['TEMP'] = os.environ["TEMP"]
else:
    DIRECTORY_TREE['TEMP'] = tempfile.mkdtemp()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
37
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# Logical file name -> path.  Entries built from DIRECTORY_TREE carry that
# directory as a prefix; the remaining entries are bare file names.
FILES = {
    'sample_db': "/".join((DIRECTORY_TREE['TEMP'], "sample.db")),
    'sample_fasta': "/".join((DIRECTORY_TREE['prerun'], "sample.fasta")),
    'prerun_cls_file': "/".join((DIRECTORY_TREE['prerun'], "sample_hitsort.cls")),
    'filter_sequences_file': "/".join((DIRECTORY_TREE['prerun'], "filter_sequences.fasta")),
    'sequences_db': "/".join((DIRECTORY_TREE['TEMP'], "sequences.db")),
    'sequences_fasta': "/".join((DIRECTORY_TREE['sequences'], "reads.fasta")),
    'hitsort': "/".join((DIRECTORY_TREE['clustering'], "hitsort")),
    'hitsort_db': "/".join((DIRECTORY_TREE['TEMP'], "hitsort.db")),
    'cls_file': "/".join((DIRECTORY_TREE['clustering'], "hitsort.cls")),
    # tables
    'clusters_summary_csv': "CLUSTER_TABLE.csv",
    'profrep_classification_csv': "PROFREP_CLASSIFICATION_TEMPLATE.csv",
    'superclusters_csv_summary': "SUPERCLUSTER_TABLE.csv",
    'comparative_analysis_counts_csv': "COMPARATIVE_ANALYSIS_COUNTS.csv",
    'clusters_info': ".clusters_info.csv",
    # reports
    'tarean_report_html': "tarean_report.html",
    'cluster_report_html': "cluster_report.html",
    'supercluster_report_html': "supercluster_report.html",
    'repeat_annotation_summary_rds': "repeat_annotation_summary.rds",
    'summarized_annotation_html': "summarized_annotation.html",
    'main_report_html': "index.html",
    # "{}" is a placeholder left for the consumer to fill in
    'TR_consensus_fasta': "TAREAN_consensus_rank_{}.fasta",
    'summary_histogram': "summary_histogram.png",
    'comparative_summary_map': "comparative_summary.png",
    'how_to_cite': "HOW_TO_CITE.html",
    'logfile': "logfile.txt",
    'contigs': "contigs.fasta",
    # reads affected by filtering
    'filter_omitted': "/".join((DIRECTORY_TREE['sequences'],
                                "removed_filtering_positive_reads.fasta")),
    'filter_kept': "/".join((DIRECTORY_TREE['sequences'],
                             "kept_filtering_positive_reads.fasta")),
}
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
67
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
68
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# Files to include in the output, given as [source, destination] pairs.
INCLUDE = [
    [PATH("HOW_TO_CITE.html"), FILES["how_to_cite"]],
]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
73
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# Items discarded at cleanup.  Each entry is a path *attribute* (a logical
# name such as the keys used in DIRECTORY_TREE / FILES), not a file name.
FILES_TO_DISCARD_AT_CLEANUP = [
    'prerun',
    'mgblast',
    'blastn',
    'blastx',
    'hitsort',
    'repeat_annotation_summary_rds',
]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
79
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# Relative links used inside the generated HTML files.  Templates containing
# "%04d" expect a number to be substituted with the % operator.
HTML_LINKS = {
    "CLUSTER_TO_SUPERCLUSTER": "../../superclusters/dir_SC%04d/index.html",
    "SUPERCLUSTER_TO_CLUSTER": "../../clusters/dir_CL%04d/index.html",
    "CLUSTER_TO_CLUSTER": "../dir_CL%04d/index.html",
    "SUPERCLUSTER_TO_SUPERCLUSTER": "../dir_SC%04d/index.html",
    "CLUSTER_TO_CLUSTER_TABLE": "../../../../cluster_report.html",
    # Misspelled key kept so that existing lookups keep working ...
    "SEPERCLUSTER_TO_CLUSTER_TABLE": "../../../../cluster_report.html",
    # ... and the correctly spelled alias for new code.
    "SUPERCLUSTER_TO_CLUSTER_TABLE": "../../../../cluster_report.html",
    "ROOT_TO_CLUSTER": "seqclust/clustering/clusters/dir_CL%04d/index.html",
    "ROOT_TO_SUPERCLUSTER": "seqclust/clustering/superclusters/dir_SC%04d/index.html",
    "ROOT_TO_TAREAN": "seqclust/clustering/clusters/dir_CL%04d/tarean/report.html",
    "CLUSTER_TO_KMER_REPORT": "tarean/report.html",
    "INDEX_TO_TAREAN": "tarean_report.html",
    "INDEX_TO_CLUSTER_REPORT": "cluster_report.html",
    "INDEX_TO_SUPERCLUSTER_REPORT": "supercluster_report.html",
    "INDEX_TO_SUMMARIZED_ANNOTATION": "summarized_annotation.html",
}
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
97
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
98
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# Number of graph edges that can be processed per 1 kB of RAM.
EMAX = 42.6

# NOTE: value lowered for testing; the commented-out original in this file
# was 5000.
MINIMUM_NUMBER_OF_INPUT_SEQUENCES = 1000
# Smaller clusters are not analyzed.
MINIMUM_NUMBER_OF_READS_IN_CLUSTER = 20
# Smaller clusters will not be merged.
MINIMUM_NUMBER_OF_READS_FOR_MERGING = 20
# Minimal size of the W parameter.
MINIMUM_NUMBER_OF_SHARED_PAIRS_FOR_MERGING = 20

NUMBER_OF_SEQUENCES_FOR_PRERUN_WITH_FILTERING = 50000
NUMBER_OF_SEQUENCES_FOR_PRERUN_WITHOUT_FILTERING = 20000
# Default prerun size is the "without filtering" value.
NUMBER_OF_SEQUENCES_FOR_PRERUN = NUMBER_OF_SEQUENCES_FOR_PRERUN_WITHOUT_FILTERING
CHUNK_SIZE = 20000

# This parameter highly affects memory usage!
CLUSTER_EMAX = 2e7

CLUSTER_VMAX = 40000
SUPERCLUSTER_THRESHOLD = 0.1
# Number of processors to use - it will be set at runtime.
PROC = None
RSERVE_PORT = 6311
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
121
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# Settings related to repeat annotation.
ORF_THRESHOLD = 1200
PBS_THRESHOLD = 2
# Threshold for rDNA detection - percentage of similarity hits.
RDNA_THRESHOLD = 20
# Threshold for contamination detection.
CONTAMINATION_THRESHOLD = 10

# Tandem rank codes:
#   1 : putative tandem repeats - high confidence
#   2 : putative tandem repeats - low confidence
#   3 : potential LTR element
#   4 : rDNA
TANDEM_RANKS = [1, 2, 3, 4]
SKIP_CAP3_ASSEMBLY_TANDEM_RANKS = [1]

# Minimal proportion of graph edges!
FILTER_MIN_PROP_THRESHOLD = 0.03
# Minimal size of a cluster to be considered for filtering.
FILTER_MIN_SIZE_THRESHOLD = 1000
FILTER_PROPORTION_OF_KEPT = 0.1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
139
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
R = "lib"

# External scripts, resolved relative to the directory of this module.
RSOURCE_tarean = PATH("lib/tarean/tarean.R")
RSOURCE_reporting = PATH("lib/reporting.R")
RSOURCE_create_annotation = PATH("lib/create_annotation.R")
LTR_DETECTION = PATH("lib/detect_LTR_insertion_sites.pl")

# Paths to databases.
DNA_DATABASE = PATH("databases/dna_database_masked.fasta")
TRNA_DATABASE = PATH("databases/tRNA_database.fasta")
SATELLITE_MODEL = PATH("databases/satellite_model.rds")
LASTAL_PARAMS = PATH("databases/lastal_params")

# Left unset here (marked "for testing" by the original author);
# presumably assigned by the caller at runtime - TODO confirm.
PROTEIN_DATABASE = None
CLASSIFICATION_HIERARCHY = None
CUSTOM_DNA_DATABASE = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
156
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# When modifying this section, check that the makefile has the most recent
# target for the protein database.
PROTEIN_DATABASE_DEFAULT = "VIRIDIPLANTAE3.0"

# Option name -> (protein database FASTA, classification scheme).
# The classification scheme is a data.tree object stored as .rds.
# Change the paths accordingly if you use a custom protein database.
# NOTE(review): 'METAZOA2.0' points at the same v3 files as 'METAZOA3.0';
# confirm that this is intentional.
PROTEIN_DATABASE_OPTIONS = {
    "VIRIDIPLANTAE3.0": (
        PATH("databases/protein_database_viridiplantae_v3.0.fasta"),
        PATH("databases/classification_tree_viridiplantae_v3.0.rds"),
    ),
    "VIRIDIPLANTAE2.2": (
        PATH("databases/protein_database_viridiplantae_v2.2.fasta"),
        PATH("databases/classification_viridiplantae_tree.rds"),
    ),
    "METAZOA2.0": (
        PATH("databases/protein_database_metazoa_v3.fasta"),
        PATH("databases/classification_tree_metazoa_v3.rds"),
    ),
    "METAZOA3.0": (
        PATH("databases/protein_database_metazoa_v3.fasta"),
        PATH("databases/classification_tree_metazoa_v3.rds"),
    ),
}
# If you change PROTEIN_DATABASE_OPTIONS, do not forget to run 'makeblastdb'
# to build the blast database and 'diamond makedb' to build the diamond
# database.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
174
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# Paths to binaries, resolved relative to the directory of this module.
LOUVAIN = PATH('louvain')
BINARIES = PATH('bin')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
178
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# CAP3 output file-name pattern -> [text to find, replacement template],
# or None when the file has no replacement rule.
CAP3_PATTERNS_REPLACE = {
    "{}.{}.contigs": [">Contig", ">{}Contig"],
    "{}.{}.aln": ["* Contig", "* {}Contig"],
    "{}.{}.contigs.qual": [">Contig", ">{}Contig"],
    "{}.{}.ace": ["CO Contig", "CO {}Contig"],
    "{}.{}.singlets": None,
    "{}.{}.info": None,
    "{}.{}.contigs.links": None,
}

# CAP3 output file-name pattern -> "small_clusters.*" file name, or None.
CAP3_FILES_MAPPING = {
    "{}.{}.contigs": "small_clusters.fasta",
    "{}.{}.contigs.qual": "small_clusters.contigs.qual",
    "{}.{}.aln": "small_clusters.aln",
    "{}.{}.ace": "small_clusters.ace",
    "{}.{}.singlets": None,
    "{}.{}.info": None,
    "{}.{}.contigs.links": None,
}

# All CAP3 file-name patterns, in the key order of CAP3_PATTERNS_REPLACE.
CAP3_FILENAMES = list(CAP3_PATTERNS_REPLACE)

# CAP3 output file-name pattern -> descriptive file name.
CAP3_FILES_GOODNAMES = {
    "{}.{}.contigs": "contigs.fasta",
    "{}.{}.contigs.qual": "contigs.qual",
    "{}.{}.aln": "contigs.aln",
    "{}.{}.ace": "contigs.ace",
    "{}.{}.singlets": "singlets.fasta",
    "{}.{}.info": "assembly.info",
    "{}.{}.contigs.links": "contigs.links",
}

# Not implemented yet.
CAP3_PARAMS = " -p 80 -o 40 "
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
209
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# Names of the files used by LTR detection; every name starts with the
# 'BASE' value.
LTR_DETECTION_FILES = {
    "ADJ": "LTR_info.ADJ",
    "LTR": "LTR_info.LTR",
    "PBS_BLAST": "LTR_info.with_PBS_blast.csv",
    "BASE": "LTR_info",
}
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
214
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# options for all-2-all search and annotations

# Thresholds applied when filtering all-to-all hits.
# NOTE: the attribute name `FilteringThreshod` is misspelled but kept for
# backward compatibility.
FilteringThreshod = namedtuple("FilteringThreshold",
                               "min_lcov min_pid min_ovl min_scov evalue")
# Correctly spelled alias.  It also makes the module attribute match the
# class' typename, which pickle requires in order to look the class up.
FilteringThreshold = FilteringThreshod

# One search configuration each for blastn, blastx and the tRNA blastn search.
AnnotationParams = namedtuple("AnnotationParams", "blastn blastx blastn_trna")

# Complete set of options for one sequencing-technology profile.
Option = namedtuple('Options',
                    ('name database all2all_search_params filtering_threshold '
                     'filter_self_hits legacy_database lastdb annotation_search_params'))
# Alias matching the typename 'Options' (same pickling reason as above).
Options = Option
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
222
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# protein domain search options:
# Each preset is a dict with the command line arguments ('args'), the
# requested output columns, one converter per column ('column_types'), the
# program to run and a predicate ('filter_function') applied to a hit.
DIAMOND = {
    'args': ' -p {max_proc} --max-target-seqs 1 --min-score 30 --freq-sd 1000 --more-sensitive',
    'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
    'column_types': [str, str, float, float, float, float, float],
    'program': 'diamond blastx',
    'filter_function': lambda x: x.bitscore >= 30,
    'parallelize': False
}

# blastx with word size 3.
# Previously this dict was written with "-word_size 2" and then bound to
# BLASTX_W2 as well (`BLASTX_W2 = BLASTX_W3`), so the following assignment to
# BLASTX_W2['args'] silently rewrote this preset to "-word_size 3" and left
# both names pointing at one identical dict.  The effective value of
# BLASTX_W3 (word size 3) is preserved here; BLASTX_W2 now is an independent
# preset that really uses word size 2.
BLASTX_W3 = {
    'args': ' -num_alignments 1 -word_size 3 -evalue 0.01 ',
    'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
    'column_types': [str, str, float, float, float, float, float],
    'program': 'blastx',
    'filter_function': lambda x: x.bitscore >= 33
}

# blastx with word size 2: a separate copy (including its own column_types
# list) so that changing one preset can never affect the other.
BLASTX_W2 = dict(
    BLASTX_W3,
    args=' -num_alignments 1 -word_size 2 -evalue 0.01 ',
    column_types=list(BLASTX_W3['column_types']),
)


# Placeholder; presumably filled in at runtime - TODO confirm.
ARGS = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
244
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# Profile "illumina": all-to-all search with mgblast on a legacy blast
# database; protein search uses the BLASTX_W3 preset.
ILLUMINA = Option(
    name="illumina",
    database="blastdb_legacy",
    all2all_search_params=(
        'mgblast -p 75 -W18 -UT -X40 -KT -JF '
        '-F "m D" -v100000000 -b100000000 '
        '-D4 -C 30 -H 30 -i {query} -d {blastdb}'
    ),
    filtering_threshold=FilteringThreshod(
        min_lcov=55, min_pid=90, min_ovl=0, min_scov=0, evalue=1),
    filter_self_hits=False,
    legacy_database=True,
    lastdb=False,
    annotation_search_params=AnnotationParams(
        blastn={
            'args': ' -task blastn -num_alignments 1 -evalue 0.01 ',
            'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
            'column_types': [str, str, float, float, float, float, float],
            'program': 'blastn',
            # keep hits longer than 30 with a bitscore above 60
            'filter_function': lambda x: x.length > 30 and x.bitscore > 60,
        },
        blastx=BLASTX_W3,
        blastn_trna={
            'args': ' -task blastn -num_alignments 1 -word_size 7',
            'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
            'column_types': [str, str, float, float, float, float, float],
            'program': 'blastn',
            # keep hits longer than 18 with a bitscore above 60
            'filter_function': lambda x: x.length > 18 and x.bitscore > 60,
        },
    ),
)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
274
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# Copy of ILLUMINA whose mgblast command passes `-F F` instead of `-F "m D"`
# (going by the constant's name this switches DUST filtering off -
# NOTE(review): confirm against the mgblast documentation).  All other
# fields, including name="illumina", are inherited unchanged.
ILLUMINA_DUST_OFF = ILLUMINA._replace(
    all2all_search_params=(
        'mgblast -p 75 -W18 -UT -X40 -KT -JF '
        '-F F -v100000000 -b100000000 '
        '-D4 -C 30 -H 30 -i {query} -d {blastdb}'
    ),
)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
280
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# Copy of ILLUMINA named "illumina_short": the mgblast command uses -C 20
# instead of -C 30 and the filtering thresholds differ
# (min_lcov 40, evalue 0.1).
ILLUMINA_SHORT = ILLUMINA._replace(
    name="illumina_short",
    all2all_search_params=(
        'mgblast -p 75 -W18 -UT -X40 -KT -JF '
        '-F "m D" -v100000000 -b100000000 '
        '-D4 -C 20 -H 30 -i {query} -d {blastdb}'
    ),
    filtering_threshold=FilteringThreshod(
        min_lcov=40, min_pid=90, min_ovl=0, min_scov=0, evalue=0.1),
)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
288
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
289
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
# Profile "oxford_nanopore": all-to-all search with last_wrapper.py on a
# lastdb database.  LASTAL_PARAMS is substituted into the command right here;
# the doubled braces leave literal {blastdb} / {query} placeholders in the
# resulting string.
OXFORD_NANOPORE = Option(
    name="oxford_nanopore",
    database='lastdb',
    all2all_search_params=('last_wrapper.py -f blasttab+ -P1 '
                           ' -m 700 -p {} '
                           ' {{blastdb}} {{query}} ').format(LASTAL_PARAMS),
    filtering_threshold=FilteringThreshod(40, 50, 0, 0, 0.01),
    filter_self_hits=True,
    legacy_database=False,
    lastdb=True,
    annotation_search_params=AnnotationParams(
        blastn={
            'args': ' -task blastn -num_alignments 1 -evalue 0.01 -word_size 11',
            'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
            'column_types': [str, str, float, float, float, float, float],
            'program': 'blastn',
            'filter_function': lambda x: x.length > 30 and x.bitscore > 50
        },
        blastx={
            'args': ' -num_alignments 1 -word_size 2 -evalue 0.1',
            'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
            'column_types': [str, str, float, float, float, float, float],
            'program': 'blastx',
            'filter_function': lambda x: x.bitscore >= 30
        },
        blastn_trna={
            'args': ' -task blastn -num_alignments 1 -word_size 7',
            'output_columns': "qseqid sseqid qlen slen length ppos bitscore",
            'column_types': [str, str, float, float, float, float, float],
            'program': 'blastn',
            # The second condition used to test x.length again
            # (`x.length > 18 and x.length > 60`), which made the first test
            # redundant; it now checks the bitscore, as the same filter in
            # the ILLUMINA profile does.
            'filter_function': lambda x: x.length > 18 and x.bitscore > 60
        }
    )
)