comparison seqclust @ 8:3bc73f5dc785 draft

Uploaded
author petrn
date Fri, 20 Dec 2019 14:17:59 +0000
#!/usr/bin/env python3
''' TAndem REpeat ANalyzer '''
import os
import sys
import shutil
import subprocess
import argparse
from argparse import RawTextHelpFormatter
import logging
import shlex
import multiprocessing
# config must be loaded before seqtools,...
import config
import re
from lib import seqtools, graphtools, utils, assembly_tools
from lib import r2py

REQUIRED_VERSION = (3, 4)
if sys.version_info < REQUIRED_VERSION:
    raise Exception("\n\npython 3.4 or higher is required!\n")

# append path to louvain clustering and other binaries
os.environ['PATH'] = "{}:{}:{}".format(config.BINARIES, config.LOUVAIN,
                                       os.environ['PATH'])

LOGGER = logging.getLogger(__name__)


def get_version(path, tarean_mode):
    # get git version
    branch = "?"
    shorthash = "?"
    revcount = "?"
    tag = "?"
    try:
        branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
                                         shell=True,
                                         cwd=path).decode('ascii').strip()
        shorthash = subprocess.check_output(
            "git log --pretty=format:'%h' -n 1 ",
            shell=True,
            cwd=path).decode('ascii').strip()
        revcount = len(subprocess.check_output(
            "git log --oneline", shell=True,
            cwd=path).decode('ascii').split())
        tag = subprocess.check_output("git describe --tags --abbrev=0",
                                      cwd=path,
                                      shell=True).decode('ascii').strip()
        version_info = "{branch}-{tag}-{revcount}({shorthash})".format(
            branch=branch,
            shorthash=shorthash,
            tag=tag,
            revcount=revcount
        )
    except Exception:
        # alternatively - read it from a file
        try:
            with open(path + "/version_info.txt", 'r') as f:
                version_info = f.read()
        except FileNotFoundError:
            version_info = "version of pipeline not available!"

    ## get database versions:
    PD = "?"
    PDmd5 = "?"
    DD = "?"
    DDmd5 = "?"
    try:
        PD = os.path.basename(config.PROTEIN_DATABASE)
        PDmd5 = utils.md5checksum(config.PROTEIN_DATABASE + ".psq",
                                  fail_if_missing=not tarean_mode)
        DD = os.path.basename(config.DNA_DATABASE)
        DDmd5 = utils.md5checksum(config.DNA_DATABASE + ".nsq")
    except Exception:
        ## some problem with the databases
        pass
    version_string = (
        "-------------------------------------"
        "-------------------------------------\n"
        "PIPELINE VERSION : "
        "{version_info}\n\n"
        "PROTEIN DATABASE VERSION : {PD}\n"
        "  md5 checksum : {PDmd5}\n\n"
        "DNA DATABASE VERSION : {DD}\n"
        "  md5 checksum : {DDmd5}\n"
        "-------------------------------------"
        "-------------------------------------\n").format(
            version_info=version_info,
            PD=PD,
            PDmd5=PDmd5,
            DD=DD,
            DDmd5=DDmd5
        )

    LOGGER.info(version_string)
    return version_string
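
# For illustration only: with a hypothetical repository state the version
# line printed above would read e.g.
#   PIPELINE VERSION : master-v0.3.1-842(3bc73f5)
# following the "{branch}-{tag}-{revcount}({shorthash})" format.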


def valid_database(database_file):
    with open(database_file, 'r', encoding='ascii') as f:
        for i in f:
            if i[0] == ">":
                if not re.match(">.+#.+/*", i):
                    # TODO - make edits to correct formatting of custom database???
                    return False
    return True
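
# Example headers against the regex above, for illustration (names are
# hypothetical):
#   '>ATCOPIA93#Class_I/LTR/Ty1_copia'  -> accepted
#   '>some_repeat_without_separator'    -> rejected, no '#classification' part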


def add_databases(databases, custom_databases_dir, dbtype='nucl'):
    '''custom databases are copied to the directory tree and a blast
    database is created using makeblastdb
    '''

    databases_ok = []
    print(databases)
    for db_path, db_name in databases:
        db_destination = "{}/{}".format(custom_databases_dir, db_name)
        shutil.copyfile(db_path, db_destination)
        if not valid_database(db_destination):
            raise ValueError((
                "\n------------------------------------------------------------\n"
                "Custom database is not valid!\n"
                "A custom database of repeats contains DNA sequences in fasta format.\n"
                "The required format for IDs in a custom library is:\n"
                "   '>repeatname#class/subclass'\n"
                "Reformat the database and try again!\n"
                "-------------------------------------------------------------\n\n"
            ))

        cmd = "makeblastdb -in {0} -out {0} -dbtype {1}".format(db_destination,
                                                                dbtype)
        print(cmd)
        args = shlex.split(cmd)
        print(args)
        # subprocess.call returns the exit status; the original check_call
        # would raise on failure, leaving the warning branch unreachable
        if subprocess.call(args, stderr=sys.stdout):
            LOGGER.warning("makeblastdb on %s failed", db_name)
        else:
            databases_ok.append([db_destination, "custom_db_" + db_name])
    if len(databases_ok) == 0:
        return None
    else:
        return databases_ok
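
# Illustrative call (hypothetical path; requires makeblastdb from BLAST+ on
# PATH). `databases` has the [path, name] pair shape collected by the
# -d/--database option defined below:
#   add_databases([["/data/my_repeats.fasta", "my_repeats"]],
#                 custom_databases_dir="custom_databases")
# returns e.g. [["custom_databases/my_repeats", "custom_db_my_repeats"]]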


def meminfo():
    ''' detect physical memory and memory usage '''
    info = {}
    required_fields = [
        'MemTotal:', 'MemFree:', 'Cached:', 'SwapCached:', 'Buffers:'
    ]
    with open('/proc/meminfo', 'r') as f:
        for i in f:
            a = i.split()
            if a[0] in required_fields:
                info[a[0]] = int(a[1])
    return info
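
# Example return value (values in kB, as reported by /proc/meminfo; the
# numbers are hypothetical and the keys keep their trailing colon):
#   {'MemTotal:': 16303488, 'MemFree:': 1147840, 'Buffers:': 326544,
#    'Cached:': 5432100, 'SwapCached:': 0}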


def dict2lists(d):
    ''' convert a dict to a nested list;
    use this function to pass a dictionary to an R function
    '''
    values = list(d.values())
    keys = list(d.keys())
    return [values, keys]
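
# For example (hypothetical values; on Python 3.7+ dicts preserve insertion
# order, and the two lists always share the same iteration order):
#   >>> dict2lists({'a': 1, 'b': 2})
#   [[1, 2], ['a', 'b']]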


def show_object(obj):
    '''
    helper function for printing all public attributes;
    it does not print callable attributes, i.e. methods
    '''

    s = "Configuration--------------->\n"
    for i in dir(obj):
        # do not show private
        if i[:2] != "__":
            value = getattr(obj, i)
            if not callable(value):
                s += "{} : {}\n".format(i, value)
    s += "<---------------configuration\n"
    return s


class DataInfo():
    '''
    stores information about the state of clustering and the input data
    '''

    def __init__(self, args, paths):
        LOGGER.info("getting information about input sequences")
        self.args = args
        self.working_directory = args.output_dir
        self.input_sequences = args.sequences.name
        self.number_of_input_sequences = seqtools.SequenceSet.fasta_length(
            self.input_sequences)
        self.paired = args.paired
        self.prefix_length = args.prefix_length
        self.physical_memory = meminfo()['MemTotal:']
        self.edges_max = config.EMAX
        # set max memory
        if args.max_memory:
            self.max_memory = args.max_memory
        else:
            self.max_memory = meminfo()["MemTotal:"]
        # modify initial setup if the number of sequences is low
        if args.automatic_filtering:
            config.NUMBER_OF_SEQUENCES_FOR_PRERUN = config.NUMBER_OF_SEQUENCES_FOR_PRERUN_WITH_FILTERING

        if self.number_of_input_sequences < config.NUMBER_OF_SEQUENCES_FOR_PRERUN:
            config.NUMBER_OF_SEQUENCES_FOR_PRERUN = self.number_of_input_sequences

        # is the number of input sequences sufficient?
        if self.number_of_input_sequences < config.MINIMUM_NUMBER_OF_INPUT_SEQUENCES:
            raise WrongInputDataError(
                "provide more sequences for clustering, minimum {} is required".format(
                    config.MINIMUM_NUMBER_OF_INPUT_SEQUENCES))
        # these attributes will be set later, after clustering is done
        self.max_annotated_clusters = None
        self.max_annotated_superclusters = None
        # these attributes will be set after the prerun is performed
        self.prerun_ecount = None
        self.prerun_ecount_corrected = None
        self.sample_size = None
        self.max_number_reads_for_clustering = None
        self.mincln = None
        self.number_of_omitted_reads = 0
        LOGGER.info("sampling sequences for prerun analysis")
        sample = seqtools.SequenceSet(
            source=self.input_sequences,
            sample_size=config.NUMBER_OF_SEQUENCES_FOR_PRERUN,
            paired=self.paired,
            filename=paths.sample_db,
            fasta=paths.sample_fasta,
            rename=True)
        sample.makeblastdb(legacy=args.options.legacy_database,
                           lastdb=args.options.lastdb)
        # preliminary clustering
        self.prerun_vcount = len(sample)
        # line count
        self._prerun(sample, paths)
        # adjust the size of chunks:
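        # (with fewer than ~30 chunks' worth of reads, the chunk size is
        # shrunk so the data still splits into roughly 40 chunks; this is a
        # reading of the constants below, which are not documented here)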
        if self.number_of_reads_for_clustering < config.CHUNK_SIZE * 30:
            config.CHUNK_SIZE = round(self.number_of_reads_for_clustering / 40)

    def _prerun(self, sample, paths):
        '''Preliminary characterization of sequences using clustering
        on a small dataset stored as `sample` '''
        sample.make_chunks(chunk_size=1000)
        sample.create_hitsort(options=self.args.options)
        sample_hitsort = graphtools.Graph(source=sample.hitsort,
                                          paired=self.paired,
                                          seqids=sample.keys())
        sample_hitsort.save_indexed_graph()
        sample_hitsort.louvain_clustering(merge_threshold=0.2)
        sample_hitsort.export_cls(path=paths.prerun_cls_file)
        sample.annotate(
            config.DNA_DATABASE,
            annotation_name="dna_database",
            directory=paths.prerun,
            params=self.args.options.annotation_search_params.blastn)

        selected_tarean_contigs = []
        ecount_corrected = sample_hitsort.ecount
        vcount_corrected = sample_hitsort.vcount
        if self.args.automatic_filtering:
            prerun_cluster_info = sample_hitsort.export_clusters_files_multiple(
                min_size=10,
                directory=paths.prerun_clusters,
                sequences=sample,
                tRNA_database_path=config.TRNA_DATABASE,
                satellite_model_path=config.SATELLITE_MODEL)
            # check whether the prerun contains clusters with a large number
            # of edges - sequences from such clusters can be used for filtering
            for cl in prerun_cluster_info:
                print(cl.ecount, cl.vcount, sample_hitsort.ecount,
                      cl.tandem_rank)

                if (cl.tandem_rank in config.TANDEM_RANKS[0:2] and
                        cl.ecount / sample_hitsort.ecount >
                        config.FILTER_MIN_PROP_THRESHOLD and
                        cl.vcount > config.FILTER_MIN_SIZE_THRESHOLD):
                    selected_tarean_contigs.append(cl.tarean_contig_file)
                    ecount_corrected -= cl.ecount
                    vcount_corrected -= cl.vcount

        if selected_tarean_contigs:
            with open(paths.filter_sequences_file, 'w') as out:
                for fname in selected_tarean_contigs:
                    with open(fname, 'r') as f:
                        out.write(f.read())
            self.sequence_filter = paths.filter_sequences_file
        else:
            self.sequence_filter = None

        self.prerun_ecount = sample_hitsort.ecount
        self.prerun_ecount_corrected = ecount_corrected
        self.prerun_vcount_corrected = vcount_corrected
        self.max_number_reads_for_clustering = round((
            ((self.edges_max * self.max_memory) /
             self.prerun_ecount_corrected * self.prerun_vcount**2)**(0.5)) / 2)
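        # the estimate above assumes the edge count grows quadratically with
        # the number of reads: solving
        #   (prerun_ecount_corrected / prerun_vcount**2) * n**2
        #       <= edges_max * max_memory
        # for n gives the square-root expression; the trailing /2 appears to
        # be a safety margin (interpretation of the formula, not documented)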

        if self.max_number_reads_for_clustering >= self.number_of_input_sequences:
            self.sample_size = 0
        else:
            self.sample_size = self.max_number_reads_for_clustering

        n1 = self.sample_size if self.sample_size != 0 else self.number_of_input_sequences
        n2 = self.args.sample if self.args.sample != 0 else self.number_of_input_sequences
        self.number_of_reads_for_clustering = min(n1, n2)
        # mincln is set either from mincl or from the value specified in the
        # config, whichever is higher
        self.mincln = int(self.number_of_reads_for_clustering *
                          self.args.mincl / 100)
        if self.mincln < config.MINIMUM_NUMBER_OF_READS_IN_CLUSTER:
            self.mincln = config.MINIMUM_NUMBER_OF_READS_IN_CLUSTER
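        # for example: with 500000 reads for clustering and the default
        # mincl of 0.01, mincln = int(500000 * 0.01 / 100) = 50; if the
        # configured minimum is higher, it takes precedence (the numbers
        # here are hypothetical)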

    def __str__(self):
        s = "Data info------------------->\n"
        for i in dir(self):
            # do not show private
            if i[:2] != "__":
                value = getattr(self, i)
                if not callable(value):
                    s += "{} : {}\n".format(i, value)
        s += "<----------------------Data info\n"
        return s


class DataFiles(object):
    '''
    stores locations of data files and creates directories ...
    attributes are:
    - individual directories
    - individual files
    - lists of files or directories

    directories are created if they do not exist
    '''

    def __init__(self, working_dir, subdirs, files):
        LOGGER.info("creating directory structure")
        self.working_dir = working_dir
        # add and create directory paths
        for i in subdirs:
            d = os.path.join(self.working_dir, subdirs[i])
            os.makedirs(d, exist_ok=True)
            setattr(self, i, d)
            setattr(self, i + "__relative", subdirs[i])
        # add file paths
        for i in files:
            d = os.path.join(self.working_dir, files[i])
            setattr(self, i, d)
            setattr(self, i + "__relative", files[i])

    def __str__(self):
        s = ""
        for i in dir(self):
            # do not show private
            if i[:2] != "__":
                value = getattr(self, i)
                if not callable(value):
                    s += "{} : {}\n".format(i, value)
        return s

    def as_list(self):
        '''
        convert attributes and values to a list - suitable for passing
        values to R functions
        '''
        values = list()
        keys = list()
        for i in dir(self):
            # do not show private
            if i[:2] != "__":
                value = getattr(self, i)
                if not callable(value):
                    values.append(value)
                    keys.append(i)
        return [values, keys]

    def cleanup(self, paths):
        ''' will remove unnecessary files from the working directory '''
        for i in paths:
            fn = getattr(self, i)
            if os.path.exists(fn):
                if os.path.isdir(fn):
                    shutil.rmtree(fn, ignore_errors=False)
                else:
                    os.remove(fn)


class WrongInputDataError(Exception):
    '''Custom exception for wrong input
    '''

    def __init__(self, arg):
        super(WrongInputDataError, self).__init__(arg)
        self.msg = arg


class Range():
    '''
    This class is used to check a float range in argparse
    '''

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def __eq__(self, other):
        return self.start <= other <= self.end

    def __str__(self):
        return "float range {}..{}".format(self.start, self.end)

    def __repr__(self):
        return "float range {}..{}".format(self.start, self.end)
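
# Range relies on __eq__ so that argparse's membership test on `choices`
# accepts any value inside the interval, e.g. (mirroring --mincl below):
#   parser.add_argument('-m', '--mincl', type=float,
#                       choices=[Range(0.0, 100.0)])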


class DirectoryType(object):
    '''
    this class is similar to argparse.FileType;
    for mode 'w' it creates the directory and checks write access,
    for mode 'r' it checks the presence of the directory and read access
    '''

    def __init__(self, mode='r'):
        self._mode = mode

    def __call__(self, string):
        if self._mode == 'w':
            try:
                os.makedirs(string, exist_ok=True)
            except FileExistsError:
                raise argparse.ArgumentTypeError(
                    "Cannot create directory, '{}' is a file".format(string))
            if os.access(string, os.W_OK):
                return string
            else:
                raise argparse.ArgumentTypeError(
                    "Directory '{}' is not writable".format(string))
        if self._mode == 'r':
            if not os.path.isdir(string):
                raise argparse.ArgumentTypeError(
                    "'{}' is not a directory".format(string))
            if os.access(string, os.R_OK):
                return string
            else:
                raise argparse.ArgumentTypeError(
                    "Directory '{}' is not readable".format(string))


def get_cmdline_args():
    '''seqclust command line parser'''

    description = """RepeatExplorer:
    Repetitive sequence discovery and classification from NGS data

    """

    # arguments parsing
    parser = argparse.ArgumentParser(description=description,
                                     formatter_class=RawTextHelpFormatter)
    parser.add_argument('-p', '--paired', action='store_true', default=False)
    parser.add_argument('-A',
                        '--automatic_filtering',
                        action='store_true',
                        default=False)
    parser.add_argument(
        '-t',
        '--tarean_mode',
        action='store_true',
        default=False,
        help="analyze only tandem repeats without additional classification")
    parser.add_argument('sequences', type=argparse.FileType('r'))
    parser.add_argument('-l',
                        '--logfile',
                        type=argparse.FileType('w'),
                        default=None,
                        help='log file, logging goes to stdout if not defined')
    parser.add_argument('-m',
                        '--mincl',
                        type=float,
                        choices=[Range(0.0, 100.0)],
                        default=0.01)
    parser.add_argument(
        '-M',
        '--merge_threshold',
        type=float,
        choices=[0, Range(0.1, 1)],
        default=0,
        help=
        "threshold for mate-pair based cluster merging, default 0 - no merging")
    parser.add_argument(
        '-o',
        '--min_lcov',
        type=float,
        choices=[Range(30.0, 80.0)],
        default=55,
        help=
        "minimal overlap coverage - relative to longer sequence length, default 55")
    parser.add_argument('-c',
                        '--cpu',
                        type=int,
                        default=int(os.environ.get('TAREAN_CPU', 0)),
                        help="number of CPUs to use, if 0 use all available")
    parser.add_argument(
        '-s',
        '--sample',
        type=int,
        default=0,
        help="use only a sample of the input data; by default the maximum "
             "number of reads is used")
    parser.add_argument(
        '-P',
        '--prefix_length',
        type=int,
        default=0,
        help=("If you wish to keep part of the sequence names,\n"
              "enter the number of characters which should be\n"
              "kept (1-10) instead of zero. Use this setting if\n"
              "you are doing comparative analysis"))
    parser.add_argument('-v',
                        '--output_dir',
                        type=DirectoryType('w'),
                        default="clustering_results")
    parser.add_argument(
        '-r',
        '--max_memory',
        type=int,
        default=int(os.environ.get('TAREAN_MAX_MEM', 0)),
        help=("Maximal amount of available RAM in kB; if not set,\n"
              "clustering tries to use all available RAM"))
    parser.add_argument(
        '-d',
        '--database',
        default=None,
        help="fasta file with database for annotation and name of the database",
        nargs=2,
        action='append')

    parser.add_argument(
        "-C",
        "--cleanup",
        default=False,
        action="store_true",
        help="remove unnecessary large files from the working directory")

    parser.add_argument(
        "-k",
        "--keep_names",
        default=False,
        action="store_true",
        help="keep sequence names, by default sequences are renamed")

    parser.add_argument(
        '-a', '--assembly_min',
        default=5, type=int,
        choices=[2, 3, 4, 5],
        help=('Assembly is performed on individual clusters; by default,\n'
              'clusters with size less than 5 are not assembled. If you\n'
              'need assembly of smaller clusters, set *assembly_min*\n'
              'accordingly\n')
    )

    parser.add_argument('-tax',
                        '--taxon',
                        default=config.PROTEIN_DATABASE_DEFAULT,
                        choices=list(config.PROTEIN_DATABASE_OPTIONS.keys()),
                        help="Select taxon and protein database version"
                        )

    parser.add_argument(
        '-opt',
        '--options',
        default="ILLUMINA",
        choices=['ILLUMINA', 'ILLUMINA_DUST_OFF', 'ILLUMINA_SHORT',
                 'OXFORD_NANOPORE'])

    parser.add_argument(
        '-D',
        '--domain_search',
        default="BLASTX_W3",
        choices=['BLASTX_W2', 'BLASTX_W3', 'DIAMOND'],
        help=
        ('Detection of protein domains can be performed by either the blastx\n'
         'or the diamond program. The options are:\n'
         '  BLASTX_W2 - blastx with word size 2 (slowest, the most sensitive)\n'
         '  BLASTX_W3 - blastx with word size 3 (default)\n'
         '  DIAMOND   - diamond program (significantly faster, less sensitive)\n'
         'To use the DIAMOND option, the diamond program must be installed in your PATH'))

    args = parser.parse_args()

    # convert the option string to a namedtuple of options
    args.options = getattr(config, args.options)
    # set the protein domain search program
    args.options = args.options._replace(
        annotation_search_params=
        args.options.annotation_search_params._replace(blastx=getattr(
            config, args.domain_search)))
    return args
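
# config.ILLUMINA etc. are namedtuples, so the nested _replace above returns
# modified copies: only the blastx member of annotation_search_params is
# swapped while all other fields are kept. Illustrative sketch of the same
# pattern (field names as used above):
#   opts = config.ILLUMINA
#   opts = opts._replace(
#       annotation_search_params=opts.annotation_search_params._replace(
#           blastx=config.DIAMOND))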


def main():
    '''
    Perform graph based clustering
    '''
    # argument parsing:
    args = get_cmdline_args()
    config.ARGS = args
    logfile = args.logfile.name if args.logfile else None
    logging.basicConfig(
        filename=logfile,
        format='\n%(asctime)s - %(name)s - %(levelname)s -\n%(message)s\n',
        level=logging.INFO)
    config.PROTEIN_DATABASE, config.CLASSIFICATION_HIERARCHY = config.PROTEIN_DATABASE_OPTIONS[
        args.taxon]
    # number of CPUs to use
    pipeline_version_info = get_version(config.MAIN_DIR,
                                        tarean_mode=args.tarean_mode)
    config.PROC = args.cpu if args.cpu != 0 else multiprocessing.cpu_count()
    # TODO add kmer range specification to config - based on the technology
    r2py.create_connection()
    try:
        reporting = r2py.R(config.RSOURCE_reporting, verbose=True)
        create_annotation = r2py.R(config.RSOURCE_create_annotation,
                                   verbose=True)
        LOGGER.info(args)
        paths = DataFiles(working_dir=args.output_dir,
                          subdirs=config.DIRECTORY_TREE,
                          files=config.FILES)
        # files to be included in the output
        for src, dest in config.INCLUDE:
            shutil.copy(src, os.path.join(paths.working_dir, dest))
        # getting information about the data
        run_info = DataInfo(args, paths)
        LOGGER.info(run_info)
        LOGGER.info(show_object(config))
        # load all sequences or a sample
        sequences = seqtools.SequenceSet(
            source=run_info.input_sequences,
            sample_size=run_info.number_of_reads_for_clustering,
            paired=run_info.paired,
            filename=paths.sequences_db,
            fasta=paths.sequences_fasta,
            prefix_length=run_info.prefix_length,
            rename=not run_info.args.keep_names)
        if run_info.sequence_filter:
            n = sequences.remove_sequences_using_filter(
                run_info.sequence_filter,
                keep_proportion=config.FILTER_PROPORTION_OF_KEPT,
                omitted_sequences_file=paths.filter_omitted,
                kept_sequences_file=paths.filter_kept
            )
            run_info.number_of_omitted_reads = n
        # add custom databases if provided
        if args.database:
            config.CUSTOM_DNA_DATABASE = add_databases(
                args.database,
                custom_databases_dir=paths.custom_databases)
        sequences.makeblastdb(legacy=args.options.legacy_database,
                              lastdb=args.options.lastdb)
        LOGGER.info("chunksize: {}".format(config.CHUNK_SIZE))
        sequences.make_chunks(chunk_size=config.CHUNK_SIZE)
        sequences.create_hitsort(output=paths.hitsort, options=args.options)
        hitsort = graphtools.Graph(filename=paths.hitsort_db,
                                   source=paths.hitsort,
                                   paired=run_info.paired,
                                   seqids=sequences.keys())

        LOGGER.info('hitsort with {} reads and {} edges loaded.'.format(
            hitsort.vcount, hitsort.ecount))

        hitsort.save_indexed_graph()
        LOGGER.info('hitsort index created.')

        hitsort.louvain_clustering(merge_threshold=args.merge_threshold,
                                   cleanup=args.cleanup)
        hitsort.export_cls(path=paths.cls_file)
        hitsort.adjust_cluster_size(config.FILTER_PROPORTION_OF_KEPT,
                                    sequences.ids_kept)
        sequences.annotate(config.DNA_DATABASE,
                           annotation_name="dna_database",
                           directory=paths.blastn,
                           params=args.options.annotation_search_params.blastn)

        if config.CUSTOM_DNA_DATABASE:
            LOGGER.info('annotating with custom database')
            for db, db_name in config.CUSTOM_DNA_DATABASE:
                sequences.annotate(
                    db,
                    annotation_name=db_name,
                    directory=paths.blastn,
                    params=args.options.annotation_search_params.blastn)

        if not args.tarean_mode:
            # additional analyses - full RE run
            # this must be finished before creating clusters_info
            sequences.annotate(
                config.PROTEIN_DATABASE,
                annotation_name="protein_database",
                directory=paths.blastx,
                params=args.options.annotation_search_params.blastx)

        LOGGER.info('creating cluster graphs')
        clusters_info = hitsort.export_clusters_files_multiple(
            min_size=run_info.mincln,
            directory=paths.clusters,
            sequences=sequences,
            tRNA_database_path=config.TRNA_DATABASE,
            satellite_model_path=config.SATELLITE_MODEL)
        if not args.tarean_mode:
            LOGGER.info("assembling..")
            assembly_tools.assembly(sequences,
                                    hitsort,
                                    clusters_info,
                                    assembly_dir=paths.assembly,
                                    contigs_file=paths.contigs,
                                    min_size_of_cluster_for_assembly=args.assembly_min)

            LOGGER.info("detecting LTR in assembly..")
            for i in clusters_info:
                i.detect_ltr(config.TRNA_DATABASE)

        run_info.max_annotated_clusters = max([i.index for i in clusters_info])
        run_info.max_annotated_superclusters = max([i.supercluster
                                                    for i in clusters_info])
        # make reports
        cluster_listing = [i.listing() for i in clusters_info]
        # make the path relative to paths.cluster_info
        utils.save_as_table(cluster_listing, paths.clusters_info)
        # create the cluster_info table in the hitsort database
        graphtools.Cluster.add_cluster_table_to_database(cluster_listing,
                                                         paths.hitsort_db)
        # export files with consensus sequences, one for each rank
        consensus_files = []
        for i in config.TANDEM_RANKS:
            consensus_files.append(utils.export_tandem_consensus(
                clusters_info,
                path=paths.TR_consensus_fasta.format(i),
                rank=i))

        if not args.tarean_mode:
            LOGGER.info("Creating report for superclusters")
            create_annotation.create_all_superclusters_report(
                max_supercluster=run_info.max_annotated_superclusters,
                paths=paths.as_list(),
                libdir=paths.libdir,
                superclusters_dir=paths.superclusters,
                seqdb=paths.sequences_db,
                hitsortdb=paths.hitsort_db,
                classification_hierarchy_file=config.CLASSIFICATION_HIERARCHY,
                HTML_LINKS=dict2lists(config.HTML_LINKS))

            LOGGER.info("Creating report for individual clusters")
            for cluster in clusters_info:
                create_annotation.create_cluster_report(
                    cluster.index,
                    seqdb=paths.sequences_db,
                    hitsortdb=paths.hitsort_db,
                    classification_hierarchy_file=
                    config.CLASSIFICATION_HIERARCHY,
                    HTML_LINKS=dict2lists(config.HTML_LINKS))

        LOGGER.info("Creating main html report")
        reporting.create_main_reports(
            paths=paths.as_list(),
            N_clustering=run_info.number_of_reads_for_clustering,
            N_input=run_info.number_of_input_sequences,
            N_omit=run_info.number_of_omitted_reads,
            merge_threshold=args.merge_threshold,
            paired=run_info.paired,
            consensus_files=consensus_files,
            custom_db=bool(config.CUSTOM_DNA_DATABASE),
            tarean_mode=args.tarean_mode,
            HTML_LINKS=dict2lists(config.HTML_LINKS),
            pipeline_version_info=pipeline_version_info,
            max_memory=run_info.max_memory,
            max_number_reads_for_clustering=run_info.max_number_reads_for_clustering,
            mincln=run_info.mincln
        )

        LOGGER.info("HTML reports created")

    except:
        r2py.shutdown(config.RSERVE_PORT)
        raise
    finally:
        if args.cleanup:
            paths.cleanup(config.FILES_TO_DISCARD_AT_CLEANUP)
        else:
            LOGGER.info("copy databases to working directory")
            shutil.copy(paths.sequences_db, paths.working_dir)
            shutil.copy(paths.hitsort_db, paths.working_dir)
        # copy the log file into the working directory
        if logfile:
            shutil.copyfile(logfile, paths.logfile)


if __name__ == "__main__":
    main()
    # some error handling here: