repeatexplorer: seqclust comparison

comparison seqclust @ 8:3bc73f5dc785 draft

Uploaded

author	petrn
date	Fri, 20 Dec 2019 14:17:59 +0000
parents
children

comparison

equal deleted inserted replaced

-:c56807be3b72
+:3bc73f5dc785
+#!/usr/bin/env python3
+''' TAndem REpeat ANalyzer  '''
+import os
+import sys
+import shutil
+import subprocess
+import argparse
+from argparse import RawTextHelpFormatter
+import logging
+import shlex
+import multiprocessing
+# config must be loaded before seqtools,...
+import config
+import re
+from lib import seqtools, graphtools, utils, assembly_tools
+from lib import r2py
+REQUIRED_VERSION = (3, 4)
+if sys.version_info < REQUIRED_VERSION:
+raise Exception("\n\npython 3.4 or higher is required!\n")
+# append path to louvain clustering and other binaries
+os.environ['PATH'] = "{}:{}:{}".format(config.BINARIES, config.LOUVAIN,
+os.environ['PATH'])
+LOGGER = logging.getLogger(__name__)
+def get_version(path, tarean_mode):
+# get git version
+branch = "?"
+shorthash = "?"
+revcount = "?"
+tag = "?"
+try:
+branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
+shell=True,
+cwd=path).decode('ascii').strip()
+shorthash = subprocess.check_output(
+"git log --pretty=format:'%h' -n 1  ",
+shell=True,
+cwd=path).decode('ascii').strip()
+revcount = len(subprocess.check_output(
+"git log --oneline", shell=True,
+cwd=path).decode('ascii').split())
+tag = subprocess.check_output("git describe --tags --abbrev=0",
+cwd=path,
+shell=True).decode('ascii').strip()
+version_info = "{branch}-{tag}-{revcount}({shorthash})".format(
+branch=branch,
+shorthash=shorthash,
+tag=tag,
+revcount=revcount
+)
+except:
+# alernativelly - read it from file
+try:
+with open(path + "/version_info.txt", 'r') as f:
+version_info = f.read()
+except FileNotFoundError:
+version_info = "version of pipeline not available!"
+## get database versions:
+PD = "?"
+PDmd5 = "?"
+DD = "?"
+DDmd5 = "?"
+try:
+PD = os.path.basename(config.PROTEIN_DATABASE)
+PDmd5 = utils.md5checksum(config.PROTEIN_DATABASE + ".psq",
+fail_if_missing=not tarean_mode)
+DD = os.path.basename(config.DNA_DATABASE)
+DDmd5 = utils.md5checksum(config.DNA_DATABASE + ".nsq")
+except:
+## some problem with databases
+pass
+version_string = (
+"-------------------------------------"
+"-------------------------------------\n"
+"PIPELINE VERSION         : "
+"{version_info}\n\n"
+"PROTEIN DATABASE VERSION : {PD}\n"
+"            md5 checksum : {PDmd5}\n\n"
+"DNA DATABASE VERSION     : {DD}\n"
+"            md5 checksum : {DDmd5}\n"
+"-------------------------------------"
+"-------------------------------------\n").format(
+version_info=version_info,
+PD=PD,
+PDmd5=PDmd5,
+DD=DD,
+DDmd5=DDmd5
+)
+LOGGER.info(version_string)
+return version_string
+def valid_database(database_file):
+with open(database_file, 'r', encoding='ascii') as f:
+for i in f:
+if i[0] == ">":
+if not re.match(">.+#.+/*", i):
+# TODO - make edits to correct fomating of custom database???
+return False
+return True
+def add_databases(databases, custom_databases_dir, dbtype='nucl'):
+'''custom databases are copied to directory tree and blast
+database is created using makeblastdb
+'''
+databases_ok = []
+print(databases)
+for db_path, db_name in databases:
+db_destination = "{}/{}".format(custom_databases_dir, db_name)
+shutil.copyfile(db_path, db_destination)
+if not valid_database(db_destination):
+raise ValueError((
+"\n------------------------------------------------------------\n"
+"Custom database is not valid!\n"
+"Custom database of repeats are DNA sequences in fasta format.\n"
+"The required format for IDs in a custom library is : \n"
+"   '>reapeatname#class/subclass'\n"
+"Reformat the database and try again!\n"
+"-------------------------------------------------------------\n\n"
+))
+cmd = "makeblastdb -in {0} -out {0} -dbtype {1}".format(db_destination,
+dbtype)
+print(cmd)
+args = shlex.split(cmd)
+print(args)
+if subprocess.check_call(args, stderr=sys.stdout):
+Warning("makeblastdb on {} failed".format(db_name))
+else:
+databases_ok.append([db_destination, "custom_db_" + db_name])
+if len(databases_ok) == 0:
+return None
+else:
+return databases_ok
+def meminfo():
+''' detect physical memory and memory usage'''
+info = {}
+required_fields = [
+'MemTotal:', 'MemFree:', 'Cached:', 'SwapCached:', 'Buffers:'
+]
+with open('/proc/meminfo', 'r') as f:
+for i in f:
+a = i.split()
+if a[0] in required_fields:
+info[a[0]] = int(a[1])
+return info
+def dict2lists(d):
+''' convert dict to nested list
+use the funsction to pass dictionary to R function
+'''
+values = list(d.values())
+keys = list(d.keys())
+return [values, keys]
+def show_object(obj):
+'''
+helper function for printing all public atributes,
+does not print callebme atributes e.i. methods..
+'''
+s = "Configuration--------------->\n"
+for i in dir(obj):
+# do not show private
+if i[:2] != "__":
+value = getattr(obj, i)
+if not callable(value):
+s += "{} : {}\n".format(i, value)
+s += "<---------------configuration\n"
+return s
+class DataInfo():
+'''
+stores information state of clustering and data
+'''
+def __init__(self, args, paths):
+LOGGER.info("getting information about input sequences")
+self.args = args
+self.working_directory = args.output_dir
+self.input_sequences = args.sequences.name
+self.number_of_input_sequences = seqtools.SequenceSet.fasta_length(
+self.input_sequences)
+self.paired = args.paired
+self.prefix_length = args.prefix_length
+self.physical_memory = meminfo()['MemTotal:']
+self.edges_max = config.EMAX
+# set max memory
+if args.max_memory:
+self.max_memory = args.max_memory
+else:
+self.max_memory = meminfo()["MemTotal:"]
+# modify initial setup if number of sequences is low
+if args.automatic_filtering:
+config.NUMBER_OF_SEQUENCES_FOR_PRERUN = config.NUMBER_OF_SEQUENCES_FOR_PRERUN_WITH_FILTERING
+if self.number_of_input_sequences < config.NUMBER_OF_SEQUENCES_FOR_PRERUN:
+config.NUMBER_OF_SEQUENCES_FOR_PRERUN = self.number_of_input_sequences
+# is number of input sequences sufficient
+if self.number_of_input_sequences < config.MINIMUM_NUMBER_OF_INPUT_SEQUENCES:
+raise WrongInputDataError(
+"provide more sequences for clustering, minumum {} is .required".format(
+config.MINIMUM_NUMBER_OF_INPUT_SEQUENCES))
+# these atribudes will be set later after clustering is done
+self.max_annotated_clusters = None
+self.max_annotated_superclusters = None
+# the atributes will be set after prerun is performed
+self.prerun_ecount = None
+self.prerun_ecount_corrected = None
+self.sample_size = None
+self.max_number_reads_for_clustering = None
+self.mincln = None
+self.number_of_omitted_reads = 0
+LOGGER.info("sampling sequences for prerun analysis")
+sample = seqtools.SequenceSet(
+source=self.input_sequences,
+sample_size=config.NUMBER_OF_SEQUENCES_FOR_PRERUN,
+paired=self.paired,
+filename=paths.sample_db,
+fasta=paths.sample_fasta,
+rename=True)
+sample.makeblastdb(legacy=args.options.legacy_database, lastdb=args.options.lastdb)
+# preliminary clustering
+self.prerun_vcount = len(sample)
+# line count
+self._prerun(sample, paths)
+# adjust size of chunks:
+if self.number_of_reads_for_clustering < config.CHUNK_SIZE * 30:
+config.CHUNK_SIZE = round(self.number_of_reads_for_clustering / 40)
+def _prerun(self, sample, paths):
+'''Preliminary characterization sequences using
+clustering on small dataset - stored as sample '''
+sample.make_chunks(chunk_size=1000)
+sample.create_hitsort(options=self.args.options)
+sample_hitsort = graphtools.Graph(source=sample.hitsort,
+paired=self.paired,
+seqids=sample.keys())
+sample_hitsort.save_indexed_graph()
+sample_hitsort.louvain_clustering(merge_threshold=0.2)
+sample_hitsort.export_cls(path=paths.prerun_cls_file)
+sample.annotate(
+config.DNA_DATABASE,
+annotation_name="dna_database",
+directory=paths.prerun,
+params=self.args.options.annotation_search_params.blastn)
+selected_tarean_contigs = []
+ecount_corrected = sample_hitsort.ecount
+vcount_corrected = sample_hitsort.vcount
+if self.args.automatic_filtering:
+prerun_cluster_info = sample_hitsort.export_clusters_files_multiple(
+min_size=10,
+directory=paths.prerun_clusters,
+sequences=sample,
+tRNA_database_path=config.TRNA_DATABASE,
+satellite_model_path=config.SATELLITE_MODEL)
+# check of prerun contain clusters with large number of edges
+# these sequences can be used for filtering
+for cl in prerun_cluster_info:
+print(cl.ecount, cl.vcount, sample_hitsort.ecount,
+cl.tandem_rank)
+if (cl.tandem_rank in config.TANDEM_RANKS[0:2] and
+cl.ecount / sample_hitsort.ecount >
+config.FILTER_MIN_PROP_THRESHOLD and
+cl.vcount > config.FILTER_MIN_SIZE_THRESHOLD):
+selected_tarean_contigs.append(cl.tarean_contig_file)
+ecount_corrected -= cl.ecount
+vcount_corrected -= cl.vcount
+if selected_tarean_contigs:
+with open(paths.filter_sequences_file, 'w') as out:
+for fname in selected_tarean_contigs:
+with open(fname, 'r') as f:
+out.write(f.read())
+self.sequence_fiter = paths.filter_sequences_file
+else:
+self.sequence_fiter = None
+self.prerun_ecount = sample_hitsort.ecount
+self.prerun_ecount_corrected = ecount_corrected
+self.prerun_vcount_corrected = vcount_corrected
+self.max_number_reads_for_clustering = round((
+((self.edges_max * self.max_memory) /
+self.prerun_ecount_corrected * self.prerun_vcount**2)**(0.5)) / 2)
+if self.max_number_reads_for_clustering >= self.number_of_input_sequences:
+self.sample_size = 0
+else:
+self.sample_size = self.max_number_reads_for_clustering
+n1 = self.sample_size if self.sample_size != 0 else self.number_of_input_sequences
+n2 = self.args.sample if self.args.sample != 0 else self.number_of_input_sequences
+self.number_of_reads_for_clustering = min(n1, n2)
+# minlcn is set either based on mincl or value specified in config,
+# whatever is higher
+self.mincln = int(self.number_of_reads_for_clustering *
+self.args.mincl / 100)
+if self.mincln < config.MINIMUM_NUMBER_OF_READS_IN_CLUSTER:
+self.mincln = config.MINIMUM_NUMBER_OF_READS_IN_CLUSTER
+def __str__(self):
+s = "Data info------------------->\n"
+for i in dir(self):
+# do not show private
+if i[:2] != "__":
+value = getattr(self, i)
+if not callable(value):
+s += "{} : {}\n".format(i, value)
+s += "<----------------------Data info\n"
+return s
+class DataFiles(object):
+'''
+stores location of data files and create directories ...
+atributes are:
+- individual directories
+- individual files
+- list of files or directories
+directories are created if does not exist
+'''
+def __init__(self, working_dir, subdirs, files):
+LOGGER.info("creating directory structure")
+self.working_dir = working_dir
+# add and create directories paths
+for i in subdirs:
+d = os.path.join(self.working_dir, subdirs[i])
+os.makedirs(d, exist_ok=True)
+setattr(self, i, d)
+setattr(self, i + "__relative", subdirs[i])
+# add file paths
+for i in files:
+d = os.path.join(self.working_dir, files[i])
+setattr(self, i, d)
+setattr(self, i + "__relative", files[i])
+def __str__(self):
+s = ""
+for i in dir(self):
+# do not show private
+if i[:2] != "__":
+value = getattr(self, i)
+if not callable(value):
+s += "{} : {}\n".format(i, value)
+return s
+def as_list(self):
+'''
+convert attr and vaues to list - suitable for passing values to R functions
+'''
+values = list()
+keys = list()
+for i in dir(self):
+# do not show private
+if i[:2] != "__":
+value = getattr(self, i)
+if not callable(value):
+values.append(value)
+keys.append(i)
+return [values, keys]
+def cleanup(self, paths):
+''' will remove unnecessary files from working directory '''
+for i in paths:
+fn = getattr(self, i)
+if os.path.exists(fn):
+if os.path.isdir(fn):
+shutil.rmtree(fn, ignore_errors=False)
+else:
+os.remove(fn)
+class WrongInputDataError(Exception):
+'''Custom exception for wrong input
+'''
+def __init__(self, arg):
+super(WrongInputDataError, self).__init__(arg)
+self.msg = arg
+class Range():
+'''
+This class is used to check float range in argparse
+'''
+def __init__(self, start, end):
+self.start = start
+self.end = end
+def __eq__(self, other):
+return self.start <= other <= self.end
+def __str__(self):
+return "float range {}..{}".format(self.start, self.end)
+def __repr__(self):
+return "float range {}..{}".format(self.start, self.end)
+class DirectoryType(object):
+'''
+this class is similar to argparse.FileType
+for mode 'w' creates and check the access to the directory
+for mode 'r' check the presence of the dictory and accesibility
+'''
+def __init__(self, mode='r'):
+self._mode = mode
+def __call__(self, string):
+if self._mode == 'w':
+try:
+os.makedirs(string, exist_ok=True)
+except FileExistsError:
+raise argparse.ArgumentTypeError(
+"Cannot create directory, '{}' is a file".format(string))
+if os.access(string, os.W_OK):
+return string
+else:
+raise argparse.ArgumentTypeError(
+"Directory '{}' is not writable".format(string))
+if self._mode == 'r':
+if not os.path.isdir(string):
+raise argparse.ArgumentTypeError(
+"'{}' is not a directory".format(string))
+if os.access(string, os.R_OK):
+return string
+else:
+raise argparse.ArgumentTypeError(
+"Directory '{}' is not readable".format(string))
+def get_cmdline_args():
+'''seqclust command line parser'''
+description = """RepeatExplorer:
+Repetitive sequence discovery and clasification from NGS data
+"""
+# arguments parsing
+parser = argparse.ArgumentParser(description=description,
+formatter_class=RawTextHelpFormatter)
+parser.add_argument('-p', '--paired', action='store_true', default=False)
+parser.add_argument('-A',
+'--automatic_filtering',
+action='store_true',
+default=False)
+parser.add_argument(
+'-t',
+'--tarean_mode',
+action='store_true',
+default=False,
+help="analyze only tandem reapeats without additional classification")
+parser.add_argument('sequences', type=argparse.FileType('r'))
+parser.add_argument('-l',
+'--logfile',
+type=argparse.FileType('w'),
+default=None,
+help='log file, logging goes to stdout if not defines')
+parser.add_argument('-m',
+'--mincl',
+type=float,
+choices=[Range(0.0, 100.0)],
+default=0.01)
+parser.add_argument(
+'-M',
+'--merge_threshold',
+type=float,
+choices=[0, Range(0.1, 1)],
+default=0,
+help=
+"threshold for mate-pair based cluster merging, default 0 - no merging")
+parser.add_argument(
+'-o',
+'--min_lcov',
+type=float,
+choices=[Range(30.0, 80.0)],
+default=55,
+help=
+"minimal overlap coverage - relative to longer sequence length, default 55")
+parser.add_argument('-c',
+'--cpu',
+type=int,
+default=int(os.environ.get('TAREAN_CPU', 0)),
+help="number of cpu to use, if 0 use max available")
+parser.add_argument(
+'-s',
+'--sample',
+type=int,
+default=0,
+help="use only sample of input data[by default max reads is used")
+parser.add_argument(
+'-P',
+'--prefix_length',
+type=int,
+default=0,
+help=("If you wish to keep part of the sequences name,\n"
+" enter the number of characters which should be \n"
+"kept (1-10) instead of zero. Use this setting if\n"
+" you are doing comparative analysis"))
+parser.add_argument('-v',
+'--output_dir',
+type=DirectoryType('w'),
+default="clustering_results")
+parser.add_argument(
+'-r',
+'--max_memory',
+type=int,
+default=int(os.environ.get('TAREAN_MAX_MEM', 0)),
+help=("Maximal amount of available RAM in kB if not set\n"
+"clustering tries to use whole available RAM"))
+parser.add_argument(
+'-d',
+'--database',
+default=None,
+help="fasta file with database for annotation and name of database",
+nargs=2,
+action='append')
+parser.add_argument(
+"-C",
+"--cleanup",
+default=False,
+action="store_true",
+help="remove unncessary large files from working directory")
+parser.add_argument(
+"-k",
+"--keep_names",
+default=False,
+action="store_true",
+help="keep sequence names, by default sequences are renamed")
+parser.add_argument(
+'-a', '--assembly_min',
+default=5, type=int,
+choices=[2,3,4,5],
+help=('Assembly is performed on individual clusters, by default \n'
+'clusters with size less then 5 are not assembled. If you \n'
+'want need assembly of smaller cluster set *assmbly_min* \n'
+'accordingly\n')
+)
+parser.add_argument('-tax',
+'--taxon',
+default=config.PROTEIN_DATABASE_DEFAULT,
+choices=list(config.PROTEIN_DATABASE_OPTIONS.keys()),
+help="Select taxon and protein database version"
+)
+parser.add_argument(
+'-opt',
+'--options',
+default="ILLUMINA",
+choices=['ILLUMINA','ILLUMINA_DUST_OFF', 'ILLUMINA_SHORT', 'OXFORD_NANOPORE'])
+parser.add_argument(
+'-D',
+'--domain_search',
+default="BLASTX_W3",
+choices=['BLASTX_W2', 'BLASTX_W3', 'DIAMOND'],
+help=
+('Detection of protein domains can be performed by either blastx or\n'
+' diamond" program. options are:\n'
+'  BLASTX_W2 - blastx with word size 2 (slowest, the most sesitive)\n'
+'  BLASTX_W3 - blastx with word size 3 (default)\n'
+'  DIAMOND   - diamond program (significantly faster, less sensitive)\n'
+'To use this option diamond program must be installed in your PATH'))
+args = parser.parse_args()
+# covert option string to namedtuple of options
+args.options = getattr(config, args.options)
+# set protein database
+args.options = args.options._replace(
+annotation_search_params=
+args.options.annotation_search_params._replace(blastx=getattr(
+config, args.domain_search)))
+return args
+def main():
+'''
+Perform graph based clustering
+'''
+# argument parsing:
+args = get_cmdline_args()
+config.ARGS = args
+logfile = args.logfile.name if args.logfile else None
+logging.basicConfig(
+filename=logfile,
+format='\n%(asctime)s - %(name)s - %(levelname)s -\n%(message)s\n',
+level=logging.INFO)
+config.PROTEIN_DATABASE, config.CLASSIFICATION_HIERARCHY = config.PROTEIN_DATABASE_OPTIONS[
+args.taxon]
+# number of CPU to use
+pipeline_version_info = get_version(config.MAIN_DIR, tarean_mode = args.tarean_mode)
+config.PROC = args.cpu if args.cpu != 0 else multiprocessing.cpu_count()
+# TODO add kmer range specification to config - based on the technology
+r2py.create_connection()
+try:
+reporting = r2py.R(config.RSOURCE_reporting, verbose=True)
+create_annotation = r2py.R(config.RSOURCE_create_annotation,
+verbose=True)
+LOGGER.info(args)
+paths = DataFiles(working_dir=args.output_dir,
+subdirs=config.DIRECTORY_TREE,
+files=config.FILES)
+# files to be included in output
+for src, dest in config.INCLUDE:
+shutil.copy(src, os.path.join(paths.working_dir, dest))
+# geting information about data
+run_info = DataInfo(args, paths)
+LOGGER.info(run_info)
+LOGGER.info(show_object(config))
+# load all sequences or sample
+sequences = seqtools.SequenceSet(
+source=run_info.input_sequences,
+sample_size=run_info.number_of_reads_for_clustering,
+paired=run_info.paired,
+filename=paths.sequences_db,
+fasta=paths.sequences_fasta,
+prefix_length=run_info.prefix_length,
+rename=not run_info.args.keep_names)
+if run_info.sequence_fiter:
+n = sequences.remove_sequences_using_filter(
+run_info.sequence_fiter,
+keep_proportion=config.FILTER_PROPORTION_OF_KEPT,
+omitted_sequences_file=paths.filter_omitted,
+kept_sequences_file=paths.filter_kept
+)
+run_info.number_of_omitted_reads = n
+# add custom databases if provided
+if args.database:
+config.CUSTOM_DNA_DATABASE = add_databases(
+args.database,
+custom_databases_dir=paths.custom_databases)
+sequences.makeblastdb(legacy=args.options.legacy_database, lastdb=args.options.lastdb)
+LOGGER.info("chunksize: {}".format(config.CHUNK_SIZE))
+sequences.make_chunks(chunk_size=config.CHUNK_SIZE)
+sequences.create_hitsort(output=paths.hitsort, options=args.options)
+hitsort = graphtools.Graph(filename=paths.hitsort_db,
+source=paths.hitsort,
+paired=run_info.paired,
+seqids=sequences.keys())
+LOGGER.info('hitsort with {} reads and {} edges loaded.'.format(
+hitsort.vcount, hitsort.ecount))
+hitsort.save_indexed_graph()
+LOGGER.info('hitsort index created.')
+hitsort.louvain_clustering(merge_threshold=args.merge_threshold,
+cleanup=args.cleanup)
+hitsort.export_cls(path=paths.cls_file)
+hitsort.adjust_cluster_size(config.FILTER_PROPORTION_OF_KEPT,
+sequences.ids_kept)
+sequences.annotate(config.DNA_DATABASE,
+annotation_name="dna_database",
+directory=paths.blastn,
+params=args.options.annotation_search_params.blastn)
+if config.CUSTOM_DNA_DATABASE:
+LOGGER.info('annotating with custom database')
+for db, db_name in config.CUSTOM_DNA_DATABASE:
+sequences.annotate(
+db,
+annotation_name=db_name,
+directory=paths.blastn,
+params=args.options.annotation_search_params.blastn)
+if not args.tarean_mode:
+# additional analyses - full RE run
+# this must be finished befor creating clusters_info
+sequences.annotate(
+config.PROTEIN_DATABASE,
+annotation_name="protein_database",
+directory=paths.blastx,
+params=args.options.annotation_search_params.blastx)
+## annotating using customa databasesreplace
+LOGGER.info('creating cluster graphs')
+clusters_info = hitsort.export_clusters_files_multiple(
+min_size=run_info.mincln,
+directory=paths.clusters,
+sequences=sequences,
+tRNA_database_path=config.TRNA_DATABASE,
+satellite_model_path=config.SATELLITE_MODEL)
+if not args.tarean_mode:
+LOGGER.info("assembling..")
+assembly_tools.assembly(sequences,
+hitsort,
+clusters_info,
+assembly_dir=paths.assembly,
+contigs_file=paths.contigs,
+min_size_of_cluster_for_assembly=args.assembly_min)
+LOGGER.info("detecting LTR in assembly..")
+for i in clusters_info:
+i.detect_ltr(config.TRNA_DATABASE)
+run_info.max_annotated_clusters = max([i.index for i in clusters_info])
+run_info.max_annotated_superclusters = max([i.supercluster
+for i in clusters_info])
+# make reports
+cluster_listing = [i.listing() for i in clusters_info]
+# make path relative to paths.cluster_info
+utils.save_as_table(cluster_listing, paths.clusters_info)
+# creates table cluster_info in hitsort database
+graphtools.Cluster.add_cluster_table_to_database(cluster_listing,
+paths.hitsort_db)
+# export files for consensus sequences, one for each ranks
+consensus_files = []
+for i in config.TANDEM_RANKS:
+consensus_files.append(utils.export_tandem_consensus(
+clusters_info,
+path=paths.TR_consensus_fasta.format(i),
+rank=i))
+if not args.tarean_mode:
+LOGGER.info("Creating report for superclusters")
+create_annotation.create_all_superclusters_report(
+max_supercluster=run_info.max_annotated_superclusters,
+paths=paths.as_list(),
+libdir=paths.libdir,
+superclusters_dir=paths.superclusters,
+seqdb=paths.sequences_db,
+hitsortdb=paths.hitsort_db,
+classification_hierarchy_file=config.CLASSIFICATION_HIERARCHY,
+HTML_LINKS=dict2lists(config.HTML_LINKS))
+LOGGER.info("Creating report for individual clusters")
+for cluster in clusters_info:
+create_annotation.create_cluster_report(
+cluster.index,
+seqdb=paths.sequences_db,
+hitsortdb=paths.hitsort_db,
+classification_hierarchy_file=
+config.CLASSIFICATION_HIERARCHY,
+HTML_LINKS=dict2lists(config.HTML_LINKS))
+LOGGER.info("Creating main html report")
+reporting.create_main_reports(
+paths=paths.as_list(),
+N_clustering=run_info.number_of_reads_for_clustering,
+N_input=run_info.number_of_input_sequences,
+N_omit=run_info.number_of_omitted_reads,
+merge_threshold=args.merge_threshold,
+paired=run_info.paired,
+consensus_files=consensus_files,
+custom_db=bool(config.CUSTOM_DNA_DATABASE),
+tarean_mode=args.tarean_mode,
+HTML_LINKS=dict2lists(config.HTML_LINKS),
+pipeline_version_info=pipeline_version_info,
+max_memory=run_info.max_memory,
+max_number_reads_for_clustering=run_info.max_number_reads_for_clustering,
+mincln=run_info.mincln
+)
+LOGGER.info("Html report reports created")
+except:
+r2py.shutdown(config.RSERVE_PORT)
+raise
+finally:
+if args.cleanup:
+paths.cleanup(config.FILES_TO_DISCARD_AT_CLEANUP)
+else:
+LOGGER.info("copy databases to working directory")
+shutil.copy(paths.sequences_db, paths.working_dir)
+shutil.copy(paths.hitsort_db, paths.working_dir)
+# copy log file inside working directory
+if logfile:
+shutil.copyfile(logfile, paths.logfile)
+if __name__ == "__main__":
+main()
+# some error handling here:

Mercurial > repos > petrn > repeatexplorer

comparison seqclust @ 8:3bc73f5dc785 draft