annotate lib/seqtools.py @ 0:f6ebec6e235e draft

Uploaded
author petrn
date Thu, 19 Dec 2019 13:46:43 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1 #!/usr/bin/env python3
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
2 import logging
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
3 logger = logging.getLogger(__name__)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
4 import itertools
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
5 import os
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
6 import sys
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
7 import random
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
8 import sqlite3
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
9 import subprocess
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
10 import shlex # for command line arguments split
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
11 from collections import namedtuple
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
12 from lib.utils import format_query
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
13 from lib.parallel.parallel import parallel2 as parallel
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
14 from lib.parallel.parallel import get_max_proc
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
15 REQUIRED_VERSION = (3, 4)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
16 MAX_PRINT = 10
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
17
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
18 if sys.version_info < REQUIRED_VERSION:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
19 raise Exception("\n\npython 3.4 or higher is required!\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
20
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
21 # additional functions
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
22
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
23
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
24
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
25 def _hitsort_worker(query, blastdb, output, options):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
26
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
27 # output from blast is parsed from stdout
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
28 cmd = options.all2all_search_params.format(query = query, blastdb = blastdb)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
29 print(cmd)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
30 min_lcov, min_pid, min_ovl, min_scov, evalue_max = options.filtering_threshold
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
31 pair2insert = ''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
32 signs ={'plus':'+', 'minus':'-'}
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
33 with open(output, 'w') as f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
34 with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE) as p:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
35 for i in p.stdout:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
36 items = i.decode('utf-8').split()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
37 if options.filter_self_hits:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
38 if items[4] == items[0]:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
39 continue
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
40 evalue = float(items[10])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
41 ovl_q = abs(float(items[2]) - float(items[3])) + 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
42 ovl_h = abs(float(items[6]) - float(items[7])) + 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
43 if (ovl_q >= min_ovl or ovl_h >= min_ovl) and float(items[8]) >= min_pid:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
44 if float(items[1]) >= float(items[5]):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
45 lcov = ovl_q * 100.0 / float(items[1])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
46 scov = ovl_h * 100.0 / float(items[5])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
47 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
48 lcov = ovl_q * 100.0 / float(items[5])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
49 scov = ovl_h * 100.0 / float(items[1])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
50 # positive line:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
51 if lcov >= min_lcov and scov >= min_scov and evalue_max > evalue:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
52 pr = [items[0], items[4]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
53 # previous HSP
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
54 if pair2insert != "{0}\t{1}".format(pr[0], pr[1]):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
55 pair2insert = "{0}\t{1}".format(pr[0], pr[1])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
56 if items[11] in ['plus', 'minus']:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
57 items[11] = signs[items[11]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
58 f.write("{0}\t{1}\t{2}\n".format(pair2insert, items[9], "\t".join(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
59 items[k] for k in [1, 2, 3, 5, 6, 7, 8, 10, 11])))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
60
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
61 def blast_with_filter(fasta_file_filter, blastdb):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
62 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
63 Return list of sequences query id which has similarity to filter
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
64 It uses mgblast for search
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
65 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
66 params = '-p 85 -W18 -UT -X40 -KT -JF -F "m D" -v100000000 -b100000000 -D4 -C 30 -H 30'
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
67 positive_hits = set()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
68 min_pid = 90
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
69 min_ovl_perc = 90
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
70 cmd = " ".join(['mgblast',
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
71 '-i', fasta_file_filter,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
72 '-d', blastdb,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
73 params
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
74 ])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
75 with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE) as p:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
76 for i in p.stdout:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
77 items = i.decode('utf-8').split()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
78 ovl_perc = (abs(float(items[6]) - float(items[7])) + 1) / float(items[5]) * 100
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
79 pid = float(items[8])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
80 if pid > min_pid and ovl_perc > min_ovl_perc:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
81 positive_hits.add(items[4])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
82 return(positive_hits)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
83
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
84
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
85
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
86 # TODO test task
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
87 # predefine blast params
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
88 def blast_worker(query, blastdb, output, params):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
89 if params['program'] in ['blastx', 'blastn']:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
90 default_columns = ' -outfmt "6 {}"'.format(params['output_columns'])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
91 cmd = "{} -query {} -db {} {} {}".format(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
92 params['program'], query, blastdb, params['args'], default_columns)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
93 elif params['program']=='diamond blastx':
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
94 # diomand have slightly different format than blastx
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
95 default_columns = ' --outfmt 6 {}'.format(params['output_columns'])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
96 cmd = "{} -q {} -d {} {} {}".format(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
97 params['program'], query, blastdb, params['args'], default_columns)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
98 print(cmd)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
99 preceeding_pair = ['', '']
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
100 BlastValues = namedtuple("BlastValues", params['output_columns'])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
101 with open(output, 'wb') as f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
102 with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE) as p:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
103 for i in p.stdout:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
104 items = i.decode('utf-8').split()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
105 blast_values = BlastValues(*[f(i) for i, f in zip(items, params['column_types'])])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
106 #qseqid, sseqid, qlen, slen, length, ppos, bitscore = [
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
107 # f(i) for i, f in zip(items, params['column_types'])]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
108 if params['filter_function'](blast_values):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
109 if preceeding_pair != [blast_values.qseqid, blast_values.sseqid]:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
110 f.write(i)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
111 preceeding_pair = [blast_values.qseqid, blast_values.sseqid]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
112
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
113
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
114 class Sequence:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
115
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
116 def __init__(self, seq, name, paired=False):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
117 # the mode os seq storage can be changed later to make it more
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
118 # memory efficient
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
119 self._seq = bytes(str(seq), "ascii")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
120 self.name = str(name)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
121
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
122 @property
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
123 def seq(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
124 return self._seq.decode("utf-8")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
125
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
126 @seq.setter
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
127 def seq(self, value):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
128 self._seq = bytes(str(value), "ascii")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
129
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
130 def __str__(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
131 return "{0} : {1}".format(self.name, self.seq)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
132
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
133 @staticmethod
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
134 def read_fasta(fasta_file_name):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
135 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
136 generator - reads sequences from fasta file
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
137 return sequence one by one
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
138 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
139 with open(fasta_file_name, 'r') as f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
140 header = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
141 seqstr = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
142 for rawline in f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
143 line = rawline.strip()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
144 if line == "":
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
145 continue
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
146 if line[0] == ">":
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
147 if header and seqstr:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
148 yield Sequence(seqstr, header)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
149 # reset
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
150 seqstr = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
151 header = line[1:]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
152 elif seqstr:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
153 Warning("sequence was not preceeded by header")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
154 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
155 header = line[1:]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
156 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
157 seqstr = line if not seqstr else seqstr + line
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
158 # skip empty lines:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
159 if header and seqstr:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
160 yield Sequence(seqstr, header)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
161 return
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
162
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
163 def write2fasta(self, file_object):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
164 file_object.write(">{0}\n{1}\n".format(self.name, self.seq))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
165
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
166
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
167 class SequenceSet:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
168
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
169 def __init__(self, filename=None, source=None, sample_size=0, new=False, paired=False, prefix_length=0, rename=False, seed=123, fasta=None):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
170 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
171 filename: path to sqlite database, if none database is stored in memory
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
172 source: path to fasta file, if none empty database is created
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
173 sample_size: use only sample of sample_size from fasta file, if 0 all is used
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
174 new: should be old database created?
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
175 paired - paired reads
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
176 prefix_length -int
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
177 rename -- use int as names
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
178
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
179 creates SequenceSet, either empty or from fasta file,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
180 it can be stored as dictionary or in shelve in file
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
181 only sample of all sequences can be loaded. if sample_size = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
182 all sequences are loaded
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
183 if source is give, new database is created - if filename exist - it will be deleted!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
184 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
185 #======================================================================
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
186 # TODO name checking
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
187 # TODO detect unique prefixes
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
188 #======================================================================
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
189 # type of storage
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
190 random.seed(seed)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
191
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
192 self.source = source
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
193 self.sample_size = sample_size
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
194 self.paired = paired
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
195 self.filename = filename
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
196 self.prefix_length = prefix_length
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
197 self.prefix_codes = {}
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
198 self.rename = rename
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
199 self.fasta = fasta
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
200 self._length = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
201 self._original_length = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
202 self.blastdb = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
203 self.blastdb_legacy = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
204 self.chunks = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
205 self.ids_kept = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
206 self.annotations = []
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
207 self.hitsort = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
208 if filename:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
209 logger.debug("Creating database in file")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
210 if os.path.isfile(filename) and (new or source):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
211 # remove old database if exists
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
212 os.remove(filename)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
213 # sqlite database in file
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
214 self.conn = sqlite3.connect(filename)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
215 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
216 # sqlite database in memory
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
217 logger.debug("Creating database in memory")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
218 self.conn = sqlite3.connect(":memory:")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
219 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
220 c.execute("create table sequences (name text, sequence text)")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
221
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
222 if source:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
223 self._read_from_fasta()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
224 c.execute("CREATE TABLE prefix_codes (prefix TEXT, N INTEGER)")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
225 self.conn.executemany("INSERT INTO prefix_codes (prefix, N) values (?,?)", self.prefix_codes.items())
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
226 self.conn.commit()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
227 self._commit_immediately = True
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
228 self.makeblastdb(legacy=True)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
229 # store some attribudes in database
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
230 self._update_database()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
231 logger.info("sequences loaded")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
232 logger.info(self)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
233
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
234 def _update_database(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
235 # pass all atributes - which are either float, ind and str to extra table
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
236 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
237 stored_attributes = [
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
238 ("sample_size", "integer"),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
239 ("_length", "integer"),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
240 ("_original_length", "integer"),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
241 ("paired", "integer"),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
242 ]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
243 columns = ", ".join(i[0] + " " + i[1] for i in stored_attributes)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
244 try:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
245 c.execute((
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
246 "CREATE TABLE seqinfo ( {} )".format(columns)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
247 ))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
248 except sqlite3.OperationalError:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
249 pass # table already exists
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
250 # get current values
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
251 values = tuple(getattr(self, i[0]) for i in stored_attributes)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
252 placeholder = ", ".join(["?"] * len(values))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
253 c.execute("DELETE FROM seqinfo")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
254 c.execute("INSERT INTO seqinfo VALUES ({})".format(placeholder), values)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
255
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
256
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
257 def _read_from_fasta(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
258 # changed will be commited after whole file is loaded - this is faster
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
259 self._commit_immediately = False
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
260 if self.fasta:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
261 f = open(self.fasta, mode='w')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
262 # define sampling
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
263 if self.sample_size:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
264 logger.debug(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
265 'sampling sequences: {} sample size'.format(self.sample_size))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
266 N = self.fasta_length(self.source)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
267 if self.paired:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
268 s = itertools.chain(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
269 [[i, i + 1] for i in sorted(random.sample(range(1, N, 2), int(self.sample_size / 2)))])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
270
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
271 sample = itertools.chain(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
272 [item for sublist in s for item in sublist])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
273
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
274 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
275 sample = itertools.chain(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
276 sorted(random.sample(range(1, N + 1), self.sample_size)))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
277 logger.debug("sampling unpaired reads")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
278 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
279 # no sampling - use all
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
280 sample = itertools.count(1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
281
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
282 # define counter:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
283 if self.rename:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
284 if self.paired:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
285 counter = itertools.count(1, 0.5)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
286 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
287 counter = itertools.count(1, 1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
288 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
289 counter = itertools.cycle([""])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
290 # define pairs:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
291 if self.paired:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
292 pair = itertools.cycle(['f', 'r'])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
293 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
294 pair = itertools.cycle([''])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
295 position = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
296
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
297 seq2write = next(sample)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
298
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
299 for i in Sequence.read_fasta(self.source):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
300 position += 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
301 if position == seq2write:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
302
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
303 # create name:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
304 prefix = i.name[0:self.prefix_length] # could be empty ""
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
305 if self.rename:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
306 i.name = "{0}{1}{2}".format(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
307 prefix,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
308 int(next(counter)),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
309 next(pair)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
310 )
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
311 if self.prefix_length:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
312 if prefix in self.prefix_codes:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
313 self.prefix_codes[prefix] += 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
314 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
315 self.prefix_codes[prefix] = 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
316 self[i.name] = i.seq
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
317 if self.fasta:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
318 i.write2fasta(f)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
319 try:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
320 seq2write = next(sample)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
321 except StopIteration:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
322 # no more sequences to sample
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
323 break
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
324
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
325
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
326 self.conn.commit() # save changes
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
327 if self.fasta:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
328 f.close()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
329 self._commit_immediately = True
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
330
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
331 def __getitem__(self, item):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
332 # item cannot be empty!!!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
333 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
334 c.execute("select * from sequences where name= '{}'".format(item))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
335 # try:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
336 name, sequence = c.fetchone() # should be only one
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
337 # except TypeError:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
338 # return None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
339 return sequence
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
340
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
341 def __setitem__(self, item, value):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
342 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
343 c.execute(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
344 "insert into sequences values ( '{0}', '{1}')".format(item, value))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
345 if self._commit_immediately:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
346 self.conn.commit() # save changes
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
347
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
348 # def __iter__(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
349 # self.c.execute("select name from sequences")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
350 # for i in self.c:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
351 # yield i[0]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
352
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
353 def __iter__(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
354 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
355 c.execute("select name from sequences")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
356 for i in c:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
357 yield i[0]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
358
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
359
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
360 # def __iter__(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
361 # for i in range(1,len(self)):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
362 # yield i
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
363 def items(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
364 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
365 c.execute("select name, sequence from sequences")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
366 for name, seq in c:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
367 yield Sequence(seq, name)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
368
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
369 def toDict(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
370 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
371 c.execute("select name, sequence from sequences")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
372 d = {}
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
373 for name, seq in c:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
374 d[name]=seq
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
375 return(d)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
376
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
377 def __len__(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
378 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
379 c.execute("select count(*) from sequences")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
380 length = c.fetchone()[0]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
381 return(length)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
382
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
383 def __str__(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
384 out = ""
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
385 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
386 c.execute("select * from sequences limit {0}".format(MAX_PRINT))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
387 for n, s in c:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
388 out = out + "{0} : {1}\n".format(n, s)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
389 out = out + "...\n"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
390 out = out + str(len(self)) + " sequence total\n"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
391 return out
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
392
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
393 def insert(self, sequence, commit=True):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
394 self._commit_immediately = commit
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
395 self[sequence.name] = sequence.seq
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
396 self._commit_immediately = True
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
397
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
398 def keys(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
399 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
400 this will get names of sequences
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
401 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
402 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
403 c.execute('select name from sequences order by rowid')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
404 names = []
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
405 for i in c.fetchall():
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
406 names.append(i[0])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
407 return names
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
408
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
409 def sample(self, size, seed=123):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
410 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
411 generator - reproducible sampling sequences
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
412 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
413 max_index = len(self)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
414 # sample = seqtools.SequenceSet(source=info.input_sequences, )
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
415 if self.paired:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
416 size = int(size / 2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
417 step = 2
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
418 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
419 step = 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
420 random.seed(seed)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
421 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
422 for i in sorted(random.sample(range(1, max_index, step), size)):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
423 c.execute(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
424 "select name, sequence from sequences where rowid={}".format(i))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
425 name, sequence = c.fetchone()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
426 yield Sequence(sequence, name)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
427 if self.paired:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
428 c.execute(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
429 "select name, sequence from sequences where rowid={}".format(i + 1))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
430 name, sequence = c.fetchone()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
431 yield Sequence(sequence, name)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
432
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
433 def sample2db(self, db_file=None, fasta_file_name=None, size=100, seed=123):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
434 if (not db_file) and (not fasta_file_name):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
435 raise IOError("no db_file nor fasta_file_name were defined")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
436 # open files
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
437 if db_file:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
438 db = SequenceSet(filename=db_file, source=None, new=True)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
439 if fasta_file_name:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
440 f = open(fasta_file_name, 'w')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
441 # write in files
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
442 for i in self.sample(size, seed):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
443 if db_file:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
444 db.insert(i, commit=False)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
445 if fasta_file_name:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
446 i.write2fasta(f)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
447 # close files
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
448 if fasta_file_name:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
449 f.close()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
450 if db_file:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
451 db.conn.commit()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
452 return db
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
453
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
454 def save2fasta(self, fasta_file_name, keep=True, subset=None):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
455 if subset:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
456 with open(fasta_file_name, 'w') as f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
457 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
458 c.execute("select name, sequence from sequences where name in ('{}')".format(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
459 "','".join(subset)))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
460 for name, sequence in c:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
461 s = Sequence(sequence, name)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
462 s.write2fasta(f)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
463 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
464 with open(fasta_file_name, 'w') as f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
465 for i in self.items():
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
466 i.write2fasta(f)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
467 if keep:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
468 self.fasta = fasta_file_name
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
469
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
470 def save_annotation(self, annotation_name, subset, dir):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
471 annotation_file = "{}/{}_annotation.csv".format(dir, annotation_name)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
472 with open(annotation_file, "w") as f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
473 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
474 c.execute(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
475 "select * from {} where name in ('{}')".format(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
476 annotation_name, "','".join(subset)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
477 )
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
478 )
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
479 header = [description[0] for description in c.description]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
480 f.write("\t".join(str(j) for j in header) + "\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
481 for i in c:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
482 f.write("\t".join(str(j) for j in i) + "\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
483 return annotation_file
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
484
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
485 def make_chunks(self, file_name=None, chunk_size=5000):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
486 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
487 split SequneceSet to chunks and save to files,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
488 return list of files names
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
489 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
490 logger.debug("creating chunks from fasta file: ")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
491 if not file_name and self.fasta:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
492 file_name = self.fasta
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
493 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
494 raise Exception('file name for chunks is not defined!')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
495
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
496 seqs = iter(self.items())
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
497 fn_chunk_names = []
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
498 for i in itertools.count():
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
499 try:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
500 fn = "{0}.{1}".format(file_name, i)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
501 logger.info("saving chunk {}".format(fn))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
502 for n in range(chunk_size):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
503 s = next(seqs)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
504 if not n:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
505 f = open(fn, 'w')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
506 s.write2fasta(f)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
507 f.close()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
508 fn_chunk_names.append(fn)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
509 except StopIteration:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
510 if not f.closed:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
511 f.close()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
512 fn_chunk_names.append(fn)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
513 break
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
514 self.chunks = fn_chunk_names
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
515 logger.debug(self.chunks)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
516 return(fn_chunk_names)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
517
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
518 def makeblastdb(self, blastdb=None, dbtype='nucl', legacy=False, lastdb = False):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
519 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
520 dbtype eithe 'nucl' of 'prot'
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
521 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
522 logger.info("creating blast database")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
523 # check if fasta file on disk is present
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
524 if not self.fasta:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
525 try:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
526 self.save2fasta(blastdb)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
527 except TypeError:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
528 print("\nEither blastdb or fasta must be ",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
529 "defined when making blast database!!!\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
530 raise
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
531 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
532 blastdb = blastdb if blastdb else self.fasta
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
533
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
534 cmd = "makeblastdb -in {0} -out {1} -dbtype {2}".format(self.fasta,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
535 blastdb,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
536 dbtype)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
537 args = shlex.split(cmd)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
538 if 0 == subprocess.check_call(args, stderr=sys.stdout):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
539 self.blastdb = blastdb
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
540 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
541 Warning("makeblastdb failed")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
542
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
543 if legacy:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
544 logger.info("creating legacy blast database")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
545 prot = {'nucl': 'F', 'prot': 'T'}[dbtype]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
546 cmd = "formatdb -i {0} -p {1} -n {2}.legacy".format(self.fasta,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
547 prot,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
548 blastdb
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
549 )
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
550 args = shlex.split(cmd)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
551 if subprocess.check_call(args) == 0:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
552 self.blastdb_legacy = "{}.legacy".format(blastdb)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
553 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
554 Warning("formatdb failed")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
555 if lastdb:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
556 logger.info("creating lastdb database")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
557 cmd = "lastdb -u NEAR {fasta} {fasta}".format(fasta=self.fasta)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
558 args = shlex.split(cmd)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
559 if subprocess.check_call(args) == 0:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
560 self.lastdb = self.fasta
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
561 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
562 Warning("formatdb failed")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
563
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
564
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
565 def create_hitsort(self, options, output=None):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
566 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
567 query is name of files to search against blastdb of SequenceSet object
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
568 query could be multiple files
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
569 this is inteded to be used for tabular output only
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
570 output file must be specified,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
571 if more than one query is used, multiple files are created
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
572 worker - function to execute blast search
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
573 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
574
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
575 logger.info('running all to all blast')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
576 query = self.chunks
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
577 if not output:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
578 output = "{}.blast".format(self.fasta)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
579 database = getattr(self, options.database)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
580 if len(query) > 1:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
581 output_parts = [output + "." + str(i) for i in range(len(query))]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
582 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
583 output_parts = [output]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
584 query = [query]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
585
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
586 parallel(_hitsort_worker, query, [database],
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
587 output_parts, [options])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
588 logger.info('all to all blast finished')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
589 logger.info('removing duplicates from all to all blast results')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
590 self._clean_hitsort(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
591 hitsort_files=output_parts, filtered_hitsort=output)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
592 # remove temp files:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
593 for f in output_parts:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
594 os.unlink(f)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
595 self.hitsort = output
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
596 return output
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
597
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
598 def _clean_hitsort(self, hitsort_files, filtered_hitsort):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
599 ''' alterantive hitsort duplicate removal '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
600
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
601 ids = self.keys()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
602 Nfiles = 5 + int(sum([os.path.getsize(i)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
603 for i in hitsort_files]) / 5e9)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
604 hitsort_parts = [
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
605 "{0}.{1}.parts".format(filtered_hitsort, i) for i in range(Nfiles)]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
606 # open files
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
607 fs = []
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
608 for i in hitsort_parts:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
609 fs.append(open(i, 'w'))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
610
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
611 ids_destination = {}
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
612 fs_iter = itertools.cycle(fs)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
613 for i in ids:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
614 #ids_destination[i] = fs_iter.next()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
615 ids_destination[i] = next(fs_iter)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
616 lines_out = lines_total = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
617 temp_open = False
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
618
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
619 for i in hitsort_files:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
620 f = open(i, 'r')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
621 while True:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
622 line = f.readline()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
623 if not line:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
624 break
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
625 lines_total += 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
626 line_items = line.split()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
627 # ids_destination[line_items[0]].write(line)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
628 # do not assume that it is sorted!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
629 ids_destination[min(line_items[0:2])].write(line)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
630 # close all files
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
631 for i in fs:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
632 i.close()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
633
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
634 # load one by one and exclude duplicates:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
635 hit_out = open(filtered_hitsort, 'w')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
636 for i in hitsort_parts:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
637 f = open(i, 'r')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
638 hitsort_shelve = {}
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
639 while True:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
640 line = f.readline()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
641 if not line:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
642 break
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
643 line_items = line.split()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
644 key = "\t".join(sorted(line_items[0:2]))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
645 value = line_items
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
646 bitscore = float(line_items[2])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
647 if key in hitsort_shelve:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
648 if float(hitsort_shelve[key][2]) > bitscore:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
649 hitsort_shelve[key] = value
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
650 hit_out.write("\t".join(hitsort_shelve[key]) + "\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
651 del hitsort_shelve[key]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
652 lines_out += 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
653 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
654 hitsort_shelve[key] = value
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
655 f.close()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
656 for key in hitsort_shelve:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
657 hit_out.write("\t".join(hitsort_shelve[key]) + "\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
658 lines_out += 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
659 # clean up
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
660 for i in hitsort_parts:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
661 os.unlink(i)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
662 hit_out.close()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
663 return lines_total, lines_out
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
664
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
665 def _check_database(self, database):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
666 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
667 database is path to fastafile, database with same prefix should exist
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
668 it creates database if does not exist as temporal file!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
669 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
670 pass
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
671 # TODO
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
672
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
673 def remove_sequences_using_filter(self, fasta_file_filter, keep_proportion,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
674 omitted_sequences_file,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
675 kept_sequences_file):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
676 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
677 Run blast with fasta_file_filter against legaci blastdb of SequenceSet to detect sequences
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
678 which are to be removed from SequencesSet. Complete pairsare removed if paired sequences
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
679 are used.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
680 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
681 ids = blast_with_filter(fasta_file_filter, self.blastdb_legacy)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
682 # check against database and remove sequences
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
683
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
684 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
685 if self.paired:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
686 c.execute("SELECT rowid FROM sequences WHERE name IN {}".format(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
687 format_query(ids)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
688 ))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
689 odd_rowids = set()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
690 for i in c.fetchall():
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
691 if i[0] % 2 == 0:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
692 odd_rowids.add(i[0] - 1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
693 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
694 odd_rowids.add(i[0])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
695 odd_rowids_keep = random.sample(odd_rowids, round(keep_proportion * len(odd_rowids)))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
696 c.execute("SELECT name FROM sequences WHERE rowid IN {} ORDER BY rowid".format(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
697 format_query(odd_rowids_keep + [i+1 for i in odd_rowids_keep])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
698 ))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
699 self.ids_kept = [i[0] for i in c.fetchall()]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
700 odd_rowids_omit = list(odd_rowids.difference(odd_rowids_keep))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
701 c.execute("SELECT name FROM sequences WHERE rowid IN {} ORDER BY rowid".format(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
702 format_query(odd_rowids_omit + [i+1 for i in odd_rowids_omit])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
703 ))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
704 ids_omit = [i[0] for i in c.fetchall()]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
705
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
706 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
707 self.ids_kept = random.sample(ids, round(keep_proportion * len(ids)))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
708 ids_omit = list(ids.difference(self.ids_kept))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
709 self.save2fasta(omitted_sequences_file, keep=False, subset=ids_omit)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
710 self.save2fasta(kept_sequences_file, keep=False, subset=self.ids_kept)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
711 self.drop_sequences(ids_omit)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
712 # save info about omited sequences
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
713 c.execute("CREATE TABLE ids_kept (name TEXT)")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
714 c.executemany("INSERT INTO ids_kept (name) values (?)", [(i,) for i in self.ids_kept])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
715 return len(ids_omit)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
716
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
717 def drop_sequences(self, ids_to_drop):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
718 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
719 remove sequences from sequence set based on the ID
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
720 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
721 self._original_length = len(self)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
722 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
723 idslist = format_query(ids_to_drop)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
724 c.execute("CREATE TABLE omited_sequences AS SELECT * FROM sequences WHERE name IN {}".format(idslist))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
725 c.execute("DELETE FROM sequences WHERE name IN {}".format(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
726 idslist
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
727 ))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
728 # new databases must be created - with ommited sequences!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
729 self.save2fasta(fasta_file_name=self.fasta) ## this will replace origninal file!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
730 self._length = len(self)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
731 self.makeblastdb(legacy=True)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
732 self._update_database()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
733
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
734 def annotate(self, database, annotation_name, directory="", params=""):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
735 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
736 database : path to blast formated database
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
737 method: type of search to use for annotation. it must be
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
738 'blastn' of 'blastx'
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
739 Annotation is save in directory also into the table with name stored in
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
740 annotation_name variable
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
741 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
742 logger.info("annotating reads with {} ".format(annotation_name))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
743 self._check_database(database)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
744 if "parallelize" in params:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
745 parallelize = params['parallelize']
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
746 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
747 parallelize = True
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
748 if parallelize:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
749 if not self.chunks:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
750 self.make_chunks()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
751 output = [
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
752 "{}/{}_{}".format(directory, annotation_name,os.path.basename(i)) for i in self.chunks]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
753 parallel(blast_worker, self.chunks, [database], output, [params])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
754 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
755 # no explicit parallelization using parallel
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
756 single_output = "{}/{}_hits".format(directory, annotation_name)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
757 params['args'] = params['args'].format(max_proc=get_max_proc())
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
758 blast_worker(self.fasta, database, single_output,params)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
759 output = [single_output]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
760
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
761 # put into as new attribute and sqlite table
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
762 c = self.conn.cursor()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
763 c.execute("create table {} (name text, db_id text, length real, bitscore real , pid real)".format(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
764 annotation_name))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
765 for blast_results in output:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
766 with open(blast_results, 'r') as f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
767 for i in f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
768 items = i.split()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
769 types = [str, str, float, float, float, float, float]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
770 qseqid, sseqid, qlen, slen, length, pident, bitscore = [
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
771 f(i) for i, f in zip(items, types)]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
772 c.execute('insert into {} values ("{}", "{}", "{}", "{}", "{}")'.format(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
773 annotation_name, qseqid, sseqid, length, bitscore, pident))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
774 self.conn.commit()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
775 self.annotations.append(annotation_name)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
776
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
777 @staticmethod
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
778 def fasta_length(fasta_file_name):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
779 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
780 count number of sequences,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
781 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
782 with open(fasta_file_name, 'r') as f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
783 N = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
784 for i in f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
785 if i.strip()[0] == ">":
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
786 N += 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
787 return N
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
788
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
789
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
790 # test
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
791 if __name__ == "__main__":
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
792 # get root directory
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
793
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
794 MAIN_DIR = os.path.realpath(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
795 os.path.dirname(os.path.realpath(__file__)) + "/..")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
796 print("MAIN_DIR:")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
797 print(MAIN_DIR)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
798 TMP_DIR = "{}/tmp".format(MAIN_DIR)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
799 TEST_DATA_DIR = "{}/test_data".format(MAIN_DIR)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
800
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
801 # some data:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
802 fn = "{}/pair_reads.fasta".format(TEST_DATA_DIR)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
803 os.chdir(TMP_DIR)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
804 # s = SequenceSet(filename='sqlite.db')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
805 print("number of sequences in fasta file:")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
806 print(SequenceSet.fasta_length(fn))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
807
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
808 print("loading sequences...")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
809 s = SequenceSet(source=fn, paired=True, rename=False,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
810 filename='sqlite.db', prefix_length=4, fasta='sample2.fasta')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
811 print(s)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
812
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
813 print("creating blast database")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
814 s.makeblastdb()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
815
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
816 print("creating chunks from fasta file: ")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
817 file_list = s.make_chunks("fasta_chunks", chunk_size=500)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
818 print(file_list)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
819
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
820 print('creating hitsort')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
821 s.create_hitsort(file_list, "hitsort")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
822
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
823 print("saving to fasta file")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
824 s.save2fasta('sampleX.fasta', keep=False)