annotate lib/tarean/kmer_counting.py @ 0:f6ebec6e235e draft

Uploaded
author petrn
date Thu, 19 Dec 2019 13:46:43 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1 #!/usr/bin/env python3
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
2
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
3 import logging
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
4 logger = logging.getLogger(__name__)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
5 import itertools
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
6 import os
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
7 import sys
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
8 import random
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
9 import sqlite3
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
10 import subprocess
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
11 import shlex # for command line arguments split
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
12 import operator
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
13
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
14 REQUIRED_VERSION = (3, 4)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
15 MAX_PRINT = 10
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
16 MEGABLAST = "-task megablast"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
17 HITSORT = "-task megablast"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
18
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
19 if sys.version_info < REQUIRED_VERSION:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
20 raise Exception("\n\npython 3.4 or higher is required!\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
21
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
22 # additional functions
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
23
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
24
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
25 class Sequence:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
26
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
27 def __init__(self, seq, name="", paired=False):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
28 # the mode os seq storage can be changed later to make it more
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
29 # memory efficient
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
30 self._seq = bytes(str(seq), "ascii")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
31 self.name = str(name)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
32
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
33 @property
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
34 def seq(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
35 return self._seq.decode("utf-8")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
36
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
37 @seq.setter
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
38 def seq(self, value):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
39 self._seq = bytes(str(value), "ascii")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
40
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
41 def __str__(self):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
42 return "{0} : {1}".format(self.name, self.seq)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
43
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
44 @staticmethod
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
45 def read_fasta(fasta_file_name):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
46 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
47 generator - reads sequences from fasta file
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
48 return sequence one by one
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
49 '''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
50 with open(fasta_file_name, 'r') as f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
51 header = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
52 seqstr = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
53 for rawline in f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
54 line = rawline.strip()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
55 if line == "":
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
56 continue
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
57 if line[0] == ">":
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
58 if header and seqstr:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
59 yield Sequence(seqstr, header)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
60 # reset
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
61 seqstr = None
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
62 header = line[1:]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
63 elif seqstr:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
64 Warning("sequence was not preceeded by header")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
65 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
66 header = line[1:]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
67 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
68 seqstr = line if not seqstr else seqstr + line
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
69 # skip empty lines:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
70 if header and seqstr:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
71 yield Sequence(seqstr, header)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
72 return
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
73
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
74 def write2fasta(self, file_object):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
75 file_object.write(">{0}\n{1}\n".format(self.name, self.seq))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
76
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
77
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
78 def get_kmers(string, width=11):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
79 L = len(string)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
80 parts = [string[i:i + width] for i in range(L - width + 0)]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
81 return parts
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
82
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
83
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
84 def count_kmers_from_file(f, width=11):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
85 counts = {}
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
86 for i in Sequence.read_fasta(f):
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
87 a = get_kmers(i.seq, width)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
88 for km in a:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
89 if "N" in km:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
90 continue
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
91 if km in counts:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
92 counts[km] += 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
93 else:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
94 counts[km] = 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
95 sorted_counts = sorted(counts.items(),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
96 key=operator.itemgetter(1),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
97 reverse=True)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
98 return sorted_counts
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
99
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
100
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
101 if __name__ == "__main__":
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
102 L = len(sys.argv) - 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
103 kmer_length = int(sys.argv[-1])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
104 files = sys.argv[1:-1]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
105 for fin in files:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
106 counts = count_kmers_from_file(fin, kmer_length)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
107 fout = "{}_{}.kmers".format(fin, kmer_length)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
108 with open(fout, "w") as f:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
109 for i in counts:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
110 f.write("{}\t{}\n".format(*i))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
111 print(fout)