Mercurial > repos > petrn > repeatexplorer
comparison lib/tarean/kmer_counting.py @ 0:f6ebec6e235e draft
Uploaded
| author | petrn |
|---|---|
| date | Thu, 19 Dec 2019 13:46:43 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f6ebec6e235e |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 import logging | |
| 4 logger = logging.getLogger(__name__) | |
| 5 import itertools | |
| 6 import os | |
| 7 import sys | |
| 8 import random | |
| 9 import sqlite3 | |
| 10 import subprocess | |
| 11 import shlex # for command line arguments split | |
| 12 import operator | |
| 13 | |
| 14 REQUIRED_VERSION = (3, 4) | |
| 15 MAX_PRINT = 10 | |
| 16 MEGABLAST = "-task megablast" | |
| 17 HITSORT = "-task megablast" | |
| 18 | |
| 19 if sys.version_info < REQUIRED_VERSION: | |
| 20 raise Exception("\n\npython 3.4 or higher is required!\n") | |
| 21 | |
| 22 # additional functions | |
| 23 | |
| 24 | |
| 25 class Sequence: | |
| 26 | |
| 27 def __init__(self, seq, name="", paired=False): | |
| 28 # the mode os seq storage can be changed later to make it more | |
| 29 # memory efficient | |
| 30 self._seq = bytes(str(seq), "ascii") | |
| 31 self.name = str(name) | |
| 32 | |
| 33 @property | |
| 34 def seq(self): | |
| 35 return self._seq.decode("utf-8") | |
| 36 | |
| 37 @seq.setter | |
| 38 def seq(self, value): | |
| 39 self._seq = bytes(str(value), "ascii") | |
| 40 | |
| 41 def __str__(self): | |
| 42 return "{0} : {1}".format(self.name, self.seq) | |
| 43 | |
| 44 @staticmethod | |
| 45 def read_fasta(fasta_file_name): | |
| 46 ''' | |
| 47 generator - reads sequences from fasta file | |
| 48 return sequence one by one | |
| 49 ''' | |
| 50 with open(fasta_file_name, 'r') as f: | |
| 51 header = None | |
| 52 seqstr = None | |
| 53 for rawline in f: | |
| 54 line = rawline.strip() | |
| 55 if line == "": | |
| 56 continue | |
| 57 if line[0] == ">": | |
| 58 if header and seqstr: | |
| 59 yield Sequence(seqstr, header) | |
| 60 # reset | |
| 61 seqstr = None | |
| 62 header = line[1:] | |
| 63 elif seqstr: | |
| 64 Warning("sequence was not preceeded by header") | |
| 65 else: | |
| 66 header = line[1:] | |
| 67 else: | |
| 68 seqstr = line if not seqstr else seqstr + line | |
| 69 # skip empty lines: | |
| 70 if header and seqstr: | |
| 71 yield Sequence(seqstr, header) | |
| 72 return | |
| 73 | |
| 74 def write2fasta(self, file_object): | |
| 75 file_object.write(">{0}\n{1}\n".format(self.name, self.seq)) | |
| 76 | |
| 77 | |
| 78 def get_kmers(string, width=11): | |
| 79 L = len(string) | |
| 80 parts = [string[i:i + width] for i in range(L - width + 0)] | |
| 81 return parts | |
| 82 | |
| 83 | |
| 84 def count_kmers_from_file(f, width=11): | |
| 85 counts = {} | |
| 86 for i in Sequence.read_fasta(f): | |
| 87 a = get_kmers(i.seq, width) | |
| 88 for km in a: | |
| 89 if "N" in km: | |
| 90 continue | |
| 91 if km in counts: | |
| 92 counts[km] += 1 | |
| 93 else: | |
| 94 counts[km] = 1 | |
| 95 sorted_counts = sorted(counts.items(), | |
| 96 key=operator.itemgetter(1), | |
| 97 reverse=True) | |
| 98 return sorted_counts | |
| 99 | |
| 100 | |
| 101 if __name__ == "__main__": | |
| 102 L = len(sys.argv) - 1 | |
| 103 kmer_length = int(sys.argv[-1]) | |
| 104 files = sys.argv[1:-1] | |
| 105 for fin in files: | |
| 106 counts = count_kmers_from_file(fin, kmer_length) | |
| 107 fout = "{}_{}.kmers".format(fin, kmer_length) | |
| 108 with open(fout, "w") as f: | |
| 109 for i in counts: | |
| 110 f.write("{}\t{}\n".format(*i)) | |
| 111 print(fout) |
