Mercurial > repos > iuc > fasta_stats
comparison fasta-stats.py @ 4:60718157dd5f draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit 50f5cce5a8c11001e2c59600a2b99a4243b6d06f"
| author | iuc |
|---|---|
| date | Thu, 18 Nov 2021 20:56:27 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 3:5b072a9eaa9d | 4:60718157dd5f |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 | |
| 4 # python version of fasta-stats with some extra features | |
| 5 # written by anmol.kiran@gmail.com | |
| 6 # git: @codemeleon | |
| 7 # date: 10/11/2021 | |
| 8 | |
| 9 import argparse | |
| 10 import re | |
| 11 from os import path | |
| 12 | |
| 13 import numpy as np | |
| 14 from Bio import SeqIO | |
| 15 | |
| 16 | |
| 17 def calculate_NG50(estimated_genome, total_length, sequence_lengths): | |
| 18 temp = 0 | |
| 19 teoretical_NG50 = estimated_genome / 2.0 | |
| 20 NG50 = 0 | |
| 21 for seq in sequence_lengths: | |
| 22 temp += seq | |
| 23 if teoretical_NG50 < temp: | |
| 24 NG50 = seq | |
| 25 break | |
| 26 return NG50 | |
| 27 | |
| 28 | |
| 29 def run(fasta, stats_output, gaps_output, genome_size): | |
| 30 """Generates scaffold statistics.""" | |
| 31 if not fasta: | |
| 32 exit("Input file not given.") | |
| 33 if not path.isfile(fasta): | |
| 34 exit(f"{fasta} path does not exist.") | |
| 35 | |
| 36 seq_len = {} | |
| 37 bases_global = {"A": 0, "N": 0, "T": 0, "C": 0, "G": 0} | |
| 38 bases_seq = {} | |
| 39 seq_id_Ngaprange = {} | |
| 40 nstart = 0 | |
| 41 contigs_len = [] | |
| 42 gap_count = 0 | |
| 43 for seq_record in SeqIO.parse(fasta, "fasta"): | |
| 44 seq = str(seq_record.seq).upper() | |
| 45 # print(len(seq)) | |
| 46 seq_len[seq_record.id] = len(seq) | |
| 47 | |
| 48 # NOTE: Nucleotide count | |
| 49 bases_seq[seq_record.id] = { | |
| 50 "A": seq.count("A"), | |
| 51 "N": seq.count("N"), | |
| 52 "T": seq.count("T"), | |
| 53 "C": seq.count("C"), | |
| 54 "G": seq.count("G"), | |
| 55 } | |
| 56 bases_global["A"] += bases_seq[seq_record.id]["A"] | |
| 57 bases_global["N"] += bases_seq[seq_record.id]["N"] | |
| 58 bases_global["T"] += bases_seq[seq_record.id]["T"] | |
| 59 bases_global["C"] += bases_seq[seq_record.id]["C"] | |
| 60 bases_global["G"] += bases_seq[seq_record.id]["G"] | |
| 61 | |
| 62 # NOTE: Gap count and their range | |
| 63 range_gen = re.finditer("N+", seq) | |
| 64 n_range = [match.span() for match in range_gen] | |
| 65 for n_rng in n_range: | |
| 66 if n_rng[0] == 0 or n_rng[1] == seq_len[seq_record.id]: | |
| 67 continue | |
| 68 else: | |
| 69 gap_count += 1 | |
| 70 | |
| 71 # NOTE: Contigs, their lenths from scaffold and their N gap range | |
| 72 seq_id_Ngaprange[seq_record.id] = n_range | |
| 73 n_range_len = len(n_range) | |
| 74 if n_range_len > 0: | |
| 75 n_range = ( | |
| 76 [(0, 0)] + n_range + [(seq_len[seq_record.id], seq_len[seq_record.id])] | |
| 77 ) | |
| 78 for idx in range(n_range_len + 1): | |
| 79 nstart = n_range[idx][1] | |
| 80 nend = n_range[idx + 1][0] | |
| 81 con_len = nend - nstart | |
| 82 if con_len: | |
| 83 contigs_len.append(con_len) | |
| 84 else: | |
| 85 contigs_len.append(len(seq)) | |
| 86 | |
| 87 # NOTE: Scaffold statistics | |
| 88 SEQ_LEN_LIST = sorted(seq_len.values(), reverse=True) | |
| 89 scaffold_lens = np.array(SEQ_LEN_LIST) | |
| 90 scaffold_lens_sum = np.cumsum(scaffold_lens) | |
| 91 N50_len = scaffold_lens_sum[-1] * 0.5 | |
| 92 N50_idx = np.where(scaffold_lens_sum > N50_len)[0][0] | |
| 93 N90_len = scaffold_lens_sum[-1] * 0.9 | |
| 94 N90_idx = np.where(scaffold_lens_sum > N90_len)[0][0] | |
| 95 NG50 = calculate_NG50(genome_size, scaffold_lens_sum[-1], scaffold_lens) | |
| 96 | |
| 97 # NOTE: Contig statistics | |
| 98 seq_len_list = sorted(contigs_len, reverse=True) | |
| 99 contigs_len = np.array(seq_len_list) | |
| 100 contigs_len_sum = np.cumsum(contigs_len) | |
| 101 n50_len = contigs_len_sum[-1] * 0.5 | |
| 102 n50_idx = np.where(contigs_len_sum > n50_len)[0][0] | |
| 103 n90_len = contigs_len_sum[-1] * 0.9 | |
| 104 n90_idx = np.where(contigs_len_sum > n90_len)[0][0] | |
| 105 ng50 = calculate_NG50(genome_size, contigs_len_sum[-1], contigs_len) | |
| 106 | |
| 107 with open(stats_output, "w") as soutput: | |
| 108 soutput.write("{}\t{}\n".format("Scaffold L50", N50_idx + 1)) | |
| 109 soutput.write("{}\t{}\n".format("Scaffold N50", SEQ_LEN_LIST[N50_idx])) | |
| 110 soutput.write("{}\t{}\n".format("Scaffold L90", N90_idx + 1)) | |
| 111 soutput.write("{}\t{}\n".format("Scaffold N90", SEQ_LEN_LIST[N90_idx])) | |
| 112 if genome_size != 0: | |
| 113 soutput.write("{}\t{}\n".format("Scaffold NG50", NG50)) | |
| 114 soutput.write("{}\t{}\n".format("Scaffold len_max", SEQ_LEN_LIST[0])) | |
| 115 soutput.write("{}\t{}\n".format("Scaffold len_min", SEQ_LEN_LIST[-1])) | |
| 116 soutput.write( | |
| 117 "{}\t{}\n".format("Scaffold len_mean", int(np.mean(SEQ_LEN_LIST))) | |
| 118 ) | |
| 119 soutput.write( | |
| 120 "{}\t{}\n".format("Scaffold len_median", int(np.median(SEQ_LEN_LIST))) | |
| 121 ) | |
| 122 soutput.write("{}\t{}\n".format("Scaffold len_std", int(np.std(SEQ_LEN_LIST)))) | |
| 123 soutput.write("{}\t{}\n".format("Scaffold num_A", bases_global["A"])) | |
| 124 soutput.write("{}\t{}\n".format("Scaffold num_T", bases_global["T"])) | |
| 125 soutput.write("{}\t{}\n".format("Scaffold num_C", bases_global["C"])) | |
| 126 soutput.write("{}\t{}\n".format("Scaffold num_G", bases_global["G"])) | |
| 127 soutput.write("{}\t{}\n".format("Scaffold num_N", bases_global["N"])) | |
| 128 soutput.write("{}\t{}\n".format("Scaffold num_bp", scaffold_lens_sum[-1])) | |
| 129 soutput.write( | |
| 130 "{}\t{}\n".format( | |
| 131 "Scaffold num_bp_not_N", scaffold_lens_sum[-1] - bases_global["N"] | |
| 132 ) | |
| 133 ) | |
| 134 soutput.write("{}\t{}\n".format("Scaffold num_seq", len(SEQ_LEN_LIST))) | |
| 135 soutput.write( | |
| 136 "{}\t{:.2f}\n".format( | |
| 137 "Scaffold GC content overall", | |
| 138 ( | |
| 139 (bases_global["G"] + bases_global["C"]) | |
| 140 * 100.0 | |
| 141 / scaffold_lens_sum[-1] | |
| 142 ), | |
| 143 ) | |
| 144 ) | |
| 145 | |
| 146 soutput.write("{}\t{}\n".format("Contig L50", n50_idx + 1)) | |
| 147 soutput.write("{}\t{}\n".format("Contig N50", seq_len_list[n50_idx])) | |
| 148 soutput.write("{}\t{}\n".format("Contig L90", n90_idx + 1)) | |
| 149 soutput.write("{}\t{}\n".format("Contig N90", seq_len_list[n90_idx])) | |
| 150 if genome_size != 0: | |
| 151 soutput.write("{}\t{}\n".format("Contig NG50", ng50)) | |
| 152 soutput.write("{}\t{}\n".format("Contig len_max", seq_len_list[0])) | |
| 153 soutput.write("{}\t{}\n".format("Contig len_min", seq_len_list[-1])) | |
| 154 soutput.write("{}\t{}\n".format("Contig len_mean", int(np.mean(seq_len_list)))) | |
| 155 soutput.write( | |
| 156 "{}\t{}\n".format("Contig len_median", int(np.median(seq_len_list))) | |
| 157 ) | |
| 158 soutput.write("{}\t{}\n".format("Contig len_std", int(np.std(seq_len_list)))) | |
| 159 soutput.write("{}\t{}\n".format("Contig num_bp", contigs_len_sum[-1])) | |
| 160 soutput.write("{}\t{}\n".format("Contig num_seq", len(contigs_len_sum))) | |
| 161 soutput.write("{}\t{}\n".format("Number of gaps", gap_count)) | |
| 162 if gaps_output is not None: | |
| 163 # NOTE: generate gaps statistics file | |
| 164 with open(gaps_output, "w") as goutput: | |
| 165 for key in seq_id_Ngaprange: | |
| 166 for rng in seq_id_Ngaprange[key]: | |
| 167 goutput.write("{}\t{}\t{}\n".format(key, rng[0], rng[1])) | |
| 168 | |
| 169 | |
| 170 if __name__ == "__main__": | |
| 171 parser = argparse.ArgumentParser() | |
| 172 parser.add_argument("-f", "--fasta", required=True, help="FASTA file") | |
| 173 parser.add_argument( | |
| 174 "-z", | |
| 175 "--genome_size", | |
| 176 required=False, | |
| 177 type=int, | |
| 178 help="If provided, the NG50 statistic will be computed", | |
| 179 default=0, | |
| 180 ) | |
| 181 parser.add_argument( | |
| 182 "-s", | |
| 183 "--stats_output", | |
| 184 required=True, | |
| 185 help="File to store the general statistics", | |
| 186 ) | |
| 187 parser.add_argument( | |
| 188 "-r", | |
| 189 "--gaps_output", | |
| 190 required=False, | |
| 191 help="File to store the gaps statistics", | |
| 192 default=None, | |
| 193 ) | |
| 194 args = parser.parse_args() | |
| 195 | |
| 196 run( | |
| 197 args.fasta, | |
| 198 args.stats_output, | |
| 199 args.gaps_output, | |
| 200 args.genome_size, | |
| 201 ) |
