annotate tools/seq_length/seq_length.py @ 0:61f295b4db89 draft

Initial release v0.0.1
author peterjc
date Tue, 08 May 2018 09:34:49 -0400
parents
children cc5697699212
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
1 #!/usr/bin/env python
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
2 """Compute length of FASTA, QUAL, FASTQ or SSF sequences.
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
3
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
4 Takes three command line options: input sequence filename, input type
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
5 (e.g. FASTA or SFF) and the output filename (tabular).
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
6
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
7 This tool is a short Python script which requires Biopython 1.54 or later
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
8 for SFF file support. If you use this tool in scientific work leading to a
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
9 publication, please cite the Biopython application note:
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
10
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
11 Cock et al 2009. Biopython: freely available Python tools for computational
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
14
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
15 This script is copyright 2018 by Peter Cock, The James Hutton Institute UK.
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
16 All rights reserved. See accompanying text file for licence details (MIT
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
17 license).
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
18 """
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
19
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
20 from __future__ import print_function
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
21
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
22 import sys
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
23
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
24 if "-v" in sys.argv or "--version" in sys.argv:
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
25 print("v0.0.1")
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
26 sys.exit(0)
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
27
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
28 try:
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
29 from Bio import SeqIO
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
30 except ImportError:
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
31 sys.exit("Missing required Python library Biopython.")
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
32
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
33
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
34 # Parse Command Line
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
35 try:
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
36 in_file, seq_format, out_file = sys.argv[1:]
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
37 except ValueError:
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
38 sys.exit("Expected three arguments (input file, format, output file), "
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
39 "got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
40
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
41
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
42 if seq_format.startswith("fastq"):
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
43 # We don't care about the quality score encoding, just
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
44 # need to translate Galaxy format name into something
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
45 # Biopython will accept:
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
46 format = "fastq"
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
47 elif seq_format.lower() == "csfasta":
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
48 # I have not tested with colour space FASTA
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
49 format = "fasta"
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
50 elif seq_format.lower == "sff":
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
51 # The masked/trimmed numbers are more interesting
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
52 format = "sff-trim"
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
53 elif seq_format.lower() in ["fasta", "qual"]:
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
54 format = seq_format.lower()
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
55 else:
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
56 # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
57 sys.exit("Unexpected format argument: %r" % seq_format)
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
58
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
59
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
60 count = 0
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
61 total = 0
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
62 with open(out_file, "w") as out_handle:
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
63 out_handle.write("#Identifier\tLength\n")
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
64 for record in SeqIO.parse(in_file, format):
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
65 count += 1
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
66 length = len(record)
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
67 total += length
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
68 out_handle.write("%s\t%i\n" % (record.id, length))
61f295b4db89 Initial release v0.0.1
peterjc
parents:
diff changeset
69 print("%i sequences, total length %i" % (count, total))