comparison remove_tail.py @ 2:de4ea3aa1090 draft

Uploaded
author rnateam
date Thu, 22 Oct 2015 10:26:45 -0400
parents
children 0b9aab6aaebf
comparison
equal deleted inserted replaced
1:ae0f58d3318f 2:de4ea3aa1090
1 #!/usr/bin/env python
2
3 tool_description = """
4 Remove a certain number of nucleotides from the 3'-tails of sequences in fastq
5 format.
6
7 Example usage:
8 - remove the last 7 nucleotides from file input.fastq, write result to file
9 output.fastq:
10 remove_tail.py input.fastq 7 --out output.fastq
11 """
12
13 epilog = """
14 Author: Daniel Maticzka
15 Copyright: 2015
16 License: Apache
17 Email: maticzkd@informatik.uni-freiburg.de
18 Status: Testing
19 """
20
21 import argparse
22 import logging
23 from sys import stdout
24 from Bio.SeqIO.QualityIO import FastqGeneralIterator
25
26 # avoid ugly python IOError when stdout output is piped into another program
27 # and then truncated (such as piping to head)
28 from signal import signal, SIGPIPE, SIG_DFL
29 signal(SIGPIPE, SIG_DFL)
30
31 # parse command line arguments
32 parser = argparse.ArgumentParser(description=tool_description,
33 epilog=epilog,
34 formatter_class=argparse.RawDescriptionHelpFormatter)
35 # positional arguments
36 parser.add_argument(
37 "infile",
38 help="Path to fastq file.")
39 parser.add_argument(
40 "length",
41 type=int,
42 help="Remove this many nts.")
43 # optional arguments
44 parser.add_argument(
45 "-o", "--outfile",
46 help="Write results to this file.")
47 parser.add_argument(
48 "-v", "--verbose",
49 help="Be verbose.",
50 action="store_true")
51 parser.add_argument(
52 "-d", "--debug",
53 help="Print lots of debugging information",
54 action="store_true")
55 parser.add_argument(
56 '--version',
57 action='version',
58 version='0.1.0')
59
60 args = parser.parse_args()
61 if args.debug:
62 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
63 elif args.verbose:
64 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
65 else:
66 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
67 logging.info("Parsed arguments:")
68 logging.info(" infile: '{}'".format(args.infile))
69 logging.info(" length: '{}'".format(args.length))
70 if args.outfile:
71 logging.info(" outfile: enabled writing to file")
72 logging.info(" outfile: '{}'".format(args.outfile))
73 logging.info("")
74
75 # check length parameter
76 if args.length < 0:
77 raise ValueError("Length must be a positive integer, is '{}'.".format(args.length))
78
79 # remove tail
80 with (open(args.outfile, "w") if args.outfile is not None else stdout) as samout:
81 for header, seq, qual in FastqGeneralIterator(open(args.infile)):
82
83 # if removing tail would lead to an empty sequence,
84 # set sequence to a single N to keep fastq synchronized
85 if len(seq) <= args.length:
86 seq = "N"
87 qual = "B"
88 logging.debug("read '{}' was too short to remove full tail".format(header))
89 logging.debug("seq: {}".format(seq))
90 logging.debug("len(seq): {}".format(len(seq)))
91 else:
92 seq = seq[0:-args.length]
93 qual = qual[0:-args.length]
94
95 samout.write("@%s\n%s\n+\n%s\n" % (header, seq, qual))