diff rm_spurious_events.py @ 50:0b9aab6aaebf draft

Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
author rnateam
date Tue, 26 Jan 2016 04:38:27 -0500
parents de4ea3aa1090
children
line wrap: on
line diff
--- a/rm_spurious_events.py	Sat Dec 19 06:16:22 2015 -0500
+++ b/rm_spurious_events.py	Tue Jan 26 04:38:27 2016 -0500
@@ -1,5 +1,10 @@
 #!/usr/bin/env python
 
+import argparse
+import logging
+from subprocess import check_call
+import os
+
 tool_description = """
 Remove spurious events originating from errors in random sequence tags.
 
@@ -7,8 +12,6 @@
 of events the maximum number of PCR duplicates is determined. All events that
 are supported by less than 10 percent of this maximum count are removed.
 
-By default output is written to stdout.
-
 Input:
 * bed6 file containing crosslinking events with score field set to number of PCR
   duplicates
@@ -19,7 +22,7 @@
 
 Example usage:
 - remove spurious events from spurious.bed and write results to file cleaned.bed
-rm_spurious_events.py spurious.bed --out cleaned.bed
+rm_spurious_events.py spurious.bed --oufile cleaned.bed
 """
 
 epilog = """
@@ -30,89 +33,74 @@
 Status: Testing
 """
 
-import argparse
-import logging
-from sys import stdout
-import pandas as pd
-
 
 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
                                           argparse.RawDescriptionHelpFormatter):
     # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
     pass
 
-# avoid ugly python IOError when stdout output is piped into another program
-# and then truncated (such as piping to head)
-from signal import signal, SIGPIPE, SIG_DFL
-signal(SIGPIPE, SIG_DFL)
-
-# parse command line arguments
-parser = argparse.ArgumentParser(description=tool_description,
-                                 epilog=epilog,
-                                 formatter_class=DefaultsRawDescriptionHelpFormatter)
-# positional arguments
-parser.add_argument(
-    "events",
-    help="Path to bed6 file containing alignments.")
-# optional arguments
-parser.add_argument(
-    "-o", "--outfile",
-    help="Write results to this file.")
-parser.add_argument(
-    "-t", "--threshold",
-    type=float,
-    default=0.1,
-    help="Threshold for spurious event removal."
-)
-# misc arguments
-parser.add_argument(
-    "-v", "--verbose",
-    help="Be verbose.",
-    action="store_true")
-parser.add_argument(
-    "-d", "--debug",
-    help="Print lots of debugging information",
-    action="store_true")
-parser.add_argument(
-    '--version',
-    action='version',
-    version='0.1.0')
 
-args = parser.parse_args()
-
-if args.debug:
-    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
-elif args.verbose:
-    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
-else:
-    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
-logging.info("Parsed arguments:")
-logging.info("  alignments: '{}'".format(args.events))
-logging.info("  threshold: '{}'".format(args.threshold))
-if args.outfile:
-    logging.info("  outfile: enabled writing to file")
-    logging.info("  outfile: '{}'".format(args.outfile))
-logging.info("")
+def main():
+    # parse command line arguments
+    parser = argparse.ArgumentParser(description=tool_description,
+                                     epilog=epilog,
+                                     formatter_class=DefaultsRawDescriptionHelpFormatter)
+    # positional arguments
+    parser.add_argument(
+        "events",
+        help="Path to bed6 file containing alignments.")
+    # optional arguments
+    parser.add_argument(
+        "-o", "--outfile",
+        required=True,
+        help="Write results to this file.")
+    parser.add_argument(
+        "-t", "--threshold",
+        type=float,
+        default=0.1,
+        help="Threshold for spurious event removal."
+    )
+    # misc arguments
+    parser.add_argument(
+        "-v", "--verbose",
+        help="Be verbose.",
+        action="store_true")
+    parser.add_argument(
+        "-d", "--debug",
+        help="Print lots of debugging information",
+        action="store_true")
+    parser.add_argument(
+        '--version',
+        action='version',
+        version='0.1.0')
 
-# check threshold parameter value
-if args.threshold < 0 or args.threshold > 1:
-    raise ValueError("Threshold must be in [0,1].")
-
-# load alignments
-alns = pd.read_csv(
-    args.events,
-    sep="\t",
-    names=["chrom", "start", "stop", "read_id", "score", "strand"])
+    args = parser.parse_args()
 
-# remove all alignments that not enough PCR duplicates with respect to
-# the group maximum
-grouped = alns.groupby(['chrom', 'start', 'stop', 'strand'], group_keys=False)
-alns_cleaned = grouped.apply(lambda g: g[g["score"] >= args.threshold * g["score"].max()])
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+    elif args.verbose:
+        logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+    else:
+        logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+    logging.info("Parsed arguments:")
+    logging.info("  alignments: '{}'".format(args.events))
+    logging.info("  threshold: '{}'".format(args.threshold))
+    if args.outfile:
+        logging.info("  outfile: enabled writing to file")
+        logging.info("  outfile: '{}'".format(args.outfile))
+    logging.info("")
 
-# write coordinates of crosslinking event alignments
-alns_cleaned_out = (open(args.outfile, "w") if args.outfile is not None else stdout)
-alns_cleaned.to_csv(
-    alns_cleaned_out,
-    columns=['chrom', 'start', 'stop', 'read_id', 'score', 'strand'],
-    sep="\t", index=False, header=False)
-alns_cleaned_out.close()
+    # check threshold parameter value
+    if args.threshold < 0 or args.threshold > 1:
+        raise ValueError("Threshold must be in [0,1].")
+
+    if not os.path.isfile(args.events):
+        raise Exception("ERROR: file '{}' not found.")
+
+    # prepare barcode library
+    syscall = "cat " + args.events + " | sort -k1,1V -k6,6 -k2,2n -k3,3 -k5,5nr | perl " + os.path.dirname(os.path.realpath(__file__)) + "/rm_spurious_events.pl --frac_max " + str(args.threshold) + "| sort -k1,1V -k2,2n -k3,3n -k6,6 -k4,4 -k5,5nr > " + args.outfile
+    check_call(syscall, shell=True)
+
+
+if __name__ == "__main__":
+    main()