# HG changeset patch
# User rnateam
# Date 1453801107 18000
# Node ID 0b9aab6aaebfcddd4b5d870af2594c84660b4fab
# Parent 303f6402a035645cf84611fbc6f9ea5097490de0
Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
diff -r 303f6402a035 -r 0b9aab6aaebf convert_bc_to_binary_RY.py
--- a/convert_bc_to_binary_RY.py Sat Dec 19 06:16:22 2015 -0500
+++ b/convert_bc_to_binary_RY.py Tue Jan 26 04:38:27 2016 -0500
@@ -1,5 +1,13 @@
#!/usr/bin/env python
+import argparse
+import logging
+from string import maketrans
+from sys import stdout
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+
tool_description = """
Convert standard nucleotides to IUPAC nucleotide codes used for binary barcodes.
@@ -19,19 +27,6 @@
Status: Testing
"""
-import argparse
-import logging
-from string import maketrans
-from sys import stdout
-from Bio import SeqIO
-from Bio.Seq import Seq
-from Bio.Alphabet import IUPAC
-
-# # avoid ugly python IOError when stdout output is piped into another program
-# # and then truncated (such as piping to head)
-# from signal import signal, SIGPIPE, SIG_DFL
-# signal(SIGPIPE, SIG_DFL)
-
# parse command line arguments
parser = argparse.ArgumentParser(description=tool_description,
epilog=epilog,
diff -r 303f6402a035 -r 0b9aab6aaebf convert_bc_to_binary_RY.xml
--- a/convert_bc_to_binary_RY.xml Sat Dec 19 06:16:22 2015 -0500
+++ b/convert_bc_to_binary_RY.xml Tue Jan 26 04:38:27 2016 -0500
@@ -5,7 +5,7 @@
- python convert_bc_to_binary_RY.py --version
+ python $__tool_directory__/convert_bc_to_binary_RY.py --version
- python coords2clnt.py --version
+ python $__tool_directory__/coords2clnt.py --version
- python extract_aln_ends.py --version
+ python $__tool_directory__/extract_aln_ends.py --version
- python extract_bcs.py --version
+ python $__tool_directory__/extract_bcs.py --version
" + tmpdir + "/bclib.csv"
+ check_call(syscall1, shell=True)
+
+ # prepare alignments
+ syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort -k4,4 > " + tmpdir + "/alns.csv"
+ check_call(syscall2, shell=True)
-# keep id parts up to first whitespace
-alns["read_id"] = alns["read_id"].str.split(' ').str.get(0)
+ # join barcode library and alignments
+ syscall3 = "join -1 1 -2 4 " + tmpdir + "/bclib.csv " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}{print $3,$4,$5,$2,$6,$7}' > " + tmpdir + "/bcalib.csv"
+ check_call(syscall3, shell=True)
-# combine barcode library and alignments
-bcalib = pd.merge(
- bcs, alns,
- on="read_id",
- how="inner",
- sort=False)
+ # get alignments combined with barcodes
+ bcalib = pd.read_csv(
+ tmpdir + "/bcalib.csv",
+ sep="\t",
+ names=["chrom", "start", "stop", "bc", "score", "strand"])
+finally:
+ logging.debug("removed tmpdir: " + tmpdir)
+ rmtree(tmpdir)
+
+# fail if alignments given but combined library is empty
if bcalib.empty:
raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.")
-n_alns = len(alns.index)
+
+# warn if not all alignments could be assigned a barcode
n_bcalib = len(bcalib.index)
if n_bcalib < n_alns:
logging.warning(
- "{} of {} alignments could not be associated with a random barcode.".format(
- n_alns - n_bcalib, n_alns))
+ "{} of {} alignments could not be associated with a random barcode.".format(n_alns - n_bcalib, n_alns))
# remove entries with barcodes that has uncalled base N
bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index)
n_bcalib_cleaned = len(bcalib_cleaned)
-if n_bcalib_cleaned < n_bcalib:
- msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
- n_bcalib - n_bcalib_cleaned, n_bcalib)
- if n_bcalib_cleaned < (0.8 * n_bcalib):
- logging.warning(msg)
- else:
- logging.info(msg)
+# if n_bcalib_cleaned < n_bcalib:
+# msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
+# n_bcalib - n_bcalib_cleaned, n_bcalib)
+# if n_bcalib_cleaned < (0.8 * n_bcalib):
+# logging.warning(msg)
+# else:
+# logging.info(msg)
# count and merge pcr duplicates
# grouping sorts by keys, so the output will be properly sorted
diff -r 303f6402a035 -r 0b9aab6aaebf merge_pcr_duplicates.xml
--- a/merge_pcr_duplicates.xml Sat Dec 19 06:16:22 2015 -0500
+++ b/merge_pcr_duplicates.xml Tue Jan 26 04:38:27 2016 -0500
@@ -1,11 +1,15 @@
-
+
according to random barcode library.
macros.xml
- python merge_pcr_duplicates.py --version
+
+ gnu_awk
+ gnu_coreutils
+
+ python $__tool_directory__/merge_pcr_duplicates.py --version
$default]]>
-
+
diff -r 303f6402a035 -r 0b9aab6aaebf remove_tail.py
--- a/remove_tail.py Sat Dec 19 06:16:22 2015 -0500
+++ b/remove_tail.py Tue Jan 26 04:38:27 2016 -0500
@@ -1,5 +1,14 @@
#!/usr/bin/env python
+import argparse
+import logging
+from sys import stdout
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
tool_description = """
Remove a certain number of nucleotides from the 3'-tails of sequences in fastq
format.
@@ -18,16 +27,6 @@
Status: Testing
"""
-import argparse
-import logging
-from sys import stdout
-from Bio.SeqIO.QualityIO import FastqGeneralIterator
-
-# avoid ugly python IOError when stdout output is piped into another program
-# and then truncated (such as piping to head)
-from signal import signal, SIGPIPE, SIG_DFL
-signal(SIGPIPE, SIG_DFL)
-
# parse command line arguments
parser = argparse.ArgumentParser(description=tool_description,
epilog=epilog,
diff -r 303f6402a035 -r 0b9aab6aaebf remove_tail.xml
--- a/remove_tail.xml Sat Dec 19 06:16:22 2015 -0500
+++ b/remove_tail.xml Tue Jan 26 04:38:27 2016 -0500
@@ -5,7 +5,7 @@
- python remove_tail.py --version
+ python $__tool_directory__/remove_tail.py --version
\$frac_max,
+ "help" => \$help,
+ "man" => \$man,
+ "debug" => \$debug);
+pod2usage(-exitstatus => 1, -verbose => 1) if $help;
+pod2usage(-exitstatus => 0, -verbose => 2) if $man;
+($result) or pod2usage(2);
+
+###############################################################################
+# main
+###############################################################################
+my $current_chr = '';
+my $current_start = -1;
+my $current_stop = -1;
+my $current_strand = '';
+my $current_max = -1;
+my $current_threshold = -1;
+
+while (<>) {
+ my ($chr, $start, $stop, $id, $count, $strand) = split("\t");
+ # my ($count, undef, $start, $chr, $strand, $stop) = split("\t");
+
+ if ($current_start != $start or
+ $current_stop != $stop or
+ $current_chr ne $chr or
+ $current_strand ne $strand) {
+ # if this is the first occurrence of these coordinates
+ # this must be the new maximum
+ # save new state
+ $current_start = $start;
+ $current_stop = $stop;
+ $current_chr = $chr;
+ $current_strand = $strand;
+ # print record and record maximum
+ $current_max = $count;
+ $current_threshold = $count*$frac_max;
+ print $_;
+ $debug and say "new threshold ${current_threshold} @ $chr $start $stop $strand $count";
+ } elsif ($count >= $current_threshold) {
+ # if it is not the first occurrence, evaluate threshold and print if valid
+ print $_;
+ } else {
+ $debug and say "below threshold @ $chr $start $stop $strand $count";
+ }
+}
diff -r 303f6402a035 -r 0b9aab6aaebf rm_spurious_events.py
--- a/rm_spurious_events.py Sat Dec 19 06:16:22 2015 -0500
+++ b/rm_spurious_events.py Tue Jan 26 04:38:27 2016 -0500
@@ -1,5 +1,10 @@
#!/usr/bin/env python
+import argparse
+import logging
+from subprocess import check_call
+import os
+
tool_description = """
Remove spurious events originating from errors in random sequence tags.
@@ -7,8 +12,6 @@
of events the maximum number of PCR duplicates is determined. All events that
are supported by less than 10 percent of this maximum count are removed.
-By default output is written to stdout.
-
Input:
* bed6 file containing crosslinking events with score field set to number of PCR
duplicates
@@ -19,7 +22,7 @@
Example usage:
- remove spurious events from spurious.bed and write results to file cleaned.bed
-rm_spurious_events.py spurious.bed --out cleaned.bed
+rm_spurious_events.py spurious.bed --outfile cleaned.bed
"""
epilog = """
@@ -30,89 +33,74 @@
Status: Testing
"""
-import argparse
-import logging
-from sys import stdout
-import pandas as pd
-
class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
argparse.RawDescriptionHelpFormatter):
# To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
pass
-# avoid ugly python IOError when stdout output is piped into another program
-# and then truncated (such as piping to head)
-from signal import signal, SIGPIPE, SIG_DFL
-signal(SIGPIPE, SIG_DFL)
-
-# parse command line arguments
-parser = argparse.ArgumentParser(description=tool_description,
- epilog=epilog,
- formatter_class=DefaultsRawDescriptionHelpFormatter)
-# positional arguments
-parser.add_argument(
- "events",
- help="Path to bed6 file containing alignments.")
-# optional arguments
-parser.add_argument(
- "-o", "--outfile",
- help="Write results to this file.")
-parser.add_argument(
- "-t", "--threshold",
- type=float,
- default=0.1,
- help="Threshold for spurious event removal."
-)
-# misc arguments
-parser.add_argument(
- "-v", "--verbose",
- help="Be verbose.",
- action="store_true")
-parser.add_argument(
- "-d", "--debug",
- help="Print lots of debugging information",
- action="store_true")
-parser.add_argument(
- '--version',
- action='version',
- version='0.1.0')
-args = parser.parse_args()
-
-if args.debug:
- logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
-elif args.verbose:
- logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
-else:
- logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
-logging.info("Parsed arguments:")
-logging.info(" alignments: '{}'".format(args.events))
-logging.info(" threshold: '{}'".format(args.threshold))
-if args.outfile:
- logging.info(" outfile: enabled writing to file")
- logging.info(" outfile: '{}'".format(args.outfile))
-logging.info("")
+def main():
+ # parse command line arguments
+ parser = argparse.ArgumentParser(description=tool_description,
+ epilog=epilog,
+ formatter_class=DefaultsRawDescriptionHelpFormatter)
+ # positional arguments
+ parser.add_argument(
+ "events",
+ help="Path to bed6 file containing alignments.")
+ # optional arguments
+ parser.add_argument(
+ "-o", "--outfile",
+ required=True,
+ help="Write results to this file.")
+ parser.add_argument(
+ "-t", "--threshold",
+ type=float,
+ default=0.1,
+ help="Threshold for spurious event removal."
+ )
+ # misc arguments
+ parser.add_argument(
+ "-v", "--verbose",
+ help="Be verbose.",
+ action="store_true")
+ parser.add_argument(
+ "-d", "--debug",
+ help="Print lots of debugging information",
+ action="store_true")
+ parser.add_argument(
+ '--version',
+ action='version',
+ version='0.1.0')
-# check threshold parameter value
-if args.threshold < 0 or args.threshold > 1:
- raise ValueError("Threshold must be in [0,1].")
-
-# load alignments
-alns = pd.read_csv(
- args.events,
- sep="\t",
- names=["chrom", "start", "stop", "read_id", "score", "strand"])
+ args = parser.parse_args()
-# remove all alignments that not enough PCR duplicates with respect to
-# the group maximum
-grouped = alns.groupby(['chrom', 'start', 'stop', 'strand'], group_keys=False)
-alns_cleaned = grouped.apply(lambda g: g[g["score"] >= args.threshold * g["score"].max()])
+ if args.debug:
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+ elif args.verbose:
+ logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+ else:
+ logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+ logging.info("Parsed arguments:")
+ logging.info(" alignments: '{}'".format(args.events))
+ logging.info(" threshold: '{}'".format(args.threshold))
+ if args.outfile:
+ logging.info(" outfile: enabled writing to file")
+ logging.info(" outfile: '{}'".format(args.outfile))
+ logging.info("")
-# write coordinates of crosslinking event alignments
-alns_cleaned_out = (open(args.outfile, "w") if args.outfile is not None else stdout)
-alns_cleaned.to_csv(
- alns_cleaned_out,
- columns=['chrom', 'start', 'stop', 'read_id', 'score', 'strand'],
- sep="\t", index=False, header=False)
-alns_cleaned_out.close()
+ # check threshold parameter value
+ if args.threshold < 0 or args.threshold > 1:
+ raise ValueError("Threshold must be in [0,1].")
+
+ if not os.path.isfile(args.events):
+ raise Exception("ERROR: file '{}' not found.")
+
+ # prepare barcode library
+ syscall = "cat " + args.events + " | sort -k1,1V -k6,6 -k2,2n -k3,3 -k5,5nr | perl " + os.path.dirname(os.path.realpath(__file__)) + "/rm_spurious_events.pl --frac_max " + str(args.threshold) + "| sort -k1,1V -k2,2n -k3,3n -k6,6 -k4,4 -k5,5nr > " + args.outfile
+ check_call(syscall, shell=True)
+
+
+if __name__ == "__main__":
+ main()
diff -r 303f6402a035 -r 0b9aab6aaebf rm_spurious_events.xml
--- a/rm_spurious_events.xml Sat Dec 19 06:16:22 2015 -0500
+++ b/rm_spurious_events.xml Tue Jan 26 04:38:27 2016 -0500
@@ -5,7 +5,11 @@
- python rm_spurious_events.py --version
+
+ perl
+ gnu_coreutils
+
+ python $__tool_directory__/rm_spurious_events.py --version
$default]]>
+--outfile $default]]>
diff -r 303f6402a035 -r 0b9aab6aaebf tool_dependencies.xml
--- a/tool_dependencies.xml Sat Dec 19 06:16:22 2015 -0500
+++ b/tool_dependencies.xml Tue Jan 26 04:38:27 2016 -0500
@@ -7,18 +7,27 @@
-->
-
+
-
+
-
+
+
+
+
+
+
+
+
+
+