# HG changeset patch # User rnateam # Date 1453801107 18000 # Node ID 0b9aab6aaebfcddd4b5d870af2594c84660b4fab # Parent 303f6402a035645cf84611fbc6f9ea5097490de0 Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40 diff -r 303f6402a035 -r 0b9aab6aaebf convert_bc_to_binary_RY.py --- a/convert_bc_to_binary_RY.py Sat Dec 19 06:16:22 2015 -0500 +++ b/convert_bc_to_binary_RY.py Tue Jan 26 04:38:27 2016 -0500 @@ -1,5 +1,13 @@ #!/usr/bin/env python +import argparse +import logging +from string import maketrans +from sys import stdout +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.Alphabet import IUPAC + tool_description = """ Convert standard nucleotides to IUPAC nucleotide codes used for binary barcodes. @@ -19,19 +27,6 @@ Status: Testing """ -import argparse -import logging -from string import maketrans -from sys import stdout -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.Alphabet import IUPAC - -# # avoid ugly python IOError when stdout output is piped into another program -# # and then truncated (such as piping to head) -# from signal import signal, SIGPIPE, SIG_DFL -# signal(SIGPIPE, SIG_DFL) - # parse command line arguments parser = argparse.ArgumentParser(description=tool_description, epilog=epilog, diff -r 303f6402a035 -r 0b9aab6aaebf convert_bc_to_binary_RY.xml --- a/convert_bc_to_binary_RY.xml Sat Dec 19 06:16:22 2015 -0500 +++ b/convert_bc_to_binary_RY.xml Tue Jan 26 04:38:27 2016 -0500 @@ -5,7 +5,7 @@ - python convert_bc_to_binary_RY.py --version + python $__tool_directory__/convert_bc_to_binary_RY.py --version - python coords2clnt.py --version + python $__tool_directory__/coords2clnt.py --version - python extract_aln_ends.py --version + python $__tool_directory__/extract_aln_ends.py --version - python extract_bcs.py --version + python $__tool_directory__/extract_bcs.py --version " + tmpdir + "/bclib.csv" + check_call(syscall1, shell=True) + + # prepare alinments + syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort -k4,4 > " + tmpdir + "/alns.csv" + check_call(syscall2, shell=True) -# keep id parts up to first whitespace -alns["read_id"] = alns["read_id"].str.split(' ').str.get(0) + # join barcode library and alignments + syscall3 = "join -1 1 -2 4 " + tmpdir + "/bclib.csv " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}{print $3,$4,$5,$2,$6,$7}' > " + tmpdir + "/bcalib.csv" + check_call(syscall3, shell=True) -# combine barcode library and alignments -bcalib = pd.merge( - bcs, alns, - on="read_id", - how="inner", - sort=False) + # get alignments combined with barcodes + bcalib = pd.read_csv( + tmpdir + "/bcalib.csv", + sep="\t", + names=["chrom", "start", "stop", "bc", "score", "strand"]) +finally: + logging.debug("removed tmpdir: " + tmpdir) + rmtree(tmpdir) + +# fail if alignments given but combined library is empty if bcalib.empty: raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.") -n_alns = len(alns.index) + +# warn if not all alignments could be assigned a barcode n_bcalib = len(bcalib.index) if n_bcalib < n_alns: logging.warning( - "{} of {} alignments could not be associated with a random barcode.".format( - n_alns - n_bcalib, n_alns)) + "{} of {} alignments could not be associated with a random barcode.".format(n_alns - n_bcalib, n_alns)) # remove entries with barcodes that has uncalled base N bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index) n_bcalib_cleaned = len(bcalib_cleaned) -if n_bcalib_cleaned < n_bcalib: - msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format( - n_bcalib - n_bcalib_cleaned, n_bcalib) - if n_bcalib_cleaned < (0.8 * n_bcalib): - logging.warning(msg) - else: - logging.info(msg) +# if n_bcalib_cleaned < n_bcalib: +# msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format( +# n_bcalib - n_bcalib_cleaned, n_bcalib) +# if n_bcalib_cleaned < (0.8 * n_bcalib): +# logging.warning(msg) +# else: +# logging.info(msg) # count and merge pcr duplicates # grouping sorts by keys, so the ouput will be properly sorted diff -r 303f6402a035 -r 0b9aab6aaebf merge_pcr_duplicates.xml --- a/merge_pcr_duplicates.xml Sat Dec 19 06:16:22 2015 -0500 +++ b/merge_pcr_duplicates.xml Tue Jan 26 04:38:27 2016 -0500 @@ -1,11 +1,15 @@ - + according to random barcode library. macros.xml - python merge_pcr_duplicates.py --version + + gnu_awk + gnu_coreutils + + python $__tool_directory__/merge_pcr_duplicates.py --version $default]]> - +