Mercurial > repos > rnateam > bctools
comparison merge_pcr_duplicates.py @ 50:0b9aab6aaebf draft
Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
| author | rnateam |
|---|---|
| date | Tue, 26 Jan 2016 04:38:27 -0500 |
| parents | 570a7de9f151 |
| children | 4bedd35bcdff |
comparison
equal
deleted
inserted
replaced
| 49:303f6402a035 | 50:0b9aab6aaebf |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | |
| 3 import argparse | |
| 4 import logging | |
| 5 from sys import stdout | |
| 6 import pandas as pd | |
| 7 from subprocess import check_call | |
| 8 from shutil import rmtree | |
| 9 from tempfile import mkdtemp | |
| 10 from os.path import isfile | |
| 11 # avoid ugly python IOError when stdout output is piped into another program | |
| 12 # and then truncated (such as piping to head) | |
| 13 from signal import signal, SIGPIPE, SIG_DFL | |
| 14 signal(SIGPIPE, SIG_DFL) | |
| 2 | 15 |
| 3 tool_description = """ | 16 tool_description = """ |
| 4 Merge PCR duplicates according to random barcode library. | 17 Merge PCR duplicates according to random barcode library. |
| 5 | 18 |
| 6 Barcodes containing uncalled base 'N' are removed. By default output is written | 19 Barcodes containing uncalled base 'N' are removed. By default output is written |
| 26 License: Apache | 39 License: Apache |
| 27 Email: maticzkd@informatik.uni-freiburg.de | 40 Email: maticzkd@informatik.uni-freiburg.de |
| 28 Status: Testing | 41 Status: Testing |
| 29 """ | 42 """ |
| 30 | 43 |
| 31 import argparse | |
| 32 import logging | |
| 33 from sys import stdout | |
| 34 from Bio import SeqIO | |
| 35 import pandas as pd | |
| 36 | |
| 37 # avoid ugly python IOError when stdout output is piped into another program | |
| 38 # and then truncated (such as piping to head) | |
| 39 from signal import signal, SIGPIPE, SIG_DFL | |
| 40 signal(SIGPIPE, SIG_DFL) | |
| 41 | |
| 42 | |
| 43 def fasta_tuple_generator(fasta_iterator): | |
| 44 "Yields id, sequence tuples given an iterator over Biopython SeqIO objects." | |
| 45 for record in input_seq_iterator: | |
| 46 yield (record.id, str(record.seq)) | |
| 47 | |
| 48 | |
| 49 # parse command line arguments | 44 # parse command line arguments |
| 50 parser = argparse.ArgumentParser(description=tool_description, | 45 parser = argparse.ArgumentParser(description=tool_description, |
| 51 epilog=epilog, | 46 epilog=epilog, |
| 52 formatter_class=argparse.RawDescriptionHelpFormatter) | 47 formatter_class=argparse.RawDescriptionHelpFormatter) |
| 53 # positional arguments | 48 # positional arguments |
| 54 parser.add_argument( | 49 parser.add_argument( |
| 55 "alignments", | 50 "alignments", |
| 56 help="Path to bed6 file containing alignments.") | 51 help="Path to bed6 file containing alignments.") |
| 57 parser.add_argument( | 52 parser.add_argument( |
| 58 "bclib", | 53 "bclib", |
| 59 help="Path to fasta barcode library.") | 54 help="Path to fastq barcode library.") |
| 60 # optional arguments | 55 # optional arguments |
| 61 parser.add_argument( | 56 parser.add_argument( |
| 62 "-o", "--outfile", | 57 "-o", "--outfile", |
| 63 help="Write results to this file.") | 58 help="Write results to this file.") |
| 64 parser.add_argument( | |
| 65 "--fasta-library", | |
| 66 dest="fasta_library", | |
| 67 action="store_true", | |
| 68 help="Read random barcode library as fasta format.") | |
| 69 # misc arguments | 59 # misc arguments |
| 70 parser.add_argument( | 60 parser.add_argument( |
| 71 "-v", "--verbose", | 61 "-v", "--verbose", |
| 72 help="Be verbose.", | 62 help="Be verbose.", |
| 73 action="store_true") | 63 action="store_true") |
| 76 help="Print lots of debugging information", | 66 help="Print lots of debugging information", |
| 77 action="store_true") | 67 action="store_true") |
| 78 parser.add_argument( | 68 parser.add_argument( |
| 79 '--version', | 69 '--version', |
| 80 action='version', | 70 action='version', |
| 81 version='0.1.0') | 71 version='0.2.0') |
| 82 | 72 |
| 83 args = parser.parse_args() | 73 args = parser.parse_args() |
| 84 | 74 |
| 85 if args.debug: | 75 if args.debug: |
| 86 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") | 76 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") |
| 94 if args.outfile: | 84 if args.outfile: |
| 95 logging.info(" outfile: enabled writing to file") | 85 logging.info(" outfile: enabled writing to file") |
| 96 logging.info(" outfile: '{}'".format(args.outfile)) | 86 logging.info(" outfile: '{}'".format(args.outfile)) |
| 97 logging.info("") | 87 logging.info("") |
| 98 | 88 |
| 99 # load barcode library into dictionary | 89 # see if alignments are empty and the tool can quit |
| 100 input_handle = open(args.bclib, "rU") | 90 n_alns = sum(1 for line in open(args.alignments)) |
| 101 if args.fasta_library: | 91 if n_alns == 0: |
| 102 input_seq_iterator = SeqIO.parse(input_handle, "fasta") | 92 logging.warning("WARNING: Working on empty set of alignments, writing empty output.") |
| 103 else: | 93 eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout) |
| 104 input_seq_iterator = SeqIO.parse(input_handle, "fastq") | 94 eventalnout.close() |
| 105 bcs = pd.DataFrame.from_records( | 95 exit(0) |
| 106 data=fasta_tuple_generator(input_seq_iterator), | |
| 107 columns=["read_id", "bc"]) | |
| 108 | 96 |
| 109 # load alignments | 97 # check input filenames |
| 110 alns = pd.read_csv( | 98 if not isfile(args.bclib): |
| 111 args.alignments, | 99 raise Exception("ERROR: barcode library '{}' not found.") |
| 112 sep="\t", | 100 if not isfile(args.alignments): |
| 113 names=["chrom", "start", "stop", "read_id", "score", "strand"]) | 101 raise Exception("ERROR: alignments '{}' not found.") |
| 114 | 102 |
| 115 # keep id parts up to first whitespace | 103 try: |
| 116 alns["read_id"] = alns["read_id"].str.split(' ').str.get(0) | 104 tmpdir = mkdtemp() |
| 105 logging.debug("tmpdir: " + tmpdir) | |
| 117 | 106 |
| 118 # combine barcode library and alignments | 107 # prepare barcode library |
| 119 bcalib = pd.merge( | 108 syscall1 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort -k1,1 > " + tmpdir + "/bclib.csv" |
| 120 bcs, alns, | 109 check_call(syscall1, shell=True) |
| 121 on="read_id", | 110 |
| 122 how="inner", | 111 # prepare alinments |
| 123 sort=False) | 112 syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort -k4,4 > " + tmpdir + "/alns.csv" |
| 113 check_call(syscall2, shell=True) | |
| 114 | |
| 115 # join barcode library and alignments | |
| 116 syscall3 = "join -1 1 -2 4 " + tmpdir + "/bclib.csv " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}{print $3,$4,$5,$2,$6,$7}' > " + tmpdir + "/bcalib.csv" | |
| 117 check_call(syscall3, shell=True) | |
| 118 | |
| 119 # get alignments combined with barcodes | |
| 120 bcalib = pd.read_csv( | |
| 121 tmpdir + "/bcalib.csv", | |
| 122 sep="\t", | |
| 123 names=["chrom", "start", "stop", "bc", "score", "strand"]) | |
| 124 finally: | |
| 125 logging.debug("removed tmpdir: " + tmpdir) | |
| 126 rmtree(tmpdir) | |
| 127 | |
| 128 # fail if alignments given but combined library is empty | |
| 124 if bcalib.empty: | 129 if bcalib.empty: |
| 125 raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.") | 130 raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.") |
| 126 n_alns = len(alns.index) | 131 |
| 132 # warn if not all alignments could be assigned a barcode | |
| 127 n_bcalib = len(bcalib.index) | 133 n_bcalib = len(bcalib.index) |
| 128 if n_bcalib < n_alns: | 134 if n_bcalib < n_alns: |
| 129 logging.warning( | 135 logging.warning( |
| 130 "{} of {} alignments could not be associated with a random barcode.".format( | 136 "{} of {} alignments could not be associated with a random barcode.".format(n_alns - n_bcalib, n_alns)) |
| 131 n_alns - n_bcalib, n_alns)) | |
| 132 | 137 |
| 133 # remove entries with barcodes that has uncalled base N | 138 # remove entries with barcodes that has uncalled base N |
| 134 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index) | 139 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index) |
| 135 n_bcalib_cleaned = len(bcalib_cleaned) | 140 n_bcalib_cleaned = len(bcalib_cleaned) |
| 136 if n_bcalib_cleaned < n_bcalib: | 141 # if n_bcalib_cleaned < n_bcalib: |
| 137 msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format( | 142 # msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format( |
| 138 n_bcalib - n_bcalib_cleaned, n_bcalib) | 143 # n_bcalib - n_bcalib_cleaned, n_bcalib) |
| 139 if n_bcalib_cleaned < (0.8 * n_bcalib): | 144 # if n_bcalib_cleaned < (0.8 * n_bcalib): |
| 140 logging.warning(msg) | 145 # logging.warning(msg) |
| 141 else: | 146 # else: |
| 142 logging.info(msg) | 147 # logging.info(msg) |
| 143 | 148 |
| 144 # count and merge pcr duplicates | 149 # count and merge pcr duplicates |
| 145 # grouping sorts by keys, so the ouput will be properly sorted | 150 # grouping sorts by keys, so the ouput will be properly sorted |
| 146 merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index() | 151 merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index() |
| 147 merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True) | 152 merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True) |
