Mercurial > repos > davidvanzessen > extract_duplicates
changeset 2:1f1640608245 draft default tip
Uploaded
| field | value |
|---|---|
| author | davidvanzessen |
| date | Tue, 01 Sep 2015 08:34:27 -0400 |
| parents | a3c4e3e62e10 |
| children | |
| files | extract_duplicates.r extract_duplicates.sh extract_duplicates.xml |
| diffstat | 3 files changed, 17 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/extract_duplicates.r Tue Sep 01 07:49:47 2015 -0400
+++ b/extract_duplicates.r Tue Sep 01 08:34:27 2015 -0400
@@ -3,14 +3,21 @@
 input=args[1]
 column=as.numeric(args[2])
 header=(args[3] == "yes")
-out_file=args[4]
+regex_filter=args[4]
+out_file=args[5]
+
+print(regex_filter)
 
 dat = read.table(input, header=header, sep="\t", fill=T, stringsAsFactors=F, quote="")
 
-duplicates = dat[duplicated(dat[,column]),column]
+dat.names = names(dat)
 
-dat = dat[dat[,column] %in% duplicates,]
+dat$filtered = gsub("\\(.*", "", dat[,column])
+
+duplicates = dat[duplicated(dat$filtered),"filtered"]
 
-dat = dat[order(dat[,column]),]
+dat = dat[dat[,"filtered"] %in% duplicates,]
 
-write.table(dat, out_file, sep="\t", row.names=F, col.names=header, quote=F)
+dat = dat[order(dat[,"filtered"]),]
+
+write.table(dat[,dat.names], out_file, sep="\t", row.names=F, col.names=header, quote=F)
```
--- a/extract_duplicates.sh Tue Sep 01 07:49:47 2015 -0400 +++ b/extract_duplicates.sh Tue Sep 01 08:34:27 2015 -0400 @@ -1,7 +1,8 @@ input=$1 column=$2 header=$3 -out_file=$4 +regex_filter=$4 +out_file=$5 dir="$(cd "$(dirname "$0")" && pwd)" -Rscript --verbose $dir/extract_duplicates.r ${input} ${column} ${header} ${out_file} 2>&1 +Rscript --verbose $dir/extract_duplicates.r ${input} ${column} ${header} "${regex_filter}" ${out_file} 2>&1
```diff
--- a/extract_duplicates.xml Tue Sep 01 07:49:47 2015 -0400
+++ b/extract_duplicates.xml Tue Sep 01 08:34:27 2015 -0400
@@ -1,11 +1,12 @@
 <tool id="extract_duplicates" name="Extract Duplicates" version="1.1.0">
 <description>to a new dataset</description>
 <command interpreter="bash">
- extract_duplicates.sh $input $column $header $out_file
+ extract_duplicates.sh $input $column $header "$regex_filter" $out_file
 </command>
 <inputs>
 <param format="tabular" name="input" type="data" label="Input"/>
 <param name="column" label="on column" type="data_column" data_ref="input" accept_default="true" />
+ <param name="regex_filter" type="text" label="Filter to sanitize the column" value="\\(.*" size='70'/>
 <param name="header" type="boolean" checked="False" truevalue="yes" falsevalue="no" label="Input file has a header?" help="if checked, the first line of the input will be treated as a header"/>
 </inputs>
 <outputs>
```