Mercurial > repos > davidvanzessen > mutation_analysis
changeset 64:0fdd90f7c654 draft
Uploaded
author | davidvanzessen |
---|---|
date | Fri, 01 Apr 2016 08:54:24 -0400 |
parents | a7381fd96dad |
children | ae8b721a2964 |
files | gene_identification.py merge_and_filter.r wrapper.sh |
diffstat | 3 files changed, 31 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/gene_identification.py Fri Mar 25 07:50:12 2016 -0400 +++ b/gene_identification.py Fri Apr 01 08:54:24 2016 -0400 @@ -25,13 +25,14 @@ with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence for line in f: total += 1 + linesplt = line.split("\t") if first: - linesplt = line.split("\t") + print "linesplt", linesplt IDIndex = linesplt.index("Sequence ID") seqIndex = linesplt.index("Sequence") first = False continue - linesplt = line.split("\t") + ID = linesplt[IDIndex] if len(linesplt) < 28: #weird rows without a sequence dic[ID] = ""
--- a/merge_and_filter.r Fri Mar 25 07:50:12 2016 -0400 +++ b/merge_and_filter.r Fri Apr 01 08:54:24 2016 -0400 @@ -52,16 +52,17 @@ higher_than=(summ$chunk_hit_percentage >= 70 & summ$nt_hit_percentage >= 70) -print(paste("test: ", sum(higher_than), sum(!higher_than), (sum(higher_than) + sum(!higher_than)))) +unmatched=summ[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] -unmatched = summ[!higher_than,] -unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] -unmatched$best_match = paste("unmatched,", unmatched$best_match) -summ = summ[higher_than,] - +if(any(higher_than)){ + unmatched = summ[!higher_than,] + unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] + unmatched$best_match = paste("unmatched,", unmatched$best_match) + summ = summ[higher_than,] +} print(paste("Number of matched sequences:", nrow(summ))) -if(length(summ$Sequence.ID) == 0){ +if(nrow(summ) == 0){ stop("No data remaining after filter") } @@ -98,9 +99,9 @@ clmns = names(result) if(grepl("_c", filter_unique)){ - result$unique.def = paste(result$CDR1.Seq, result$CDR2.Seq, result$CDR3.Seq, result$FR2.IMGT, result$FR3.IMGT, result$best_match) + result$unique.def = paste(result$CDR1.IMGT, result$CDR2.IMGT, result$CDR3.IMGT, result$FR2.IMGT, result$FR3.IMGT, result$best_match) } else { - result$unique.def = paste(result$CDR1.Seq, result$CDR2.Seq, result$CDR3.Seq, result$FR2.IMGT, result$FR3.IMGT) + result$unique.def = paste(result$CDR1.IMGT, result$CDR2.IMGT, result$CDR3.IMGT, result$FR2.IMGT, result$FR3.IMGT) } result.filtered = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),] fltr = result$Sequence.ID %in% result.filtered$Sequence.ID @@ -110,6 +111,7 @@ result = result[!fltr,] } else { result = result[fltr,] + result = result[!duplicated(result$unique.def),] } result = result[,clmns]
--- a/wrapper.sh Fri Mar 25 07:50:12 2016 -0400 +++ b/wrapper.sh Fri Apr 01 08:54:24 2016 -0400 @@ -30,13 +30,23 @@ tar -xJf $input -C $PWD/files/$title fi -cat $PWD/files/*/1_* > $PWD/summary.txt -cat $PWD/files/*/3_* > $PWD/sequences.txt -cat $PWD/files/*/5_* > $PWD/aa.txt -cat $PWD/files/*/6_* > $PWD/junction.txt -cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt -cat $PWD/files/*/8_* > $PWD/mutationstats.txt -cat $PWD/files/*/10_* > $PWD/hotspots.txt +cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt +cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt +cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt +cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt +cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt +cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt +cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt + + + +#cat $PWD/files/*/1_* > $PWD/summary.txt +#cat $PWD/files/*/3_* > $PWD/sequences.txt +#cat $PWD/files/*/5_* > $PWD/aa.txt +#cat $PWD/files/*/6_* > $PWD/junction.txt +#cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt +#cat $PWD/files/*/8_* > $PWD/mutationstats.txt +#cat $PWD/files/*/10_* > $PWD/hotspots.txt #BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"