Mercurial > repos > davidvanzessen > mutation_analysis
changeset 62:4262e880472d draft
Uploaded
author | davidvanzessen |
---|---|
date | Fri, 25 Mar 2016 04:39:18 -0400 |
parents | 64e6a7803e07 |
children | a7381fd96dad |
files | gene_identification.py merge_and_filter.r wrapper.sh |
diffstat | 3 files changed, 77 insertions(+), 30 deletions(-) [+] |
line wrap: on
line diff
--- a/gene_identification.py Fri Mar 18 08:17:08 2016 -0400 +++ b/gene_identification.py Fri Mar 25 04:39:18 2016 -0400 @@ -41,10 +41,11 @@ print "Number of input sequences:", len(dic) #old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc +#old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag #lambda/kappa reference sequence searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg", - "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag", + "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc", "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence compiledregex = {"ca": [], @@ -59,6 +60,12 @@ cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'} +#remove last snp for shorter cg sequence --- note, also change varsInCG +del cg1[132] +del cg2[132] +del cg3[132] +del cg4[132] + #reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap chunklength = 8 @@ -152,7 +159,7 @@ chunksInCM = len(compiledregex["cm"]) requiredChunkPercentage = 0.7 varsInCA = float(len(ca1.keys()) * 2) -varsInCG = float(len(cg1.keys()) * 2) + 1 +varsInCG = float(len(cg1.keys()) * 2) - 1 # -1 because the sliding window doesn't hit the first nt twice varsInCM = 0 requiredVarPercentage = 0.7
--- a/merge_and_filter.r Fri Mar 18 08:17:08 2016 -0400 +++ b/merge_and_filter.r Fri Mar 25 04:39:18 2016 -0400 @@ -32,10 +32,14 @@ } +print(paste("Number of sequences in summary file:", nrow(summ))) + summ = merge(summ, gene_identification, by="Sequence.ID") summ = summ[summ$Functionality != "No results",] +print(paste("Number of sequences after merging with annotation:", nrow(summ))) + if(functionality == "productive"){ summ = summ[summ$Functionality == "productive (see comment)" | summ$Functionality == "productive",] } else if (functionality == "unproductive"){ @@ -44,20 +48,35 @@ summ = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",] } +print(paste("Number of sequences after productive filter:", nrow(summ))) + higher_than=(summ$chunk_hit_percentage >= 70 & summ$nt_hit_percentage >= 70) + +print(paste("test: ", sum(higher_than), sum(!higher_than), (sum(higher_than) + sum(!higher_than)))) + unmatched = summ[!higher_than,] unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] unmatched$best_match = paste("unmatched,", unmatched$best_match) summ = summ[higher_than,] +print(paste("Number of matched sequences:", nrow(summ))) + if(length(summ$Sequence.ID) == 0){ stop("No data remaining after filter") } result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID") + +print(paste("Number of sequences after merging with mutation analysis:", nrow(result))) + result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID") + +print(paste("Number of sequences after merging with mutation stats:", nrow(result))) + result = merge(result, hotspots[,!(names(hotspots) %in% names(result)[-1])], by="Sequence.ID") +print(paste("Number of sequences after merging with hotspots:", nrow(result))) + cleanup_columns = c("FR1.IMGT.Nb.of.mutations", "CDR1.IMGT.Nb.of.mutations", "FR2.IMGT.Nb.of.mutations", @@ -75,6 +94,27 @@ result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) result$JGene = gsub("[*].*", "", result$JGene) +if(filter_unique != "no"){ + clmns = names(result) + + result$unique.def = paste(result$CDR1.Seq, result$CDR2.Seq, result$CDR3.Seq, result$FR1.IMGT, result$FR2.IMGT, result$FR3.IMGT) + result.filtered = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),] + fltr = result$Sequence.ID %in% result.filtered$Sequence.ID + #fltr = result$unique.def %in% result.filtered$unique.def + + result = result[fltr,] + + if(filter_unique == "keep"){ + result = result[!fltr,] + } + + result = result[,clmns] + + #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T) +} + +print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result))) + #result$past = paste(result$AA.JUNCTION, result$VGene, result$JGene, (result$FR1.IMGT.Nb.of.mutations + result$CDR1.IMGT.Nb.of.mutations + result$FR2.IMGT.Nb.of.mutations + result$CDR2.IMGT.Nb.of.mutations + result$FR3.IMGT.Nb.of.mutations), result$best_match) if(unique_type == "AA.JUNCTION_V_subclass"){ result$past = paste(result$AA.JUNCTION, result$VGene, result$best_match) @@ -90,41 +130,21 @@ result$past = 1:nrow(result) } -print(paste("filter uniques: ", filter_unique)) - -if(filter_unique != "no"){ - clmns = names(result) - - result$unique.def = paste(result$CDR1.Seq, result$CDR2.Seq, result$CDR3.Seq, result$FR1.IMGT, result$FR2.IMGT, result$FR3.IMGT) - result.filtered = result[duplicated(result$unique.def),] - fltr = result$unique.def %in% result.filtered$unique.def - - result.removed = result[!fltr,] - - result = result[fltr,] - - if(filter_unique == "keep"){ - result = result.removed - } - - result = result[,clmns] - - #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T) -} - - -result = result[!duplicated(result$past), ] +result = result[!(duplicated(result$past)), ] result = result[,!(names(result) %in% c("past"))] -print(paste("Number of rows in result:", nrow(result))) -print(paste("Number of rows in unmatched:", nrow(unmatched))) - +print(paste("Number of sequences in result after", unique_type, "filtering:", nrow(result))) #remove the sequences that have an 'n' (or 'N') in the FR2, FR3, CDR1 and CDR2 regions. sequences = sequences[grepl("n|N", sequences$FR2.IMGT) | grepl("n|N", sequences$FR3.IMGT) | grepl("n|N", sequences$CDR1.IMGT) | grepl("n|N", sequences$CDR2.IMGT),] result = result[!(result$Sequence.ID %in% sequences$Sequence.ID),] +print(paste("Number of sequences in result after n filtering:", nrow(result))) + +print(paste("Number of rows in result:", nrow(result))) +print(paste("Number of rows in unmatched:", nrow(unmatched))) + write.table(x=result, file=output, sep="\t",quote=F,row.names=F,col.names=T) write.table(x=unmatched, file=unmatchedfile, sep="\t",quote=F,row.names=F,col.names=T)
--- a/wrapper.sh Fri Mar 18 08:17:08 2016 -0400 +++ b/wrapper.sh Fri Mar 25 04:39:18 2016 -0400 @@ -14,6 +14,7 @@ mkdir $outdir echo "---------------- read parameters ----------------" +echo "---------------- read parameters ----------------" > $output echo "unpacking IMGT file" @@ -42,6 +43,7 @@ echo "${BLASTN_DIR}" echo "identification ($method)" +echo "identification ($method)" >> $output echo "blast or custom" @@ -64,10 +66,12 @@ fi echo "---------------- merge_and_filter.r ----------------" +echo "---------------- merge_and_filter.r ----------------" >> $output Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} echo "---------------- mutation_analysis.r ----------------" +echo "---------------- mutation_analysis.r ----------------" >> $output genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm" echo "R mutation analysis" @@ -79,11 +83,13 @@ echo "---------------- mutation_analysis.py ----------------" +echo "---------------- mutation_analysis.py ----------------" >> $output python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $genes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt echo "R AA histogram" echo "---------------- aa_histogram.r ----------------" +echo "---------------- aa_histogram.r ----------------" >> $output Rscript $dir/aa_histogram.r $outdir/aa_mutations.txt $outdir/aa_histogram.png 2>&1 @@ -92,7 +98,21 @@ funcs=(sum mean median) -echo "<html><center><h1>$title</h1></center>" >> $output +echo "<html><center><h1>$title</h1></center>" > $output + +#display the matched/unmatched for clearity + +matched_count="`cat $outdir/merged.txt | tail -n +2 | wc -l`" +unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`" +total_count=$((matched_count + unmatched_count)) +perc_count=$((unmatched_count / total_count * 100)) +perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"` +perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"` + +echo "<center><h2>Total: ${total_count}</h2></center>" >> $output +echo "<center><h2>Matched: ${matched_count} Unmatched: ${unmatched_count}</h2></center>" >> $output +echo "<center><h2>Percentage unmatched: ${perc_count}</h2></center>" >> $output + echo "---------------- main tables ----------------" for func in ${funcs[@]} do