Mercurial > repos > davidvanzessen > mutation_analysis
changeset 119:626a956f3811 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 11 Aug 2016 10:35:52 -0400 |
parents | ad7ca9c2b748 |
children | 613278c1bde0 |
files | mutation_analysis.r mutation_analysis.xml summary_to_fasta.py wrapper.sh |
diffstat | 4 files changed, 89 insertions(+), 27 deletions(-) [+] |
line wrap: on
line diff
--- a/mutation_analysis.r Thu Aug 11 08:00:00 2016 -0400 +++ b/mutation_analysis.r Thu Aug 11 10:35:52 2016 -0400 @@ -276,17 +276,28 @@ transition2[is.na(transition2$value),]$value = 0 - png(filename=paste("transitions_stacked_", name, ".png", sep="")) - p = ggplot(transition2, aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity") #stacked bar - p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + guides(fill=guide_legend(title=NULL)) - print(p) - dev.off() - - png(filename=paste("transitions_heatmap_", name, ".png", sep="")) - p = ggplot(transition2, aes(factor(reorder(id, order.x)), factor(reorder(variable, order.y)))) + geom_tile(aes(fill = value), colour="white") + scale_fill_gradient(low="white", high="steelblue") #heatmap - p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") - print(p) - dev.off() + if(!all(transition2$value == 0)){ #having rows of data but a transition table filled with 0 is bad + + print("Plotting stacked transition") + + print(transition2) + + png(filename=paste("transitions_stacked_", name, ".png", sep="")) + p = ggplot(transition2, aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity") #stacked bar + p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + guides(fill=guide_legend(title=NULL)) + print(p) + dev.off() + + print("Plotting heatmap transition") + + png(filename=paste("transitions_heatmap_", name, ".png", sep="")) + p = ggplot(transition2, aes(factor(reorder(id, order.x)), factor(reorder(variable, order.y)))) + geom_tile(aes(fill = value), colour="white") + scale_fill_gradient(low="white", high="steelblue") #heatmap + p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + print(p) + dev.off() + } else { + print("No data to plot") + } } #print(paste("writing value file: ", name, "_", fname, "_value.txt" ,sep=""))
--- a/mutation_analysis.xml Thu Aug 11 08:00:00 2016 -0400 +++ b/mutation_analysis.xml Thu Aug 11 10:35:52 2016 -0400 @@ -20,10 +20,8 @@ <option value="dont_filter">Don't filter</option> </param> <param name="filter_uniques" type="select" label="Filter unique sequences" help="See below for an example."> - <option value="yes">Remove uniques (CDR1 + FR2 + CDR2 + FR3 + CDR3)</option> - <option value="yes_c">Remove uniques (CDR1 + FR2 + CDR2 + FR3 + CDR3 + C)</option> - <option value="keep">Keep uniques (CDR1 + FR2 + CDR2 + FR3 + CDR3)</option> - <option value="keep_c">Keep uniques (CDR1 + FR2 + CDR2 + FR3 + CDR3 + C)</option> + <option value="remove">Remove uniques (Based on nucleotide sequence + C)</option> + <option value="keep">Keep uniques (Based on nucleotide sequence + C)</option> <option value="no" selected="true">No</option> </param> <param name="unique" type="select" label="Remove duplicates based on" help="" > @@ -40,9 +38,9 @@ <option value="60_0">>60% class</option> </param> <param name="empty_region_filter" type="select" label="Sequence starts at" help="" > - <option value="FR1" selected="true">FR1 : exclude empty CDR1,FR2,CDR2,FR3</option> + <option value="FR1" selected="true">FR1: exclude empty CDR1,FR2,CDR2,FR3</option> <option value="CDR1">CDR1: exclude empty FR2,CDR2,FR3</option> - <option value="FR2">FR2: exclude empty,CDR2,FR3</option> + <option value="FR2">FR2: exclude empty CDR2,FR3</option> </param> <conditional name="naive_output_cond"> <param name="naive_output" type="select" label="Output new IMGT archives per class into your history?">
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/summary_to_fasta.py Thu Aug 11 10:35:52 2016 -0400 @@ -0,0 +1,42 @@ +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="The 1_Summary file of an IMGT zip file") +parser.add_argument("--fasta", help="The output fasta file") + +args = parser.parse_args() + +infile = args.input +fasta = args.fasta + +with open(infile, 'r') as i, open(fasta, 'w') as o: + first = True + id_col = 0 + seq_col = 0 + no_results = 0 + no_seqs = 0 + passed = 0 + for line in i: + splt = line.split("\t") + if first: + id_col = splt.index("Sequence ID") + seq_col = splt.index("Sequence") + first = False + continue + if len(splt) < 5: + no_results += 1 + continue + + ID = splt[id_col] + seq = splt[seq_col] + + if not len(seq) > 0: + no_seqs += 1 + continue + + o.write(">" + ID + "\n" + seq + "\n") + passed += 1 + + print "No results:", no_results + print "No sequences:", no_seqs + print "Written to fasta file:", passed
--- a/wrapper.sh Thu Aug 11 08:00:00 2016 -0400 +++ b/wrapper.sh Thu Aug 11 10:35:52 2016 -0400 @@ -53,9 +53,12 @@ #cat $PWD/files/*/8_* > $PWD/mutationstats.txt #cat $PWD/files/*/10_* > $PWD/hotspots.txt -#BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin" - -echo "${BLASTN_DIR}" +if [[ ${#BLASTN_DIR} -ge 5 ]] ; then + echo "On server, using BLASTN_DIR env: ${BLASTN_DIR}" +else + BLASTN_DIR="/home/galaxy/Downloads/ncbi-blast-2.4.0+/bin" + echo "Dev Galaxy set BLASTN_DIR to: ${BLASTN_DIR}" +fi echo "---------------- identification ($method) ----------------" echo "---------------- identification ($method) ----------------<br />" >> $log @@ -63,16 +66,24 @@ if [[ "${method}" == "custom" ]] ; then python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt else - ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l) - ID_index=$((ID_index+1)) - sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l) - sequence_index=$((sequence_index+1)) + #ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l) + #ID_index=$((ID_index+1)) + #sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l) + #sequence_index=$((sequence_index+1)) + + #echo "${ID_index}, ${sequence_index}" - cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.tmp + #cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.tmp + #cat $PWD/summary.txt | tail -n+2 | awk -v id="${ID_index}" -v seq="${sequence_index}" 'BEGIN{FS="\t"} if(NF>10 && length($seq) > 0) {print ">" $id "\n" $seq} {}' > $PWD/sequences.fasta + + #cat $PWD/sequences.tmp | grep -B1 -vE ">.*|^$" | grep -v "^\-\-$" > sequences.fasta #filter out empty sequences - cat $PWD/sequences.tmp | grep -B1 -vE "^$" sequences.fasta #filter out empty sequences + echo "---------------- summary_to_fasta.py ----------------" + echo "---------------- summary_to_fasta.py ----------------<br />" >> $log - rm $PWD/sequences.tmp + python $dir/summary_to_fasta.py --input $PWD/summary.txt --fasta $PWD/sequences.fasta + + #rm $PWD/sequences.tmp echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt ${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt