Mercurial > repos > davidvanzessen > mutation_analysis

--- a/mutation_analysis.r	Thu Aug 11 08:00:00 2016 -0400
+++ b/mutation_analysis.r	Thu Aug 11 10:35:52 2016 -0400
@@ -276,17 +276,28 @@

 		transition2[is.na(transition2$value),]$value = 0

-		png(filename=paste("transitions_stacked_", name, ".png", sep=""))
-		p = ggplot(transition2, aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity") #stacked bar
-		p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + guides(fill=guide_legend(title=NULL))
-		print(p)
-		dev.off()
-
-		png(filename=paste("transitions_heatmap_", name, ".png", sep=""))
-		p = ggplot(transition2, aes(factor(reorder(id, order.x)), factor(reorder(variable, order.y)))) + geom_tile(aes(fill = value), colour="white") + scale_fill_gradient(low="white", high="steelblue") #heatmap
-		p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base")
-		print(p)
-		dev.off()
+		if(!all(transition2$value == 0)){ #having rows of data but a transition table filled with 0 is bad
+
+			print("Plotting stacked transition")
+
+			print(transition2)
+
+			png(filename=paste("transitions_stacked_", name, ".png", sep=""))
+			p = ggplot(transition2, aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity") #stacked bar
+			p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + guides(fill=guide_legend(title=NULL))
+			print(p)
+			dev.off()
+
+			print("Plotting heatmap transition")
+
+			png(filename=paste("transitions_heatmap_", name, ".png", sep=""))
+			p = ggplot(transition2, aes(factor(reorder(id, order.x)), factor(reorder(variable, order.y)))) + geom_tile(aes(fill = value), colour="white") + scale_fill_gradient(low="white", high="steelblue") #heatmap
+			p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base")
+			print(p)
+			dev.off()
+		} else {
+			print("No data to plot")
+		}
 	}

 	#print(paste("writing value file: ", name, "_", fname, "_value.txt" ,sep=""))
--- a/mutation_analysis.xml	Thu Aug 11 08:00:00 2016 -0400
+++ b/mutation_analysis.xml	Thu Aug 11 10:35:52 2016 -0400
@@ -20,10 +20,8 @@
 			<option value="dont_filter">Don't filter</option>
 		</param>
 		<param name="filter_uniques" type="select" label="Filter unique sequences" help="See below for an example.">
-			<option value="yes">Remove uniques (CDR1 + FR2 + CDR2 + FR3 + CDR3)</option>
-			<option value="yes_c">Remove uniques (CDR1 + FR2 + CDR2 + FR3 + CDR3 + C)</option>
-			<option value="keep">Keep uniques (CDR1 + FR2 + CDR2 + FR3 + CDR3)</option>
-			<option value="keep_c">Keep uniques (CDR1 + FR2 + CDR2 + FR3 + CDR3 + C)</option>
+			<option value="remove">Remove uniques (Based on nucleotide sequence + C)</option>
+			<option value="keep">Keep uniques (Based on nucleotide sequence + C)</option>
 			<option value="no" selected="true">No</option>
 		</param>
 		<param name="unique" type="select" label="Remove duplicates based on" help="" >
@@ -40,9 +38,9 @@
 			<option value="60_0">>60% class</option>
 		</param>
 		<param name="empty_region_filter" type="select" label="Sequence starts at" help="" >
-			<option value="FR1" selected="true">FR1 : exclude empty CDR1,FR2,CDR2,FR3</option>
+			<option value="FR1" selected="true">FR1: exclude empty CDR1,FR2,CDR2,FR3</option>
 			<option value="CDR1">CDR1: exclude empty FR2,CDR2,FR3</option>
-			<option value="FR2">FR2: exclude empty,CDR2,FR3</option>
+			<option value="FR2">FR2: exclude empty CDR2,FR3</option>
 		</param>
 		<conditional name="naive_output_cond">
 			<param name="naive_output" type="select" label="Output new IMGT archives per class into your history?">
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/summary_to_fasta.py	Thu Aug 11 10:35:52 2016 -0400
@@ -0,0 +1,42 @@
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="The 1_Summary file of an IMGT zip file")
+parser.add_argument("--fasta", help="The output fasta file")
+
+args = parser.parse_args()
+
+infile = args.input
+fasta = args.fasta
+
+with open(infile, 'r') as i, open(fasta, 'w') as o:
+	first = True
+	id_col = 0
+	seq_col = 0
+	no_results = 0
+	no_seqs = 0
+	passed = 0
+	for line in i:
+		splt = line.split("\t")
+		if first:
+			id_col = splt.index("Sequence ID")
+			seq_col = splt.index("Sequence")
+			first = False
+			continue
+		if len(splt) < 5:
+			no_results += 1
+			continue
+
+		ID = splt[id_col]
+		seq = splt[seq_col]
+
+		if not len(seq) > 0:
+			no_seqs += 1
+			continue
+
+		o.write(">" + ID + "\n" + seq + "\n")
+		passed += 1
+
+	print "No results:", no_results
+	print "No sequences:", no_seqs
+	print "Written to fasta file:", passed
--- a/wrapper.sh	Thu Aug 11 08:00:00 2016 -0400
+++ b/wrapper.sh	Thu Aug 11 10:35:52 2016 -0400
@@ -53,9 +53,12 @@
 #cat $PWD/files/*/8_* > $PWD/mutationstats.txt
 #cat $PWD/files/*/10_* > $PWD/hotspots.txt

-#BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin"
-
-echo "${BLASTN_DIR}"
+if [[ ${#BLASTN_DIR} -ge 5 ]] ; then
+	echo "On server, using BLASTN_DIR env: ${BLASTN_DIR}"
+else
+	BLASTN_DIR="/home/galaxy/Downloads/ncbi-blast-2.4.0+/bin"
+	echo "Dev Galaxy set BLASTN_DIR to: ${BLASTN_DIR}"
+fi

 echo "---------------- identification ($method) ----------------"
 echo "---------------- identification ($method) ----------------<br />" >> $log
@@ -63,16 +66,24 @@
 if [[ "${method}" == "custom" ]] ; then
 	python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
 else
-	ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
-	ID_index=$((ID_index+1))
-	sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
-	sequence_index=$((sequence_index+1))
+	#ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l)
+	#ID_index=$((ID_index+1))
+	#sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l)
+	#sequence_index=$((sequence_index+1))
+
+	#echo "${ID_index}, ${sequence_index}"

-	cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.tmp
+	#cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.tmp
+	#cat $PWD/summary.txt | tail -n+2 | awk -v id="${ID_index}" -v seq="${sequence_index}" 'BEGIN{FS="\t"} if(NF>10 && length($seq) > 0) {print ">" $id "\n" $seq} {}' > $PWD/sequences.fasta
+
+	#cat $PWD/sequences.tmp | grep -B1 -vE ">.*|^$" | grep -v "^\-\-$" > sequences.fasta #filter out empty sequences

-	cat $PWD/sequences.tmp | grep -B1 -vE "^$" sequences.fasta #filter out empty sequences
+	echo "---------------- summary_to_fasta.py ----------------"
+	echo "---------------- summary_to_fasta.py ----------------<br />" >> $log

-	rm $PWD/sequences.tmp
+	python $dir/summary_to_fasta.py --input $PWD/summary.txt --fasta $PWD/sequences.fasta
+
+	#rm $PWD/sequences.tmp

 	echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
 	${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt