Mercurial > repos > davidvanzessen > mutation_analysis
changeset 19:c518cf0d4adb draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 01 Apr 2015 05:09:59 -0400 |
parents | 9dd557f17153 |
children | 850857bc8605 |
files | gene_identification.py merge_and_filter.r mutation_analysis.xml subclass_definition.db.nhr subclass_definition.db.nin subclass_definition.db.nsq wrapper.sh |
diffstat | 7 files changed, 68 insertions(+), 27 deletions(-) [+] |
line wrap: on
line diff
--- a/gene_identification.py Mon Mar 30 07:58:36 2015 -0400 +++ b/gene_identification.py Wed Apr 01 05:09:59 2015 -0400 @@ -5,7 +5,7 @@ parser = argparse.ArgumentParser() parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file") -parser.add_argument("--output", help="The annotated summary output file") +parser.add_argument("--output", help="The annotated output file to be merged back with the summary file") args = parser.parse_args() @@ -112,16 +112,17 @@ currentIDHits = hits[ID] seq = dic[ID] lastindex = 0 - start = [0] * len(seq) + start_zero = len(searchstrings[key]) #allows the reference sequence to start before search sequence (start_locations of < 0) + start = [0] * (len(seq) + start_zero) for i, regexp in enumerate(regularexpressions): #for every regular expression relativeStartLocation = lastindex - (chunklength / 2) * i - if relativeStartLocation < 0 or relativeStartLocation >= len(seq): + if relativeStartLocation >= len(seq): break regex, hasVar = regexp matches = regex.finditer(seq[lastindex:]) for match in matches: #for every match with the current regex, only uses the first hit lastindex += match.start() - start[relativeStartLocation] += 1 + start[relativeStartLocation + start_zero] += 1 if hasVar: #if the regex has a variable nt in it chunkstart = chunklength / 2 * i #where in the reference does this chunk start chunkend = chunklength / 2 * i + chunklength #where in the reference does this chunk end @@ -140,7 +141,7 @@ continue #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i] currentIDHits[key + "_hits"] += 1 - start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if len(start) > 0 and max(start) > 1]) + start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1 - start_zero) for x in range(5) if len(start) > 0 and max(start) > 1]) #start_location[ID + "_" + key] = str(start.index(max(start))) @@ -161,7 +162,7 @@ for line in f: total += 1 if first: - o.write(line.rstrip() + "\tbest_match\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") + o.write("Sequence ID\tbest_match\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n") first = False continue linesplt = line.split("\t") @@ -169,6 +170,7 @@ pass ID = linesplt[1] currentIDHits = hits[ID] + print currentIDHits possibleca = float(len(compiledregex["ca"])) possiblecg = float(len(compiledregex["cg"])) possiblecm = float(len(compiledregex["cm"])) @@ -179,24 +181,24 @@ ca1hits = currentIDHits["ca1"] ca2hits = currentIDHits["ca2"] if ca1hits >= ca2hits: - o.write(line.rstrip() + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") + o.write(ID + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") else: - o.write(line.rstrip() + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") + o.write(ID + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") elif cghits >= cahits and cghits >= cmhits: #its a cg gene cg1hits = currentIDHits["cg1"] cg2hits = currentIDHits["cg2"] cg3hits = currentIDHits["cg3"] cg4hits = currentIDHits["cg4"] if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene - o.write(line.rstrip() + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + o.write(ID + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") elif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene - o.write(line.rstrip() + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + o.write(ID + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene - o.write(line.rstrip() + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + o.write(ID + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") else: #cg4 gene - o.write(line.rstrip() + "\tcg4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + o.write(ID + "\tcg4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") else: #its a cm gene - o.write(line.rstrip() + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + o.write(ID + "\tcm\t0\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n") seq_write_count += 1 print "Time: %i" % (int(time.time() * 1000) - starttime)
--- a/merge_and_filter.r Mon Mar 30 07:58:36 2015 -0400 +++ b/merge_and_filter.r Wed Apr 01 05:09:59 2015 -0400 @@ -5,28 +5,40 @@ mutationanalysisfile = args[2] mutationstatsfile = args[3] hotspotsfile = args[4] -output = args[5] -unmatchedfile = args[6] +gene_identification_file= args[5] +output = args[6] +unmatchedfile = args[7] +method=args[8] summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F) mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\t", fill=T, stringsAsFactors=F) mutationstats = read.table(mutationstatsfile, header=T, sep="\t", fill=T, stringsAsFactors=F) hotspots = read.table(hotspotsfile, header=T, sep="\t", fill=T, stringsAsFactors=F) +gene_identification = read.table(gene_identification_file, header=T, sep="\t", fill=T, stringsAsFactors=F) +if(method == "blastn"){ + "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" + gene_identification = gene_identification[!duplicated(gene_identification$qseqid),] + ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52)) + gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T) + gene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100 + gene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")] + colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match") + +} +summ = merge(summ, gene_identification, by="Sequence.ID") summ = summ[summ$Functionality != "No results",] higher_than=(summ$chunk_hit_percentage >= 70 & summ$nt_hit_percentage >= 70) -tmp = summ[higher_than,] unmatched = summ[!higher_than,] unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] -summ = tmp -rm(tmp) +summ = summ[higher_than,] if(length(summ$Sequence.ID) == 0){ stop("No data remaining after filter") } -result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-2])], by="Sequence.ID") +result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID") result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID") result = merge(result, hotspots[,!(names(hotspots) %in% names(result)[-1])], by="Sequence.ID")
--- a/mutation_analysis.xml Mon Mar 30 07:58:36 2015 -0400 +++ b/mutation_analysis.xml Wed Apr 01 05:09:59 2015 -0400 @@ -1,10 +1,14 @@ <tool id="mutation_analysis_shm" name="Mutation Analysis" version="1.0"> <description></description> <command interpreter="bash"> - wrapper.sh $in_file $out_file $out_file.files_path ${in_file.name} + wrapper.sh $in_file $method $out_file $out_file.files_path ${in_file.name} </command> <inputs> <param name="in_file" type="data" label="IMGT zip file to be analysed" /> + <param name="method" type="select" label="Identification method" help="" > + <option value="custom" selected="true">custom</option> + <option value="blastn">blastn</option> + </param> </inputs> <outputs> <data format="html" name="out_file" label = "Mutation analysis on ${in_file.name}"/>
--- a/wrapper.sh Mon Mar 30 07:58:36 2015 -0400 +++ b/wrapper.sh Wed Apr 01 05:09:59 2015 -0400 @@ -2,9 +2,10 @@ set -e dir="$(cd "$(dirname "$0")" && pwd)" input=$1 -output=$2 -outdir=$3 -title=$4 +method=$2 +output=$3 +outdir=$4 +title=$5 mkdir $outdir unzip $input -d $PWD/files/ > $PWD/unziplog.log @@ -13,13 +14,35 @@ cat $PWD/files/*/8_* > $PWD/mutationstats.txt cat $PWD/files/*/10_* > $PWD/hotspots.txt -echo "${BLASTN}" +BLASTN_DIR="/home/galaxy/tmp/blast/ncbi-blast-2.2.30+/bin" + +echo "${BLASTN_DIR}" + + -echo "identification" -python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/annotatedsummary.txt +echo "identification ($method)" + +if [[ "${method}" == "custom" ]] ; then + python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt +else + ID_index=$(cat $PWD/summary.txt | grep -o -P ".+Sequence ID" | grep -o -P "\t" | wc -l) + ID_index=$((ID_index+1)) + sequence_index=$(cat $PWD/summary.txt | grep -o -P ".+\tSequence" | grep -o -P "\t" | wc -l) + sequence_index=$((sequence_index+1)) + + echo "$ID_index ${sequence_index}" + + cat $PWD/summary.txt | tail -n+2 | cut -f ${ID_index},${sequence_index} | awk '{print ">" $1 "\n" $2}' > $PWD/sequences.fasta + + echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt + ${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt +fi + + + echo "merging" -Rscript $dir/merge_and_filter.r $outdir/annotatedsummary.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/merged.txt $outdir/unmatched.txt +Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/unmatched.txt $method genes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm" echo "R mutation analysis"