Mercurial > repos > davidvanzessen > combined_immune_repertoire_pipeline
changeset 9:8d83319a0f3d draft
Uploaded
| author | davidvanzessen | 
|---|---|
| date | Tue, 10 Dec 2013 05:53:08 -0500 | 
| parents | 00d432c66fb8 | 
| children | a5c224bb0be5 | 
| files | RScript.r combined.sh combined.xml combined_imgt.xml igblastmerge.py imgtconvert.py imgtconvert.sh tool_dependencies.xml | 
| diffstat | 8 files changed, 421 insertions(+), 150 deletions(-) [+] | 
line wrap: on
 line diff
--- a/RScript.r Mon Nov 25 09:21:55 2013 -0500 +++ b/RScript.r Tue Dec 10 05:53:08 2013 -0500 @@ -1,4 +1,4 @@ -options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) +#options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) args <- commandArgs(trailingOnly = TRUE) @@ -30,11 +30,6 @@ test = test[test$Sample != "",] -if("Replicate" %in% colnames(test)) -{ - test$SRID = do.call(paste, c(test[c("Sample", "Replicate")], sep = "-")) -} - test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene) test$Top.D.Gene = gsub("[*]([0-9]+)", "", test$Top.D.Gene) test$Top.J.Gene = gsub("[*]([0-9]+)", "", test$Top.J.Gene) @@ -43,6 +38,9 @@ test$VDJCDR3 = do.call(paste, c(test[unlist(strsplit(clonalType, ","))], sep = ":")) PROD = test[test$VDJ.Frame != "In-frame with stop codon" & test$VDJ.Frame != "Out-of-frame" & test$CDR3.Found.How != "NOT_FOUND" , ] +if("Functionality" %in% colnames(test)) { + PROD = test[test$Functionality == "productive" | test$Functionality == "productive (see comment)", ] +} NONPROD = test[test$VDJ.Frame == "In-frame with stop codon" | test$VDJ.Frame == "Out-of-frame" | test$CDR3.Found.How == "NOT_FOUND" , ] @@ -74,7 +72,7 @@ PRODFJ = merge(PRODFJ, Total, by.x='Sample', by.y='Sample', all.x=TRUE) PRODFJ = ddply(PRODFJ, c("Sample", "Top.J.Gene"), summarise, relFreq= (Length*100 / Total)) -V = c("v.name\tchr.orderV\nIGHV7-81\t1\nIGHV3-74\t2\nIGHV3-73\t3\nIGHV3-72\t4\nIGHV3-71\t5\nIGHV2-70\t6\nIGHV1-69\t7\nIGHV3-66\t8\nIGHV3-64\t9\nIGHV4-61\t10\nIGHV4-59\t11\nIGHV1-58\t12\nIGHV3-53\t13\nIGHV3-52\t14\nIGHV5-a\t15\nIGHV5-51\t16\nIGHV3-49\t17\nIGHV3-48\t18\nIGHV3-47\t19\nIGHV1-46\t20\nIGHV1-45\t21\nIGHV3-43\t22\nIGHV4-39\t23\nIGHV3-35\t24\nIGHV4-34\t25\nIGHV3-33\t26\nIGHV4-31\t27\nIGHV4-30-4\t28\nIGHV4-30-2\t29\nIGHV3-30-3\t30\nIGHV3-30\t31\nIGHV4-28\t32\nIGHV2-26\t33\nIGHV1-24\t34\nIGHV3-23\t35\nIGHV3-22\t36\nIGHV3-21\t37\nIGHV3-20\t38\nIGHV3-19\t39\nIGHV1-18\t40\nIGHV3-15\t41\nIGHV3-13\t42\nIGHV3-11\t43\nIGHV3-9\t44\nIGHV1-8\t45\nIGHV3-7\t46\nIGHV2-5\t47\nIGHV7-4-1\t48\nIGHV4-4\t49\nIGHV4-b\t50\nIGHV1-3\t51\nIGHV1-2\t52\nIGHV6-1\t53") +V = c("v.name\tchr.orderV\nIGHV7-81\t1\nIGHV3-74\t2\nIGHV3-73\t3\nIGHV3-72\t4\nIGHV2-70\t6\nIGHV1-69\t7\nIGHV3-66\t8\nIGHV3-64\t9\nIGHV4-61\t10\nIGHV4-59\t11\nIGHV1-58\t12\nIGHV3-53\t13\nIGHV5-a\t15\nIGHV5-51\t16\nIGHV3-49\t17\nIGHV3-48\t18\nIGHV1-46\t20\nIGHV1-45\t21\nIGHV3-43\t22\nIGHV4-39\t23\nIGHV3-35\t24\nIGHV4-34\t25\nIGHV3-33\t26\nIGHV4-31\t27\nIGHV4-30-4\t28\nIGHV4-30-2\t29\nIGHV3-30-3\t30\nIGHV3-30\t31\nIGHV4-28\t32\nIGHV2-26\t33\nIGHV1-24\t34\nIGHV3-23\t35\nIGHV3-21\t37\nIGHV3-20\t38\nIGHV1-18\t40\nIGHV3-15\t41\nIGHV3-13\t42\nIGHV3-11\t43\nIGHV3-9\t44\nIGHV1-8\t45\nIGHV3-7\t46\nIGHV2-5\t47\nIGHV7-4-1\t48\nIGHV4-4\t49\nIGHV4-b\t50\nIGHV1-3\t51\nIGHV1-2\t52\nIGHV6-1\t53") tcV = textConnection(V) Vchain = read.table(tcV, sep="\t", header=TRUE) PRODFV = merge(PRODFV, Vchain, by.x='Top.V.Gene', by.y='v.name', all.x=TRUE) @@ -95,6 +93,8 @@ setwd(outDir) +write.table(PRODF, "allUnique.tsv", sep="\t",quote=F,row.names=T,col.names=T) + pV = ggplot(PRODFV) pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") @@ -131,7 +131,7 @@ img = ggplot() + geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + - scale_fill_gradient(low="gold", high="blue", na.value="white", limits=c(0,1)) + + scale_fill_gradient(low="gold", high="blue", na.value="white") + ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + xlab("D genes") + ylab("V Genes") @@ -166,7 +166,7 @@ img = ggplot() + geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + - scale_fill_gradient(low="gold", high="blue", na.value="white", limits=c(0,1)) + + scale_fill_gradient(low="gold", high="blue", na.value="white") + ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + xlab("J genes") + ylab("V Genes") @@ -198,7 +198,7 @@ img = ggplot() + geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=relLength)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + - scale_fill_gradient(low="gold", high="blue", na.value="white", limits=c(0,1)) + + scale_fill_gradient(low="gold", high="blue", na.value="white") + ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + xlab("J genes") + ylab("D Genes") @@ -229,3 +229,88 @@ un = paste(un, sep="\n") writeLines(un, sampleFile) close(sampleFile) + + +if("Replicate" %in% colnames(test)) +{ + clonalityFrame = PROD + clonalityFrame$ReplicateConcat = do.call(paste, c(clonalityFrame[c("VDJCDR3", "Sample", "Replicate")], sep = ":")) + clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$ReplicateConcat), ] + write.table(clonalityFrame, "clonalityComplete.tsv", sep="\t",quote=F,row.names=T,col.names=T) + + ClonalitySampleReplicatePrint <- function(dat){ + write.table(dat, paste("clonality_", unique(dat$Sample) , "_", unique(dat$Replicate), ".tsv", sep=""), sep="\t",quote=F,row.names=T,col.names=T) + } + + clonalityFrameSplit = split(clonalityFrame, f=clonalityFrame[,c("Sample", "Replicate")]) + lapply(clonalityFrameSplit, FUN=ClonalitySampleReplicatePrint) + + ClonalitySamplePrint <- function(dat){ + write.table(dat, paste("clonality_", unique(dat$Sample) , ".tsv", sep=""), sep="\t",quote=F,row.names=T,col.names=T) + } + + clonalityFrameSplit = split(clonalityFrame, f=clonalityFrame[,"Sample"]) + lapply(clonalityFrameSplit, FUN=ClonalitySamplePrint) + + clonalFreq = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "VDJCDR3")]) + clonalFreqCount = data.frame(data.table(clonalFreq)[, list(Count=.N), by=c("Sample", "Type")]) + clonalFreqCount$realCount = clonalFreqCount$Type * clonalFreqCount$Count + clonalSum = data.frame(data.table(clonalFreqCount)[, list(Reads=sum(realCount)), by=c("Sample")]) + clonalFreqCount = merge(clonalFreqCount, clonalSum, by.x="Sample", by.y="Sample") + + ct = c('Type\tWeight\n2\t1\n3\t3\n4\t6\n5\t10\n6\t15') + tcct = textConnection(ct) + CT = read.table(tcct, sep="\t", header=TRUE) + close(tcct) + clonalFreqCount = merge(clonalFreqCount, CT, by.x="Type", by.y="Type", all.x=T) + clonalFreqCount$WeightedCount = clonalFreqCount$Count * clonalFreqCount$Weight + + ReplicateReads = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "Replicate", "VDJCDR3")]) + ReplicateReads = data.frame(data.table(ReplicateReads)[, list(Reads=.N), by=c("Sample", "Replicate")]) + ReplicateReads$squared = ReplicateReads$Reads * ReplicateReads$Reads + + ReplicatePrint <- function(dat){ + write.table(dat[-1], paste("ReplicateReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) + } + + ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) + lapply(ReplicateSplit, FUN=ReplicatePrint) + + ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(Reads), ReadsSquaredSum=sum(squared)), by=c("Sample")]) + clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T) + + + ReplicateSumPrint <- function(dat){ + write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) + } + + ReplicateSumSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) + lapply(ReplicateSumSplit, FUN=ReplicateSumPrint) + + clonalFreqCountSum = data.frame(data.table(clonalFreqCount)[, list(Numerator=sum(WeightedCount, na.rm=T)), by=c("Sample")]) + clonalFreqCount = merge(clonalFreqCount, clonalFreqCountSum, by.x="Sample", by.y="Sample", all.x=T) + + clonalFreqCount$Denominator = (((clonalFreqCount$ReadsSum * clonalFreqCount$ReadsSum) - clonalFreqCount$ReadsSquaredSum) / 2) + clonalFreqCount$Result = (clonalFreqCount$Numerator + 1) / (clonalFreqCount$Denominator + 1) + + ClonalityScorePrint <- function(dat){ + write.table(dat$Result, paste("ClonalityScore_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) + } + + clonalityScore = clonalFreqCount[c("Sample", "Result")] + clonalityScore = unique(clonalityScore) + + clonalityScoreSplit = split(clonalityScore, f=clonalityScore[,"Sample"]) + lapply(clonalityScoreSplit, FUN=ClonalityScorePrint) + + clonalityOverview = clonalFreqCount[c("Sample", "Type", "Count", "Weight", "WeightedCount")] + + + + ClonalityOverviewPrint <- function(dat){ + write.table(dat[-1], paste("ClonalityOverView_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) + } + + clonalityOverviewSplit = split(clonalityOverview, f=clonalityOverview$Sample) + lapply(clonalityOverviewSplit, FUN=ClonalityOverviewPrint) +}
--- a/combined.sh Mon Nov 25 09:21:55 2013 -0500 +++ b/combined.sh Tue Dec 10 05:53:08 2013 -0500 @@ -1,55 +1,42 @@ #!/bin/bash -#export IGDATA=/home/david/Galaxy/galaxy-dist/toolsheddependencies/igBlastn/1.0.0/davidvanzessen/igblast_human/1c64c977624e/ncbi-igblast-1.0.0/; +export IGDATA=/home/david/tmp/ncbi-igblast-1.0.0; clonalType=${@:(-3):1} html=${@:(-2):1} imageDir=${@:(-1):1} +dataCount=`expr $# - 3` +inputData=${@:(1):dataCount} dir="$(cd "$(dirname "$0")" && pwd)" -fileCount=`expr $# - 3` array=("$@") echo "<html><h3>Progress</h3><table><tr><td>info</td></tr>" > $html echo "<tr><td>-----------------------------------</td></tr>" >> $html -limit=`expr $fileCount / 2` -function blastAndParse { - echo "<tr><td>Starting blast of $2</td></tr>" >> $html - fileName=$(basename $1) - $IGDATA/bin/igblastn -germline_db_V $IGDATA/database/human_gl_V -germline_db_J $IGDATA/database/human_gl_J -germline_db_D $IGDATA/database/human_gl_D -domain_system imgt -query $1 -auxiliary_data $IGDATA/optional_file/human_gl.aux -show_translation -outfmt 3 > $PWD/$fileName - echo "<tr><td>Finished blast of $2</td></tr>" >> $html - - echo "<tr><td>Starting parse of $2</td></tr>" >> $html - parsedFileName="${fileName%.*}" - parsedFileName="$parsedFileName.parsed" - perl $dir/igparse.pl $PWD/$fileName 0 | grep -v "D:" | cut -f2- > $parsedFileName - echo "<tr><td>Finished parse of $2</td></tr>" >> $html -} - -for ((i=0;i<$fileCount;i=$((i+2)))) -do - next=$((i+1)) - blastAndParse ${array[$i]} ${array[$next]} & +id=${inputData[0]} +forwardSlash="/" +mergerInput=() +count=0 +for current in $inputData; do + if [[ "$current" != *"$forwardSlash"* ]]; then + id=$current + count=0 + mergerInput+=($id) + continue + fi + fileName=$(basename $current) + convertedFileName="${fileName%.*}" + convertedFileName="$PWD/$convertedFileName.converted" + bash $dir/imgtconvert.sh $current $id $count $convertedFileName + mergerInput+=($convertedFileName) + count=$((count+1)) done -wait echo "<tr><td>-----------------------------------</td></tr>" >> $html echo "<tr><td>merging</td></tr>" >> $html -count=0 -for ((i=0;i<$fileCount;i=$((i+2)))) -do - id=$((i+1)) - place=$((count+limit)) - fn=$(basename ${array[$i]}) - fn="${fn%.*}" - mergeInputs[$count]="$PWD/$fn.parsed" - mergeIDs[$place]=${array[$id]} - count=$((count+1)) -done - -python $dir/igblastmerge.py --input ${mergeInputs[*]} --id ${mergeIDs[*]} --output $PWD/merged.txt +python $dir/igblastmerge.py ${mergerInput[*]} --output $PWD/merged.txt echo "<tr><td>done</td></tr>" >> $html echo "<tr><td>-----------------------------------</td></tr>" >> $html @@ -68,10 +55,43 @@ samples=`cat $outputDir/samples.txt` count=1 -echo "<table border='1'><caption><h3>$clonalType</h3></caption>" >> $outputFile +echo "<table border='1'><caption><a href='allUnique.tsv'><h3>$clonalType</h3></a></caption>" >> $outputFile +hasReplicateColumn="$(if head -n 1 $inputFile | grep -q 'Replicate'; then echo 'Yes'; else echo 'No'; fi)" for sample in $samples; do - echo "<tr><td colspan='3' height='100'></td>" >> $outputFile + clonalityScore="$(cat $outputDir/ClonalityScore_$sample.csv)" + echo "<tr><td colspan='3' height='100'></td></tr>" >> $outputFile echo "<tr><td colspan='3'><h1>$sample</h1></td></tr>" >> $outputFile + + echo "$hasReplicateColumn" + #if its a 'new' merged file with replicate info + if [[ "$hasReplicateColumn" == "Yes" ]] ; then + echo "<tr><td colspan='3'><a href='clonality_$sample.tsv'><h2>Clonality Score: $clonalityScore</h2></a></td></tr>" >> $outputFile + + #replicate,reads,squared + echo "<tr><td colspan='3'><table border='1'><tr><th>Replicate ID</th><th>Number of Reads</th><th>Reads Squared</th></tr>" >> $outputFile + while IFS=, read replicate reads squared + do + + echo "<tr><td><a href='clonality_${sample}_$replicate.tsv'>$replicate</a></td><td>$reads</td><td>$squared</td></tr>" >> $outputFile + done < $outputDir/ReplicateReads_$sample.csv + + #sum of reads and reads squared + while IFS=, read readsSum squaredSum + do + echo "<tr><td>Sum</td><td>$readsSum</td><td>$squaredSum</td></tr>" >> $outputFile + done < $outputDir/ReplicateSumReads_$sample.csv + + echo "</table></td></tr>" >> $outputFile + + #overview + echo "<tr><td colspan='3'><table border='1'><tr><th>Coincidence Type</th><th>Raw Coincidence Freq</th><th>Coincidence Weight</th><th>Coincidences, Weighted</th></tr>" >> $outputFile + while IFS=, read type count weight weightedCount + do + echo "<tr><td>$type</td><td>$count</td><td>$weight</td><td>$weightedCount</td></tr>" >> $outputFile + done < $outputDir/ClonalityOverView_$sample.csv + echo "</table></td></tr>" >> $outputFile + fi + echo "<tr><td><h2>V-D Heatmap:</h2></td><td><h2>V-J Heatmap:</h2></td><td><h2>D-J Heatmap:</h2></td></tr><tr>" >> $outputFile mv "$outputDir/HeatmapVD_$sample.png" "$outputDir/VD_$sample.png" echo "<td><img src='VD_$sample.png'/></td>" >> $outputFile @@ -83,4 +103,4 @@ done echo "</table>" >> $outputFile -echo "</html>" >> $2 +echo "</html>" >> $outputFile
--- a/combined.xml Mon Nov 25 09:21:55 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ -<tool id="immunerepertoirecombined" name="Immunerepertoire combined" version="1.0"> - <description>pipeline</description> - <command interpreter="bash"> - combined.sh - #for $i, $f in enumerate($files) - ${f.file} - ${f.id} - #end for - "$clonaltype_select" $out_file $out_file.files_path - </command> - <inputs> - <repeat name="files" title="File" min="1" default="1"> - <param name="file" format="fasta" type="data" label="Data to Process" /> - <param name="id" type="text" label="ID" /> - </repeat> - <param name="clonaltype_select" type="select" label="Clonal Type Definition"> - <option value="Top.V.Gene,CDR3.Seq">Top.V.Gene, CDR3.Seq</option> - <option value="Top.V.Gene,CDR3.Seq.DNA">Top.V.Gene, CDR3.Seq.DNA</option> - <option value="Top.V.Gene,Top.J.Gene,CDR3.Seq">Top.V.Gene, Top.J.Gene, CDR3.Seq</option> - <option value="Top.V.Gene,Top.J.Gene,CDR3.Seq.DNA">Top.V.Gene, Top.J.Gene, CDR3.Seq.DNA</option> - <option value="Top.V.Gene,Top.D.Gene,Top.J.Gene,CDR3.Seq.DNA">Top.V.Gene, Top.D.Gene, Top.J.Gene, CDR3.Seq.DNA</option> - </param> - </inputs> - <outputs> - <data format="html" name="out_file" /> - </outputs> - <help> - The entire Immune Repertoire pipeline as a single tool, input several FASTA files, give them an ID and it will BLAST, parse, merge and plot them. - </help> - <requirements> - <requirement type="package" version="1.0.0">igBlastn</requirement> - </requirements> -</tool> -
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/combined_imgt.xml Tue Dec 10 05:53:08 2013 -0500 @@ -0,0 +1,35 @@ +<tool id="immunerepertoirecombined_imgt" name="Immunerepertoire combined" version="1.0"> + <description>pipeline for IMGT</description> + <command interpreter="bash"> + combined.sh + #for $i, $f in enumerate($patients) + ${f.id} + #for $j, $g in enumerate($f.samples) + ${g.sample} + #end for + #end for + "$clonaltype_select" $out_file $out_file.files_path + </command> + <inputs> + <repeat name="patients" title="Patients" min="1" default="1"> + <repeat name="samples" title="Sample" min="1" default="1"> + <param name="sample" type="data" label="Sample to Process" /> + </repeat> + <param name="id" type="text" label="ID" /> + </repeat> + <param name="clonaltype_select" type="select" label="Clonal Type Definition"> + <option value="Top.V.Gene,CDR3.Seq">Top.V.Gene, CDR3.Seq</option> + <option value="Top.V.Gene,CDR3.Seq.DNA">Top.V.Gene, CDR3.Seq.DNA</option> + <option value="Top.V.Gene,Top.J.Gene,CDR3.Seq">Top.V.Gene, Top.J.Gene, CDR3.Seq</option> + <option value="Top.V.Gene,Top.J.Gene,CDR3.Seq.DNA">Top.V.Gene, Top.J.Gene, CDR3.Seq.DNA</option> + <option value="Top.V.Gene,Top.D.Gene,Top.J.Gene,CDR3.Seq.DNA">Top.V.Gene, Top.D.Gene, Top.J.Gene, CDR3.Seq.DNA</option> + </param> + </inputs> + <outputs> + <data format="html" name="out_file" /> + </outputs> + <help> + The entire Immune Repertoire pipeline as a single tool, input several IMGT zip files, give them an ID and it will parse, merge and plot them. + </help> +</tool> +
--- a/igblastmerge.py Mon Nov 25 09:21:55 2013 -0500 +++ b/igblastmerge.py Tue Dec 10 05:53:08 2013 -0500 @@ -1,4 +1,3 @@ -import argparse import sys # error def stop_err( msg ): @@ -7,35 +6,45 @@ # main def main(): - parser = argparse.ArgumentParser() #docs.python.org/dev/library/argparse.html - parser.add_argument("--input", help="Input file(s)", nargs="+") - parser.add_argument("--id", help="Input file(s) id's", nargs="+") - parser.add_argument("--output", help="Output file") - - args = parser.parse_args() - try: - o = open(args.output, 'w') - i = open(args.input[-1], 'r') - separator = "\t" - newline = "\n" - header = "Sample" - line = i.readline() - o.write(line[:line.rfind(newline)] + separator + header + newline) #write the header - i.close() - - for cf,i in zip(args.input,args.id): - f = open(cf, 'r') - line = f.readline() - line = f.readline() #skip header - while line: - o.write(line[:line.rfind(newline)] + separator + i + newline) - line = f.readline() - f.close() - o.close() + args = sys.argv[1:-2] - except Exception, ex: - stop_err('Error running new_column.py\n' + str(ex)) - sys.exit(0) + try: + o = open(sys.argv[-1], 'w') + i = open(args[1], 'r') + separator = "\t" + newline = "\n" + line = i.readline() + #write the header + o.write(line[:line.rfind(newline)] + separator + "Sample" + separator + "Replicate" + newline) + i.close() + + current = 1 + sampleID = args[0] + count = 1 + + while True: + f = open(args[current], 'r') + line = f.readline() + line = f.readline() + while line: + o.write(line[:line.rfind(newline)] + separator + sampleID + separator + str(count) + newline) + line = f.readline() + f.close() + + if current >= (len(args) - 1): + break + if args[current + 1].find("/") is -1: + sampleID = args[current + 1] + current += 1 + count = 1 + else: + count += 1 + current += 1 + o.close() + + except Exception, ex: + stop_err('Error running new_column.py\n' + str(ex)) + sys.exit(0) if __name__ == "__main__": print sys.argv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/imgtconvert.py Tue Dec 10 05:53:08 2013 -0500 @@ -0,0 +1,139 @@ +import pandas as pd +import re +import argparse +import os + +def stop_err( msg, ret=1 ): + sys.stderr.write( msg ) + sys.exit( ret ) + +#docs.python.org/dev/library/argparse.html +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="Input folder with files") +parser.add_argument("--output", help="Output file") + +args = parser.parse_args() + +old_summary_columns = [u'Sequence ID', u'JUNCTION frame', u'V-GENE and allele', u'D-GENE and allele', u'J-GENE and allele', u'CDR1-IMGT length', u'CDR2-IMGT length', u'CDR3-IMGT length', u'Orientation'] +old_sequence_columns = [u'CDR1-IMGT', u'CDR2-IMGT', u'CDR3-IMGT'] +old_junction_columns = [u'JUNCTION'] + +added_summary_columns = [u'V-REGION identity %', u'V-REGION identity nt', u'D-REGION reading frame', u'AA JUNCTION', u'Functionality comment', u'Sequence'] +added_sequence_columns = [u'FR1-IMGT', u'FR2-IMGT', u'FR3-IMGT', u'CDR3-IMGT', u'JUNCTION', u'J-REGION', u'FR4-IMGT'] +added_junction_columns = [u"P3'V-nt nb", u'N1-REGION-nt nb', u"P5'D-nt nb", u"P3'D-nt nb", u'N2-REGION-nt nb', u"P5'J-nt nb", u"3'V-REGION trimmed-nt nb", u"5'D-REGION trimmed-nt nb", u"3'D-REGION trimmed-nt nb", u"5'J-REGION trimmed-nt nb"] + +inputFolder = args.input + +dirContents = os.listdir(inputFolder) +if len(dirContents) == 1: + inputFolder = os.path.join(inputFolder, dirContents[0]) + if os.path.isdir(inputFolder): + print "is dir" + dirContents = os.listdir(inputFolder) +files = sorted([os.path.join(inputFolder, f) for f in dirContents]) + +if len(files) % 3 is not 0: + stop_err("Files in zip not a multiple of 3, it should contain the all the 1_, 5_ and 6_ files for a sample") + import sys + sys.exit() + + +triplets = [] +step = len(files) / 3 +for i in range(0, step): + triplets.append((files[i], files[i + step], files[i + step + step])) + +outFile = args.output + +fSummary = pd.read_csv(triplets[0][0], sep="\t") +fSequence = pd.read_csv(triplets[0][1], sep="\t") +fJunction = pd.read_csv(triplets[0][2], sep="\t") +tmp = fSummary[["Sequence ID", "JUNCTION frame", "V-GENE and allele", "D-GENE and allele", "J-GENE and allele"]] + +tmp["CDR1 Seq"] = fSequence["CDR1-IMGT"] +tmp["CDR1 Length"] = fSummary["CDR1-IMGT length"] + +tmp["CDR2 Seq"] = fSequence["CDR2-IMGT"] +tmp["CDR2 Length"] = fSummary["CDR2-IMGT length"] + +tmp["CDR3 Seq"] = fSequence["CDR3-IMGT"] +tmp["CDR3 Length"] = fSummary["CDR3-IMGT length"] + +tmp["CDR3 Seq DNA"] = fJunction["JUNCTION"] +tmp["CDR3 Length DNA"] = '1' +tmp["Strand"] = fSummary["Orientation"] +tmp["CDR3 Found How"] = 'a' + +for col in added_summary_columns: + tmp[col] = fSummary[col] + +for col in added_sequence_columns: + tmp[col] = fSequence[col] + +for col in added_junction_columns: + tmp[col] = fJunction[col] + +outFrame = tmp + +for triple in triplets[1:]: + fSummary = pd.read_csv(triple[0], sep="\t") + fSequence = pd.read_csv(triple[1], sep="\t") + fJunction = pd.read_csv(triple[2], sep="\t") + + tmp = fSummary[["Sequence ID", "JUNCTION frame", "V-GENE and allele", "D-GENE and allele", "J-GENE and allele"]] + + tmp["CDR1 Seq"] = fSequence["CDR1-IMGT"] + tmp["CDR1 Length"] = fSummary["CDR1-IMGT length"] + + tmp["CDR2 Seq"] = fSequence["CDR2-IMGT"] + tmp["CDR2 Length"] = fSummary["CDR2-IMGT length"] + + tmp["CDR3 Seq"] = fSequence["CDR3-IMGT"] + tmp["CDR3 Length"] = fSummary["CDR3-IMGT length"] + + tmp["CDR3 Seq DNA"] = fJunction["JUNCTION"] + tmp["CDR3 Length DNA"] = '1' + tmp["Strand"] = fSummary["Orientation"] + tmp["CDR3 Found How"] = 'a' + + for col in added_summary_columns: + tmp[col] = fSummary[col] + + for col in added_sequence_columns: + tmp[col] = fSequence[col] + + for col in added_junction_columns: + tmp[col] = fJunction[col] + + outFrame = outFrame.append(tmp) + +outFrame.columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', 'V-REGION identity %', 'V-REGION identity nt', 'D-REGION reading frame', 'AA JUNCTION', 'Functionality comment', 'Sequence', 'FR1-IMGT', 'FR2-IMGT', 'FR3-IMGT', 'CDR3-IMGT', 'JUNCTION', 'J-REGION', 'FR4-IMGT', 'P3V-nt nb', 'N1-REGION-nt nb', 'P5D-nt nb', 'P3D-nt nb', 'N2-REGION-nt nb', 'P5J-nt nb', '3V-REGION trimmed-nt nb', '5D-REGION trimmed-nt nb', '3D-REGION trimmed-nt nb', '5J-REGION trimmed-nt nb'] + +vPattern = re.compile(r"IGHV[1-9]-[0-9ab]+-?[1-9]?") +dPattern = re.compile(r"IGHD[1-9]-[0-9ab]+") +jPattern = re.compile(r"IGHJ[1-9]") + +def filterGenes(s, pattern): + if type(s) is not str: + return "NA" + res = pattern.search(s) + if res: + return res.group(0) + return "NA" + + +outFrame["Top V Gene"] = outFrame["Top V Gene"].apply(lambda x: filterGenes(x, vPattern)) +outFrame["Top D Gene"] = outFrame["Top D Gene"].apply(lambda x: filterGenes(x, dPattern)) +outFrame["Top J Gene"] = outFrame["Top J Gene"].apply(lambda x: filterGenes(x, jPattern)) + + + +tmp = outFrame["VDJ Frame"] +tmp = tmp.replace("in-frame", "In-frame") +tmp = tmp.replace("null", "Out-of-frame") +tmp = tmp.replace("out-of-frame", "Out-of-frame") +outFrame["VDJ Frame"] = tmp +outFrame["CDR3 Length"] = outFrame["CDR3 Seq DNA"].map(str).map(len) +safeLength = lambda x: len(x) if type(x) == str else 0 +outFrame = outFrame[(outFrame["CDR3 Seq DNA"].map(safeLength) > 0) & (outFrame["Top V Gene"] != "NA") & (outFrame["Top D Gene"] != "NA") & (outFrame["Top J Gene"] != "NA")] #filter out weird rows? +outFrame.to_csv(outFile, sep="\t", index=False) \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/imgtconvert.sh Tue Dec 10 05:53:08 2013 -0500 @@ -0,0 +1,57 @@ +#!/bin/bash +dir="$(cd "$(dirname "$0")" && pwd)" +mkdir $PWD/$2_$3 + + +#!/bin/bash +f=$(file $1) +zip7Type="7-zip archive" +tarType="tar archive" +bzip2Type="bzip2 compressed" +gzipType="gzip compressed" +zipType="Zip archive" +rarType="RAR archive" + +if [[ "$f" == *"$zip7Type"* ]]; then + echo "7-zip" + echo "Trying: 7za e $1 -o$PWD/$2_$3/" + 7za e $1 -o$PWD/$2_$3/ +fi + +if [[ "$f" == *"$tarType"* ]] +then + echo "tar archive" + echo "Trying: tar xvf $1 -C $PWD/$2_$3/" + tar xvf $1 -C $PWD/$2_$3/ +fi + +if [[ "$f" == *"$bzip2Type"* ]] +then + echo "bzip2 compressed data" + echo "Trying: tar jxf $1 -C $PWD/$2_$3/" + tar jxf $1 -C $PWD/$2_$3/ +fi + +if [[ "$f" == *"$gzipType"* ]] +then + echo "gzip compressed data" + echo "Trying: tar xvzf $1 -C $PWD/$2_$3/" + tar xvzf $1 -C $PWD/$2_$3/ +fi + +if [[ "$f" == *"$zipType"* ]] +then + echo "Zip archive" + echo "Trying: unzip $1 -d $PWD/$2_$3/" + unzip $1 -d $PWD/$2_$3/ +fi + +if [[ "$f" == *"$rarType"* ]] +then + echo "RAR archive" + echo "Trying: unrar e $1 $PWD/$2_$3/" + unrar e $1 $PWD/$2_$3/ +fi +find $PWD/$2_$3/ -type f | grep -v "1_Summary_\|5_AA-sequences_\|6_Junction_" | xargs rm -f +python $dir/imgtconvert.py --input $PWD/$2_$3 --output $4 +
--- a/tool_dependencies.xml Mon Nov 25 09:21:55 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <set_environment version="1.0"> - </set_environment> - - <package name="igBlastn" version="1.0.0"> - <install version="1.0"> - <actions> - <action type="download_by_url" target_filename="ncbi-igblast-1.0.0-x64-linux.tar.gz"> - ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/1.0.0/ncbi-igblast-1.0.0-x64-linux.tar.gz - </action> - - - <action type="move_directory_files"> - <source_directory>ncbi-igblast-1.0.0</source_directory> - <destination_directory>$INSTALL_DIR/ncbi-igblast-1.0.0</destination_directory> - </action> - <action type="set_environment"> - <environment_variable name="IGDATA" action="set_to">$INSTALL_DIR/ncbi-igblast-1.0.0</environment_variable> - </action> - <action type="set_environment"> - <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/ncbi-igblast-1.0.0/bin</environment_variable> - </action> - - <action type="make_directory">$INSTALL_DIR/ncbi-igblast-1.0.0/internal_data</action> - <action type="make_directory">$INSTALL_DIR/ncbi-igblast-1.0.0/internal_data/human</action> - - <action type="shell_command">wget -r -np -nH --cut-dirs=4 -R index.html -P $INSTALL_DIR/ncbi-igblast-1.0.0 "ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data"</action> - - <action type="shell_command">wget -r -np -nH --cut-dirs=4 -R index.html -P $INSTALL_DIR/ncbi-igblast-1.0.0 "ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/database"</action> - - <action type="shell_command">wget -r -np -nH --cut-dirs=4 -R index.html -P $INSTALL_DIR/ncbi-igblast-1.0.0 "ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file"</action> - - </actions> - </install> - <readme> - Downloads igBlast with the latest database provided by NCBI, output may change depending on the database available at ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/ - </readme> - </package> -</tool_dependency>
