# HG changeset patch # User davidvanzessen # Date 1409831942 14400 # Node ID 24d5d9120d93f2039f9576d407e96915a7975d0f # Parent 0dafe2f9ceb8eeb2c1b0aca5c118f8bca6c992cb Uploaded diff -r 0dafe2f9ceb8 -r 24d5d9120d93 complete_immunerepertoire.xml diff -r 0dafe2f9ceb8 -r 24d5d9120d93 imgtconvert.py --- a/imgtconvert.py Thu Sep 04 07:51:11 2014 -0400 +++ b/imgtconvert.py Thu Sep 04 07:59:02 2014 -0400 @@ -32,12 +32,11 @@ if len(dirContents) == 1: inputFolder = os.path.join(inputFolder, dirContents[0]) if os.path.isdir(inputFolder): - print "is dir" dirContents = os.listdir(inputFolder) -files = sorted([os.path.join(inputFolder, f) for f in dirContents]) +files = sorted([os.path.join(inputFolder, f) for f in dirContents if os.path.isfile(os.path.join(inputFolder, f))]) if len(files) % 3 is not 0: - stop_err("Files in zip not a multiple of 3, it should contain the all the 1_, 5_ and 6_ files for a sample") + stop_err("Files in zip not a multiple of 3, it should contain all the 1_, 5_ and 6_ files for a sample") import sys sys.exit() @@ -48,9 +47,12 @@ outFile = args.output -fSummary = pd.read_csv(triplets[0][0], sep="\t") -fSequence = pd.read_csv(triplets[0][1], sep="\t") -fJunction = pd.read_csv(triplets[0][2], sep="\t") +#fSummary = pd.read_csv(triplets[0][0], sep="\t", low_memory=False) +fSummary = pd.read_csv(triplets[0][0], sep="\t", dtype=object) +#fSequence = pd.read_csv(triplets[0][1], sep="\t", low_memory=False) +fSequence = pd.read_csv(triplets[0][1], sep="\t", dtype=object) +#fJunction = pd.read_csv(triplets[0][2], sep="\t", low_memory=False) +fJunction = pd.read_csv(triplets[0][2], sep="\t", dtype=object) tmp = fSummary[["Sequence ID", "JUNCTION frame", "V-GENE and allele", "D-GENE and allele", "J-GENE and allele"]] tmp["CDR1 Seq"] = fSequence["CDR1-IMGT"] @@ -78,10 +80,12 @@ outFrame = tmp + + for triple in triplets[1:]: - fSummary = pd.read_csv(triple[0], sep="\t") - fSequence = pd.read_csv(triple[1], sep="\t") - fJunction = pd.read_csv(triple[2], sep="\t") + fSummary = pd.read_csv(triple[0], sep="\t", dtype=object) + fSequence = pd.read_csv(triple[1], sep="\t", dtype=object) + fJunction = pd.read_csv(triple[2], sep="\t", dtype=object) tmp = fSummary[["Sequence ID", "JUNCTION frame", "V-GENE and allele", "D-GENE and allele", "J-GENE and allele"]] @@ -110,11 +114,57 @@ outFrame = outFrame.append(tmp) + outFrame.columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', u'Functionality', 'V-REGION identity %', 'V-REGION identity nt', 'D-REGION reading frame', 'AA JUNCTION', 'Functionality comment', 'Sequence', 'FR1-IMGT', 'FR2-IMGT', 'FR3-IMGT', 'CDR3-IMGT', 'JUNCTION', 'J-REGION', 'FR4-IMGT', 'P3V-nt nb', 'N1-REGION-nt nb', 'P5D-nt nb', 'P3D-nt nb', 'N2-REGION-nt nb', 'P5J-nt nb', '3V-REGION trimmed-nt nb', '5D-REGION trimmed-nt nb', '3D-REGION trimmed-nt nb', '5J-REGION trimmed-nt nb'] -vPattern = re.compile(r"IGHV[1-9]-[0-9ab]+-?[1-9]?") -dPattern = re.compile(r"IGHD[1-9]-[0-9ab]+") -jPattern = re.compile(r"IGHJ[1-9]") +""" +IGHV[0-9]-[0-9ab]+-?[0-9]?D? +TRBV[0-9]{1,2}-?[0-9]?-?[123]? +IGKV[0-3]D?-[0-9]{1,2} +IGLV[0-9]-[0-9]{1,2} +TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])? +TRGV[234589] +TRDV[1-3] + +IGHD[0-9]-[0-9ab]+ +TRBD[12] +TRDD[1-3] + +IGHJ[1-6] +TRBJ[12]-[1-7] +IGKJ[1-5] +IGLJ[12367] +TRAJ[0-9]{1,2} +TRGJP?[12] +TRDJ[1-4] +""" + +vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?)", + r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)", + r"(IGKV[0-3]D?-[0-9]{1,2})", + r"(IGLV[0-9]-[0-9]{1,2})", + r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)", + r"(TRGV[234589])", + r"(TRDV[1-3])"] + +dPattern = [r"(IGHD[0-9]-[0-9ab]+)", + r"(TRBD[12])", + r"(TRDD[1-3])"] + +jPattern = [r"(IGHJ[1-6])", + r"(TRBJ[12]-[1-7])", + r"(IGKJ[1-5])", + r"(IGLJ[12367])", + r"(TRAJ[0-9]{1,2})", + r"(TRGJP?[12])", + r"(TRDJ[1-4])"] + +vPattern = re.compile(r"|".join(vPattern)) + +dPattern = re.compile(r"|".join(dPattern)) + +jPattern = re.compile(r"|".join(jPattern)) + def filterGenes(s, pattern): if type(s) is not str: @@ -125,11 +175,12 @@ return "NA" + outFrame["Top V Gene"] = outFrame["Top V Gene"].apply(lambda x: filterGenes(x, vPattern)) outFrame["Top D Gene"] = outFrame["Top D Gene"].apply(lambda x: filterGenes(x, dPattern)) outFrame["Top J Gene"] = outFrame["Top J Gene"].apply(lambda x: filterGenes(x, jPattern)) - +print outFrame tmp = outFrame["VDJ Frame"] tmp = tmp.replace("in-frame", "In-frame") @@ -138,5 +189,6 @@ outFrame["VDJ Frame"] = tmp outFrame["CDR3 Length DNA"] = outFrame["CDR3 Seq DNA"].map(str).map(len) safeLength = lambda x: len(x) if type(x) == str else 0 -outFrame = outFrame[(outFrame["CDR3 Seq DNA"].map(safeLength) > 0) & (outFrame["Top V Gene"] != "NA") & (outFrame["Top D Gene"] != "NA") & (outFrame["Top J Gene"] != "NA")] #filter out weird rows? +outFrame = outFrame[(outFrame["CDR3 Seq DNA"].map(safeLength) > 0) & (outFrame["Top V Gene"] != "NA") & (outFrame["Top J Gene"] != "NA")] #filter out weird rows? +#outFrame = outFrame[(outFrame["CDR3 Seq DNA"].map(safeLength) > 0) & (outFrame["Top V Gene"] != "NA") & (outFrame["Top D Gene"] != "NA") & (outFrame["Top J Gene"] != "NA")] #filter out weird rows? outFrame.to_csv(outFile, sep="\t", index=False, index_label="index") diff -r 0dafe2f9ceb8 -r 24d5d9120d93 imgtconvert.sh --- a/imgtconvert.sh Thu Sep 04 07:51:11 2014 -0400 +++ b/imgtconvert.sh Thu Sep 04 07:59:02 2014 -0400 @@ -1,6 +1,6 @@ #!/bin/bash dir="$(cd "$(dirname "$0")" && pwd)" -mkdir $PWD/$2_$3 +mkdir $PWD/files #!/bin/bash @@ -14,44 +14,44 @@ if [[ "$f" == *"$zip7Type"* ]]; then echo "7-zip" - echo "Trying: 7za e $1 -o$PWD/$2_$3/" - 7za e $1 -o$PWD/$2_$3/ + echo "Trying: 7za e $1 -o$PWD/files/" + 7za e $1 -o$PWD/files/ fi if [[ "$f" == *"$tarType"* ]] then echo "tar archive" - echo "Trying: tar xvf $1 -C $PWD/$2_$3/" - tar xvf $1 -C $PWD/$2_$3/ + echo "Trying: tar xvf $1 -C $PWD/files/" + tar xvf $1 -C $PWD/files/ fi if [[ "$f" == *"$bzip2Type"* ]] then echo "bzip2 compressed data" - echo "Trying: tar jxf $1 -C $PWD/$2_$3/" - tar jxf $1 -C $PWD/$2_$3/ + echo "Trying: tar jxf $1 -C $PWD/files/" + tar jxf $1 -C $PWD/files/ fi if [[ "$f" == *"$gzipType"* ]] then echo "gzip compressed data" - echo "Trying: tar xvzf $1 -C $PWD/$2_$3/" - tar xvzf $1 -C $PWD/$2_$3/ + echo "Trying: tar xvzf $1 -C $PWD/files/" + tar xvzf $1 -C $PWD/files/ fi if [[ "$f" == *"$zipType"* ]] then echo "Zip archive" - echo "Trying: unzip $1 -d $PWD/$2_$3/" - unzip $1 -d $PWD/$2_$3/ + echo "Trying: unzip $1 -d $PWD/files/" + unzip $1 -d $PWD/files/ > $PWD/unziplog.log fi if [[ "$f" == *"$rarType"* ]] then echo "RAR archive" - echo "Trying: unrar e $1 $PWD/$2_$3/" - unrar e $1 $PWD/$2_$3/ + echo "Trying: unrar e $1 $PWD/files/" + unrar e $1 $PWD/files/ fi -find $PWD/$2_$3/ -type f | grep -v "1_Summary_\|5_AA-sequences_\|6_Junction_" | xargs rm -f -python $dir/imgtconvert.py --input $PWD/$2_$3 --output $4 +find $PWD/files/ -type f | grep -v "1_Summary_\|5_AA-sequences_\|6_Junction_" | xargs rm -f +python $dir/imgtconvert.py --input $PWD/files --output $2