Mercurial > repos > davidvanzessen > igblast_human
changeset 63:7997fea1f16a draft default tip
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 06 Nov 2014 03:31:15 -0500 |
parents | a700a552f031 |
children | |
files | database.zip databasenew.zip igblast_post.py igblastn.sh igblastn.xml |
diffstat | 5 files changed, 254 insertions(+), 11 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/igblast_post.py Thu Nov 06 03:31:15 2014 -0500 @@ -0,0 +1,184 @@ +import urllib2 +import urllib +import httplib +import re +import sys +import time +import httplib + +infile = "d:/wd/igblast_post/Stanford_S22.fasta" +infile = "d:/wd/igblast_post/Stanford_S22_Small.fasta" +infile = "d:/wd/igblast_post/Stanford_S22_Smallest.fasta" +outfile = "d:/wd/igblast_post/out.txt" + +#needed or else response.read() fails on large results +httplib.HTTPConnection._http_vsn = 10 +httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0' + +formregex = re.compile("http:\/\/www\.ncbi\.nlm\.nih\.gov\/igblast\/igblast.cgi\?.*ctg_time=(\d+)&job_key=(.{28})") + +seq = "" +seq_dic = dict() + +current_id = "" +current_seq = "" + +with open(infile) as i: + for line in i: + seq += line + if line.startswith(">"): + new_id = line.replace(">", "").rstrip() + if current_id != new_id: + if current_seq != "": + seq_dic[current_id] = current_seq + current_id = new_id + current_seq = "" + else: + current_seq += line.rstrip() + +seqcount = seq.count(">") +print "%i sequences" % (seqcount) +url = "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi" +headers = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' } +values = {"queryseq": seq, + "germline_db_V": "IG_DB/imgt.Homo_sapiens.V.f.orf.p", + "germline_db_D": "IG_DB/imgt.Homo_sapiens.D.f.orf", + "germline_db_J": "IG_DB/imgt.Homo_sapiens.J.f.orf", + "num_alignments_V": "1", + "num_alignments_D": "1", + "num_alignments_J": "1", + "outfmt": "7", + "domain": "imgt", + "program": "blastn"} + +ctg_time = "" +job_key = "" +check_url = "" +regions = ["FR1-IMGT", "CDR1-IMGT", "FR2-IMGT","CDR2-IMGT","FR3-IMGT","CDR3-IMGT","FR4-IMGT"] +region_loc = {region: i*len(regions) for i,region in enumerate(regions)} +VDJ_loc = {loci: i*11 for i,loci in enumerate("VDJ")} +data = urllib.urlencode(values) +req = urllib2.Request(url, data, headers) +response = urllib2.urlopen(req) +html = response.read() +re_match = formregex.search(html) +with open(outfile, 'w') as o: + if re_match: + ctg_time = re_match.group(1) + job_key = re_match.group(2) + o.write("NCBI igblast is processing...") + o.write("ctg_time: %s" % (ctg_time)) + o.write("job_key: %s" % (job_key)) + check_url = "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?ctg_time=%s&job_key=%s" % (ctg_time, job_key) + print "URL: %s" % check_url + o.write("URL: %s" % check_url) + total_time = 0 + while re_match: + time.sleep(10) + total_time += 10 + o.write("Waited %i seconds, checking progress..." % (total_time)) + req = urllib2.Request(check_url) + response = urllib2.urlopen(req) + html = response.read() + if html.find("Job failed.") != -1: + sys.exit("Job Failed") + if html.find("Job is canceled.") != -1: + sys.exit("Job canceled") + re_match = formregex.search(html) + else: + o.write("NCBI is done analysing, parsing result...") + else: + test = re.compile("Formatting Results: Job id = (.{28})") + s = test.search(html) + o.write("http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?job_key=" + s.group(1)) + print "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?job_key=%s" % s.group(1) + + html = html.split("\n") + + queries = [i for i,x in enumerate(html) if x.find("# Query:") != -1] + o.write("%i results" % len(queries)) + retry_count = 1 + while seqcount != len(queries) and retry_count <= 3: + #sys.stderr.write("Error, %i sequences in input and %i in result, happens sometimes, rerun?" % (seqcount, len(queries))) + o.write("Error, %i sequences in input and %i in result, requesting results again. (%i)" % (seqcount, len(queries), retry_count)) + req = urllib2.Request(check_url) + response = urllib2.urlopen(req) + html = response.read() + html = html.split("\n") + queries = [i for i,x in enumerate(html) if x.find("# Query:") != -1] + retry_count += 1 + if seqcount == len(queries): + o.write("Success, continuing with analysis") + + + + + +header = "ID,Top V Gene,Top D Gene,Top J Gene,Chain type,stop codon,VDJ Frame,Productive,Strand,V end,V-D junction,D region,D-J junction,J start".split(",") +alignments_summ_headers = "from,to,length,matches,mismatches,gaps,percent identity".split(",") +for region in regions: + header += ["%s %s" % (region, x) for x in alignments_summ_headers] +header += ["%s seq" % region for region in regions] +hit_table_headers = "% identity,alignment length,mismatches,gap opens,gaps,q. start,q. end,s.start,s. end,evalue,bit score".split(",") +for g in ["V","D","J"]: + header += ["%s %s" % (g, x) for x in hit_table_headers] + +with open(outfile, 'w') as o: + o.write("\t".join(header) + "\n") + for i in range(len(queries)-1): + frm = queries[i] + to = 0 + if len(queries) == i+1: + to = len(html) + else: + to = queries[i+1]-1 + IDLine = queries[i] + + ID = html[IDLine].replace("# Query: ", "") + + if html[IDLine + 3] == "# 0 hits found": + print "No match on", ID + continue + result_row = [ID] + currentLine = IDLine + 5 #V-(D)-J rearrangement summary + if html[currentLine-1].find("Top D gene") == -1: #sometimes ncbi leaves out the D gene, not even a N/A... + split_row = html[currentLine].split("\t") + result_row += [split_row[0]] + result_row += ["N/A"] + result_row += split_row[1:] + else: + result_row += html[currentLine].split("\t") + currentLine += 3 #V-(D)-J junction details + result_row += html[currentLine].split("\t")[:-1] #[:-1] because the igblast tabular output $#@* sucks + currentLine += 2 #Alignment summary + al_summ = ["N/A"] * (7 * len(regions)) + al_seqs = {region: "N/A" for region in regions} + if html[currentLine].startswith("# Alignment summary"): #Alignment summary might not be included, add N/A if not, otherwise parse it + for region in regions: + currentLine += 1 + if html[currentLine].startswith("# Hit table"): + break + splitrow = html[currentLine].split("\t") + current_region = splitrow[0].replace(" (germline)", "") + if current_region in regions: + al_summ[region_loc[current_region]:region_loc[current_region] + len(regions)] = splitrow[1:] + al_seqs[current_region] = seq_dic[ID][int(splitrow[1])-1:int(splitrow[2])-1] + result_row += al_summ + result_row += [al_seqs[region] for region in regions] + VDJ_info = ["N/A"] * (11 * len(VDJ_loc.keys())) + if html[currentLine].startswith("# Hit table"): + currentLine += 3 + while html[currentLine] != "": + splitrow = html[currentLine].split("\t") + loci = splitrow[0] + VDJ_info[VDJ_loc[loci]:VDJ_loc[loci] + 11] = splitrow[3:] + currentLine +=1 + result_row += VDJ_info + + o.write("\t".join(result_row) + "\n") + + +#print V, D, J, frame, strand + + +
--- a/igblastn.sh Wed Aug 20 10:04:09 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -dir="$(cd "$(dirname "$0")" && pwd)" -mkdir $PWD/igblastdatabase -unzip $dir/database.zip -d $PWD/igblastdatabase/ - -export IGDATA=$PWD/igblastdatabase/ - -/home/galaxy/galaxy/igblast/igblastn -germline_db_V $PWD/igblastdatabase/database/human_gl_V -germline_db_J $PWD/igblastdatabase/database/human_gl_J -germline_db_D $PWD/igblastdatabase/database/human_gl_D -domain_system imgt -query $1 -auxiliary_data $PWD/igblastdatabase/optional_file/human_gl.aux -show_translation -outfmt 3 > $2
--- a/igblastn.xml Wed Aug 20 10:04:09 2014 -0400 +++ b/igblastn.xml Thu Nov 06 03:31:15 2014 -0500 @@ -1,16 +1,82 @@ <tool id="igblastn" name="igBLASTn" version="0.1.0"> <description> </description> - <command interpreter="bash"> - igblastn.sh $input $output + <command interpreter="python"> + igblast_post.py --input $input --output $output </command> <inputs> <param name="input" type="data" format="fasta" label="Fasta file"/> </inputs> <outputs> - <data name="output" format="text" label="${input.name}-igBLASTn Report"/> + <data name="output" format="tabular" type="data" label="${input.name}-igBLASTn Report"/> <!--<data name="log" format="text" label="log"/>--> </outputs> + <requirements> + <requirement type="package" version="1.0.0">igBlastn</requirement> + </requirements> <help> - Step 1 of the Immune Repertoire tools, generates a report to be parsed with the "igparse" tool. +============ +iReport +============ + +This tool uses the online igBLAST website hosted by NCBI to blast a FASTA file, it retrieves the result and generates a convenient tabular format for further processing. + +**NOTE** + +.. class:: warningmark + +- Everything goes through the servers of NCBI, so if you have sensitive data that that isn't allowed to leave your local network, this isn't the tool the use. + +**USAGE** + +.. class:: infomark + +- This tool uses a free service provided by NCBI, and although there doesn't seem to be any restrictions on usage, avoid unnecessary usage to lighten the load on NCBI's servers. + + +**INPUT** + +This tool accepts FASTA files as input: + +:: + + >lcl|FLN1FA002RWEZA.1| + ggctggagtgggtttcatacattagtagtaatagtggtgccatatactacgcagactctgtgaagggccgattcaccatc + tccagaaacaatgccaaggactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgc + gagagcgatcccccggtattactatgatactagtggcccaaacgactactggggccagggaaccctggtcaccgtctcct + cag + >lcl|FLN1FA001BLION.1| + aggcttgagtggatgggatggatcaacgctggcaatggtaacacaaaatattcacagaagttccagggcagagtcaccat + taccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaagacacggctgtgtattactgtg + cgagagtgggcagcagctggtctgatgcttttgattatctggggccaagggacaatggtcaccgtctcctcag + +**OUTPUT** + +The following data is used for ARGalaxy + ++-----------------+----------------------------------------------+ +| Column name | Column contents | ++-----------------+----------------------------------------------+ +| ID | The Sequence ID provided by the sequencer. | ++-----------------+----------------------------------------------+ +| VDJ Frame | In-frame/Out-frame | ++-----------------+----------------------------------------------+ +| Top V Gene | The best matching V gene found. | ++-----------------+----------------------------------------------+ +| Top D Gene | The best matching D gene found. | ++-----------------+----------------------------------------------+ +| Top J Gene | The best matching J gene found. | ++-----------------+----------------------------------------------+ +| CDR3 Seq | The CDR3 region. | ++-----------------+----------------------------------------------+ +| CDR3 Length | The length of the CDR3 region. | ++-----------------+----------------------------------------------+ +| CDR3 Seq DNA | The CDR3 sequence region. | ++-----------------+----------------------------------------------+ +| CDR3 Length DNA | The length of the CDR3 sequence region. | ++-----------------+----------------------------------------------+ +| Functionality | If sequence is productive/unproductive | ++-----------------+----------------------------------------------+ + + </help> </tool>