igblast_human: igblast_post.py @ 63:7997fea1f16a (draft, default, tip)
author:  davidvanzessen
date:    Thu, 06 Nov 2014 03:31:15 -0500
summary: Uploaded
import urllib2
import urllib
import httplib
import re
import sys
import time

#local test inputs; the last assignment is the one that is used
infile = "d:/wd/igblast_post/Stanford_S22.fasta"
infile = "d:/wd/igblast_post/Stanford_S22_Small.fasta"
infile = "d:/wd/igblast_post/Stanford_S22_Smallest.fasta"
outfile = "d:/wd/igblast_post/out.txt"

#needed or else response.read() fails on large results
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

#matches the "results pending" form so ctg_time and job_key can be extracted
formregex = re.compile(r"http:\/\/www\.ncbi\.nlm\.nih\.gov\/igblast\/igblast.cgi\?.*ctg_time=(\d+)&job_key=(.{28})")

#read the FASTA file: keep the raw text for the query and a per-ID sequence lookup
seq = ""
seq_dic = dict()
current_id = ""
current_seq = ""
with open(infile) as i:
    for line in i:
        seq += line
        if line.startswith(">"):
            new_id = line.replace(">", "").rstrip()
            if current_id != new_id:
                if current_seq != "":
                    seq_dic[current_id] = current_seq
                current_id = new_id
                current_seq = ""
        else:
            current_seq += line.rstrip()
if current_seq != "":
    seq_dic[current_id] = current_seq  #flush the last record, otherwise the final sequence is missing from seq_dic

seqcount = seq.count(">")
print "%i sequences" % (seqcount)

#submit all sequences to the NCBI igblast web form
url = "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi"
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
values = {"queryseq": seq,
          "germline_db_V": "IG_DB/imgt.Homo_sapiens.V.f.orf.p",
          "germline_db_D": "IG_DB/imgt.Homo_sapiens.D.f.orf",
          "germline_db_J": "IG_DB/imgt.Homo_sapiens.J.f.orf",
          "num_alignments_V": "1",
          "num_alignments_D": "1",
          "num_alignments_J": "1",
          "outfmt": "7",
          "domain": "imgt",
          "program": "blastn"}

ctg_time = ""
job_key = ""
check_url = ""

regions = ["FR1-IMGT", "CDR1-IMGT", "FR2-IMGT", "CDR2-IMGT", "FR3-IMGT", "CDR3-IMGT", "FR4-IMGT"]
region_loc = {region: i * len(regions) for i, region in enumerate(regions)}  #7 alignment summary fields per region
VDJ_loc = {loci: i * 11 for i, loci in enumerate("VDJ")}  #11 hit table fields per locus

data = urllib.urlencode(values)
req = urllib2.Request(url, data, headers)
response = urllib2.urlopen(req)
html = response.read()

#poll the job until the "results pending" form no longer appears on the page
re_match = formregex.search(html)
with open(outfile, 'w') as o:
    if re_match:
        ctg_time = re_match.group(1)
        job_key = re_match.group(2)
        o.write("NCBI igblast is processing...")
        o.write("ctg_time: %s" % (ctg_time))
        o.write("job_key: %s" % (job_key))
        check_url = "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?ctg_time=%s&job_key=%s" % (ctg_time, job_key)
        print "URL: %s" % check_url
        o.write("URL: %s" % check_url)
        total_time = 0
        while re_match:
            time.sleep(10)
            total_time += 10
            o.write("Waited %i seconds, checking progress..." % (total_time))
            req = urllib2.Request(check_url)
            response = urllib2.urlopen(req)
            html = response.read()
            if html.find("Job failed.") != -1:
                sys.exit("Job Failed")
            if html.find("Job is canceled.") != -1:
                sys.exit("Job canceled")
            re_match = formregex.search(html)
        else:  #runs once the loop ends without a break, i.e. the pending form is gone
            o.write("NCBI is done analysing, parsing result...")
    else:
        #results were returned immediately, only a job id is reported
        test = re.compile("Formatting Results: Job id = (.{28})")
        s = test.search(html)
        o.write("http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?job_key=" + s.group(1))
        print "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?job_key=%s" % s.group(1)

    #count the per-query blocks in the tabular output and re-request if sequences are missing
    html = html.split("\n")
    queries = [i for i, x in enumerate(html) if x.find("# Query:") != -1]
    o.write("%i results" % len(queries))
    retry_count = 1
    while seqcount != len(queries) and retry_count <= 3:
        #sys.stderr.write("Error, %i sequences in input and %i in result, happens sometimes, rerun?" % (seqcount, len(queries)))
        o.write("Error, %i sequences in input and %i in result, requesting results again. (%i)" % (seqcount, len(queries), retry_count))
        req = urllib2.Request(check_url)
        response = urllib2.urlopen(req)
        html = response.read()
        html = html.split("\n")
        queries = [i for i, x in enumerate(html) if x.find("# Query:") != -1]
        retry_count += 1
    if seqcount == len(queries):
        o.write("Success, continuing with analysis")

#build the header of the tab-separated output file
header = "ID,Top V Gene,Top D Gene,Top J Gene,Chain type,stop codon,VDJ Frame,Productive,Strand,V end,V-D junction,D region,D-J junction,J start".split(",")
alignments_summ_headers = "from,to,length,matches,mismatches,gaps,percent identity".split(",")
for region in regions:
    header += ["%s %s" % (region, x) for x in alignments_summ_headers]
header += ["%s seq" % region for region in regions]
hit_table_headers = "% identity,alignment length,mismatches,gap opens,gaps,q. start,q. end,s.start,s. end,evalue,bit score".split(",")
for g in ["V", "D", "J"]:
    header += ["%s %s" % (g, x) for x in hit_table_headers]

#reopen the output file, overwriting the progress log with the parsed result table
with open(outfile, 'w') as o:
    o.write("\t".join(header) + "\n")
    for i in range(len(queries)):  #one result block per "# Query:" line
        frm = queries[i]
        to = 0
        if len(queries) == i + 1:
            to = len(html)
        else:
            to = queries[i + 1] - 1
        IDLine = queries[i]
        ID = html[IDLine].replace("# Query: ", "")
        if html[IDLine + 3] == "# 0 hits found":
            print "No match on", ID
            continue
        result_row = [ID]
        currentLine = IDLine + 5
        #V-(D)-J rearrangement summary
        if html[currentLine - 1].find("Top D gene") == -1:  #sometimes ncbi leaves out the D gene, not even a N/A...
            split_row = html[currentLine].split("\t")
            result_row += [split_row[0]]
            result_row += ["N/A"]
            result_row += split_row[1:]
        else:
            result_row += html[currentLine].split("\t")
        currentLine += 3
        #V-(D)-J junction details
        result_row += html[currentLine].split("\t")[:-1]  #[:-1] because the igblast tabular output $#@* sucks
        currentLine += 2
        #Alignment summary
        al_summ = ["N/A"] * (7 * len(regions))
        al_seqs = {region: "N/A" for region in regions}
        if html[currentLine].startswith("# Alignment summary"):  #Alignment summary might not be included, add N/A if not, otherwise parse it
            for region in regions:
                currentLine += 1
                if html[currentLine].startswith("# Hit table"):
                    break
                splitrow = html[currentLine].split("\t")
                current_region = splitrow[0].replace(" (germline)", "")
                if current_region in regions:
                    al_summ[region_loc[current_region]:region_loc[current_region] + len(regions)] = splitrow[1:]
                    al_seqs[current_region] = seq_dic[ID][int(splitrow[1]) - 1:int(splitrow[2]) - 1]
        result_row += al_summ
        result_row += [al_seqs[region] for region in regions]
        #Hit table: the best V, D and J hits
        VDJ_info = ["N/A"] * (11 * len(VDJ_loc.keys()))
        if html[currentLine].startswith("# Hit table"):
            currentLine += 3
            while html[currentLine] != "":
                splitrow = html[currentLine].split("\t")
                loci = splitrow[0]
                VDJ_info[VDJ_loc[loci]:VDJ_loc[loci] + 11] = splitrow[3:]
                currentLine += 1
        result_row += VDJ_info
        o.write("\t".join(result_row) + "\n")
        #print V, D, J, frame, strand