# igblast_post.py (changeset 63:7997fea1f16a)
# author: davidvanzessen, 2014-11-06
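# Submits a FASTA file to the NCBI IgBLAST web CGI, polls until the job finishes,
# and flattens the tabular (outfmt 7) result into one tab-separated row per query.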

import urllib2
import urllib
import httplib
import re
import sys
import time

infile = "d:/wd/igblast_post/Stanford_S22.fasta"
infile = "d:/wd/igblast_post/Stanford_S22_Small.fasta"
infile = "d:/wd/igblast_post/Stanford_S22_Smallest.fasta"
outfile = "d:/wd/igblast_post/out.txt"

# Force HTTP/1.0 for httplib connections; without this, response.read() fails on large results
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

# Matches the "job in progress" URL on the returned page, capturing ctg_time and the 28-character job_key
formregex = re.compile(r"http://www\.ncbi\.nlm\.nih\.gov/igblast/igblast\.cgi\?.*ctg_time=(\d+)&job_key=(.{28})")

seq = ""          # raw FASTA text, sent verbatim as the query
seq_dic = dict()  # sequence id -> full sequence, used later to cut out region substrings

current_id = ""
current_seq = ""

# Read the FASTA file: keep the full text for the POST body and build seq_dic per record
with open(infile) as i:
  for line in i:
    seq += line
    if line.startswith(">"):
      new_id = line.replace(">", "").rstrip()
      if current_id != new_id:
        if current_seq != "":
          seq_dic[current_id] = current_seq
        current_id = new_id
        current_seq = ""
    else:
      current_seq += line.rstrip()
if current_seq != "":  # store the final record, which the loop itself never flushes
  seq_dic[current_id] = current_seq

seqcount = seq.count(">")
print "%i sequences" % (seqcount)
url = "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi"
headers = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
values = {"queryseq": seq,
          "germline_db_V": "IG_DB/imgt.Homo_sapiens.V.f.orf.p",
          "germline_db_D": "IG_DB/imgt.Homo_sapiens.D.f.orf",
          "germline_db_J": "IG_DB/imgt.Homo_sapiens.J.f.orf",
          "num_alignments_V": "1",
          "num_alignments_D": "1",
          "num_alignments_J": "1",
          "outfmt": "7",
          "domain": "imgt",
          "program": "blastn"}

ctg_time = ""
job_key = ""
check_url = ""
regions = ["FR1-IMGT", "CDR1-IMGT", "FR2-IMGT", "CDR2-IMGT", "FR3-IMGT", "CDR3-IMGT", "FR4-IMGT"]
# Offsets into the flat output row: 7 alignment-summary fields per region, 11 hit-table fields per V/D/J segment
region_loc = {region: i*7 for i,region in enumerate(regions)}
VDJ_loc = {loci: i*11 for i,loci in enumerate("VDJ")}
data = urllib.urlencode(values)
req = urllib2.Request(url, data, headers)
response = urllib2.urlopen(req)
html = response.read()
re_match = formregex.search(html)
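# The returned page either contains the "in progress" form (poll until it disappears)
# or reports the job id directly; progress messages are written to the output file for now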
with open(outfile, 'w') as o:
  if re_match:
    ctg_time = re_match.group(1)
    job_key = re_match.group(2)
    o.write("NCBI igblast is processing...")
    o.write("ctg_time: %s" % (ctg_time))
    o.write("job_key: %s" % (job_key))
    check_url = "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?ctg_time=%s&job_key=%s" % (ctg_time, job_key)
    print "URL: %s" % check_url
    o.write("URL: %s" % check_url)
    total_time = 0
    while re_match:
      time.sleep(10)
      total_time += 10
      o.write("Waited %i seconds, checking progress..." % (total_time))
      req = urllib2.Request(check_url)
      response = urllib2.urlopen(req)
      html = response.read()
      if html.find("Job failed.") != -1:
        sys.exit("Job Failed")
      if html.find("Job is canceled.") != -1:
        sys.exit("Job canceled")
      re_match = formregex.search(html)
    else:
      # while/else: reached once the "in progress" form no longer appears on the page
      o.write("NCBI is done analysing, parsing result...\n")
  else:
    # No "in progress" form: the page reports the job id directly
    test = re.compile("Formatting Results: Job id = (.{28})")
    s = test.search(html)
    check_url = "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?job_key=" + s.group(1)
    o.write(check_url + "\n")
    print check_url

  html = html.split("\n")

  # Each per-sequence result block begins with a "# Query:" line
  queries = [i for i,x in enumerate(html) if x.find("# Query:") != -1]
  o.write("%i results\n" % len(queries))
  retry_count = 1
  # NCBI sometimes returns fewer result blocks than sequences submitted; re-request up to 3 times
  while seqcount != len(queries) and retry_count <= 3:
    o.write("Error, %i sequences in input and %i in result, requesting results again. (%i)\n" % (seqcount, len(queries), retry_count))
    req = urllib2.Request(check_url)
    response = urllib2.urlopen(req)
    html = response.read()
    html = html.split("\n")
    queries = [i for i,x in enumerate(html) if x.find("# Query:") != -1]
    retry_count += 1
    if seqcount == len(queries):
      o.write("Success, continuing with analysis\n")

header = "ID,Top V Gene,Top D Gene,Top J Gene,Chain type,stop codon,VDJ Frame,Productive,Strand,V end,V-D junction,D region,D-J junction,J start".split(",")
alignments_summ_headers = "from,to,length,matches,mismatches,gaps,percent identity".split(",")
for region in regions:
  header += ["%s %s" % (region, x) for x in alignments_summ_headers]
header += ["%s seq" % region for region in regions]
hit_table_headers = "% identity,alignment length,mismatches,gap opens,gaps,q. start,q. end,s.start,s. end,evalue,bit score".split(",")
for g in ["V","D","J"]:
  header += ["%s %s" % (g, x) for x in hit_table_headers]

# Re-open the output file (truncating the progress messages above) and write the final table
with open(outfile, 'w') as o:
  o.write("\t".join(header) + "\n")
  # Parse each query's result block
  for i in range(len(queries)):  # range(len(queries)-1) would silently skip the last sequence
    IDLine = queries[i]
    ID = html[IDLine].replace("# Query: ", "")

    if html[IDLine + 3] == "# 0 hits found":
      print "No match on", ID
      continue
    result_row = [ID]
    currentLine = IDLine + 5 # V-(D)-J rearrangement summary line
    if html[currentLine-1].find("Top D gene") == -1: # NCBI sometimes omits the Top D gene column entirely (no N/A placeholder)
      split_row = html[currentLine].split("\t")
      result_row += [split_row[0]]
      result_row += ["N/A"]
      result_row += split_row[1:]
    else:
      result_row += html[currentLine].split("\t")
    currentLine += 3 # V-(D)-J junction details line
    result_row += html[currentLine].split("\t")[:-1] # [:-1] drops the empty field left by the trailing tab on this line
    currentLine += 2 # Alignment summary section
    al_summ = ["N/A"] * (7 * len(regions)) # 7 summary fields per region
    al_seqs = {region: "N/A" for region in regions}
    if html[currentLine].startswith("# Alignment summary"): # the alignment summary may be absent; keep the N/A placeholders if so
      for region in regions:
        currentLine += 1
        if html[currentLine].startswith("# Hit table"):
          break
        splitrow = html[currentLine].split("\t")
        current_region = splitrow[0].replace(" (germline)", "")
        if current_region in regions:
          al_summ[region_loc[current_region]:region_loc[current_region] + 7] = splitrow[1:]
          # from/to coordinates are 1-based and inclusive, so slice up to int(splitrow[2])
          al_seqs[current_region] = seq_dic[ID][int(splitrow[1])-1:int(splitrow[2])]
    result_row += al_summ
    result_row += [al_seqs[region] for region in regions]
    VDJ_info = ["N/A"] * (11 * len(VDJ_loc.keys()))
    if html[currentLine].startswith("# Hit table"):
      currentLine += 3
      while html[currentLine] != "":
        splitrow = html[currentLine].split("\t")
        loci = splitrow[0]
        VDJ_info[VDJ_loc[loci]:VDJ_loc[loci] + 11] = splitrow[3:]
        currentLine +=1
    result_row += VDJ_info

    o.write("\t".join(result_row) + "\n")
    
