changeset 63:7997fea1f16a draft default tip

Uploaded
author davidvanzessen
date Thu, 06 Nov 2014 03:31:15 -0500
parents a700a552f031
children
files database.zip databasenew.zip igblast_post.py igblastn.sh igblastn.xml
diffstat 5 files changed, 254 insertions(+), 11 deletions(-) [+]
line wrap: on
line diff
Binary file database.zip has changed
Binary file databasenew.zip has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/igblast_post.py	Thu Nov 06 03:31:15 2014 -0500
@@ -0,0 +1,184 @@
+import urllib2
+import urllib
+import httplib
+import re
+import sys
+import time
+import httplib
+
+infile = "d:/wd/igblast_post/Stanford_S22.fasta"
+infile = "d:/wd/igblast_post/Stanford_S22_Small.fasta"
+infile = "d:/wd/igblast_post/Stanford_S22_Smallest.fasta"
+outfile = "d:/wd/igblast_post/out.txt"
+
+#needed or else response.read() fails on large results
+httplib.HTTPConnection._http_vsn = 10
+httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
+
+formregex = re.compile("http:\/\/www\.ncbi\.nlm\.nih\.gov\/igblast\/igblast.cgi\?.*ctg_time=(\d+)&job_key=(.{28})")
+
+seq = ""
+seq_dic = dict()
+
+current_id = ""
+current_seq = ""
+
+with open(infile) as i:
+  for line in i:
+    seq += line
+    if line.startswith(">"):
+      new_id = line.replace(">", "").rstrip()
+      if current_id != new_id:
+        if current_seq != "":
+          seq_dic[current_id] = current_seq
+        current_id = new_id
+        current_seq = ""
+    else:
+      current_seq += line.rstrip()
+
+seqcount = seq.count(">")
+print "%i sequences" % (seqcount)
+url = "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi"
+headers = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
+values = {"queryseq": seq,
+          "germline_db_V": "IG_DB/imgt.Homo_sapiens.V.f.orf.p",
+          "germline_db_D": "IG_DB/imgt.Homo_sapiens.D.f.orf",
+          "germline_db_J": "IG_DB/imgt.Homo_sapiens.J.f.orf",
+          "num_alignments_V": "1",
+          "num_alignments_D": "1",
+          "num_alignments_J": "1",
+          "outfmt": "7",
+          "domain": "imgt",
+          "program": "blastn"}
+
+ctg_time = ""
+job_key = ""
+check_url = ""
+regions = ["FR1-IMGT", "CDR1-IMGT", "FR2-IMGT","CDR2-IMGT","FR3-IMGT","CDR3-IMGT","FR4-IMGT"]
+region_loc = {region: i*len(regions) for i,region in enumerate(regions)}
+VDJ_loc = {loci: i*11 for i,loci in enumerate("VDJ")}
+data = urllib.urlencode(values)
+req = urllib2.Request(url, data, headers)
+response = urllib2.urlopen(req)
+html = response.read()
+re_match = formregex.search(html)
+with open(outfile, 'w') as o:
+  if re_match:
+    ctg_time = re_match.group(1)
+    job_key = re_match.group(2)
+    o.write("NCBI igblast is processing...")
+    o.write("ctg_time: %s" % (ctg_time))
+    o.write("job_key: %s" % (job_key))
+    check_url = "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?ctg_time=%s&job_key=%s" % (ctg_time, job_key)
+    print "URL: %s" % check_url
+    o.write("URL: %s" % check_url)
+    total_time = 0
+    while re_match:
+      time.sleep(10)
+      total_time += 10
+      o.write("Waited %i seconds, checking progress..." % (total_time))
+      req = urllib2.Request(check_url)
+      response = urllib2.urlopen(req)
+      html = response.read()
+      if html.find("Job failed.") != -1:
+        sys.exit("Job Failed")
+      if html.find("Job is canceled.") != -1:
+        sys.exit("Job canceled")
+      re_match = formregex.search(html)
+    else:
+      o.write("NCBI is done analysing, parsing result...")
+  else:
+    test = re.compile("Formatting Results: Job id = (.{28})")
+    s = test.search(html)
+    o.write("http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?job_key=" + s.group(1))
+    print "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?job_key=%s" % s.group(1)
+
+  html = html.split("\n")
+
+  queries = [i for i,x in enumerate(html) if x.find("# Query:") != -1]
+  o.write("%i results" % len(queries))
+  retry_count = 1
+  while seqcount != len(queries) and retry_count <= 3:
+    #sys.stderr.write("Error, %i sequences in input and %i in result, happens sometimes, rerun?" % (seqcount, len(queries)))
+    o.write("Error, %i sequences in input and %i in result, requesting results again. (%i)" % (seqcount, len(queries), retry_count))
+    req = urllib2.Request(check_url)
+    response = urllib2.urlopen(req)
+    html = response.read()
+    html = html.split("\n")
+    queries = [i for i,x in enumerate(html) if x.find("# Query:") != -1]
+    retry_count += 1
+    if seqcount == len(queries):
+      o.write("Success, continuing with analysis")
+    
+    
+    
+
+
+header = "ID,Top V Gene,Top D Gene,Top J Gene,Chain type,stop codon,VDJ Frame,Productive,Strand,V end,V-D junction,D region,D-J junction,J start".split(",")
+alignments_summ_headers = "from,to,length,matches,mismatches,gaps,percent identity".split(",")
+for region in regions:
+  header += ["%s %s" % (region, x) for x in alignments_summ_headers]
+header += ["%s seq" % region for region in regions]
+hit_table_headers = "% identity,alignment length,mismatches,gap opens,gaps,q. start,q. end,s.start,s. end,evalue,bit score".split(",")
+for g in ["V","D","J"]:
+  header += ["%s %s" % (g, x) for x in hit_table_headers]
+
+with open(outfile, 'w') as o:
+  o.write("\t".join(header) + "\n")
+  for i in range(len(queries)-1):
+    frm = queries[i]
+    to = 0
+    if len(queries) == i+1:
+      to = len(html)
+    else:
+      to = queries[i+1]-1
+    IDLine = queries[i]
+    
+    ID = html[IDLine].replace("# Query: ", "")
+
+    if html[IDLine + 3] == "# 0 hits found":
+      print "No match on", ID
+      continue
+    result_row = [ID]
+    currentLine = IDLine + 5 #V-(D)-J rearrangement summary
+    if html[currentLine-1].find("Top D gene") == -1: #sometimes ncbi leaves out the D gene, not even a N/A...
+      split_row = html[currentLine].split("\t")
+      result_row += [split_row[0]]
+      result_row += ["N/A"]
+      result_row += split_row[1:]
+    else:
+      result_row += html[currentLine].split("\t")
+    currentLine += 3 #V-(D)-J junction details
+    result_row += html[currentLine].split("\t")[:-1] #[:-1] because the igblast tabular output $#@* sucks
+    currentLine += 2 #Alignment summary
+    al_summ = ["N/A"] * (7 * len(regions))
+    al_seqs = {region: "N/A" for region in regions}
+    if html[currentLine].startswith("# Alignment summary"): #Alignment summary might not be included, add N/A if not, otherwise parse it
+      for region in regions:
+        currentLine += 1
+        if html[currentLine].startswith("# Hit table"):
+          break
+        splitrow = html[currentLine].split("\t")
+        current_region = splitrow[0].replace(" (germline)", "")
+        if current_region in regions:
+          al_summ[region_loc[current_region]:region_loc[current_region] + len(regions)] = splitrow[1:]
+          al_seqs[current_region] = seq_dic[ID][int(splitrow[1])-1:int(splitrow[2])-1]
+    result_row += al_summ
+    result_row += [al_seqs[region] for region in regions]
+    VDJ_info = ["N/A"] * (11 * len(VDJ_loc.keys()))
+    if html[currentLine].startswith("# Hit table"):
+      currentLine += 3
+      while html[currentLine] != "":
+        splitrow = html[currentLine].split("\t")
+        loci = splitrow[0]
+        VDJ_info[VDJ_loc[loci]:VDJ_loc[loci] + 11] = splitrow[3:]
+        currentLine +=1
+    result_row += VDJ_info
+
+    o.write("\t".join(result_row) + "\n")
+    
+
+#print V, D, J, frame, strand
+
+
+
--- a/igblastn.sh	Wed Aug 20 10:04:09 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,7 +0,0 @@
-dir="$(cd "$(dirname "$0")" && pwd)"
-mkdir $PWD/igblastdatabase
-unzip $dir/database.zip -d $PWD/igblastdatabase/
-
-export IGDATA=$PWD/igblastdatabase/
-
-/home/galaxy/galaxy/igblast/igblastn -germline_db_V $PWD/igblastdatabase/database/human_gl_V -germline_db_J $PWD/igblastdatabase/database/human_gl_J -germline_db_D $PWD/igblastdatabase/database/human_gl_D -domain_system imgt -query $1 -auxiliary_data $PWD/igblastdatabase/optional_file/human_gl.aux -show_translation -outfmt 3 > $2
--- a/igblastn.xml	Wed Aug 20 10:04:09 2014 -0400
+++ b/igblastn.xml	Thu Nov 06 03:31:15 2014 -0500
@@ -1,16 +1,82 @@
 <tool id="igblastn" name="igBLASTn" version="0.1.0">
     <description> </description>
-    <command interpreter="bash">
-		igblastn.sh $input $output
+    <command interpreter="python">
+		igblast_post.py --input $input --output $output
     </command>
     <inputs>
         <param name="input" type="data" format="fasta" label="Fasta file"/>
     </inputs>
     <outputs>
-        <data name="output" format="text" label="${input.name}-igBLASTn Report"/>
+        <data name="output" format="tabular" type="data" label="${input.name}-igBLASTn Report"/>
 	<!--<data name="log" format="text" label="log"/>-->
     </outputs>
+	<requirements>
+		<requirement type="package" version="1.0.0">igBlastn</requirement>
+	</requirements>
     <help>
-        Step 1 of the Immune Repertoire tools, generates a report to be parsed with the "igparse" tool.
+============
+iReport
+============
+
+This tool uses the online igBLAST website hosted by NCBI to blast a FASTA file, it retrieves the result and generates a convenient tabular format for further processing.
+
+**NOTE**
+
+.. class:: warningmark
+
+- Everything goes through the servers of NCBI, so if you have sensitive data that that isn't allowed to leave your local network, this isn't the tool the use.
+
+**USAGE**
+
+.. class:: infomark
+
+- This tool uses a free service provided by NCBI, and although there doesn't seem to be any restrictions on usage, avoid unnecessary usage to lighten the load on NCBI's servers.
+
+
+**INPUT**
+
+This tool accepts FASTA files as input:
+
+::
+
+		>lcl|FLN1FA002RWEZA.1| 
+		ggctggagtgggtttcatacattagtagtaatagtggtgccatatactacgcagactctgtgaagggccgattcaccatc
+		tccagaaacaatgccaaggactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgc
+		gagagcgatcccccggtattactatgatactagtggcccaaacgactactggggccagggaaccctggtcaccgtctcct
+		cag
+		>lcl|FLN1FA001BLION.1| 
+		aggcttgagtggatgggatggatcaacgctggcaatggtaacacaaaatattcacagaagttccagggcagagtcaccat
+		taccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaagacacggctgtgtattactgtg
+		cgagagtgggcagcagctggtctgatgcttttgattatctggggccaagggacaatggtcaccgtctcctcag
+
+**OUTPUT**
+
+The following data is used for ARGalaxy
+
++-----------------+----------------------------------------------+
+| Column name     | Column contents                              |
++-----------------+----------------------------------------------+
+| ID              | The Sequence ID provided by the sequencer.   |
++-----------------+----------------------------------------------+
+| VDJ Frame       | In-frame/Out-frame                           |
++-----------------+----------------------------------------------+
+| Top V Gene      | The best matching V gene found.              |
++-----------------+----------------------------------------------+
+| Top D Gene      | The best matching D gene found.              |
++-----------------+----------------------------------------------+
+| Top J Gene      | The best matching J gene found.              |
++-----------------+----------------------------------------------+
+| CDR3 Seq        | The CDR3 region.                             |
++-----------------+----------------------------------------------+
+| CDR3 Length     | The length of the CDR3 region.               |
++-----------------+----------------------------------------------+
+| CDR3 Seq DNA    | The CDR3 sequence region.                    |
++-----------------+----------------------------------------------+
+| CDR3 Length DNA | The length of the CDR3 sequence region.      |
++-----------------+----------------------------------------------+
+| Functionality   | If sequence is productive/unproductive       |
++-----------------+----------------------------------------------+
+
+
     </help>
 </tool>