# HG changeset patch
# User devteam
# Date 1390498314 18000
# Node ID 1358db75fd49289d54233880f6ddb65d128633aa
Imported from capsule None
diff -r 000000000000 -r 1358db75fd49 gi2taxonomy.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gi2taxonomy.py Thu Jan 23 12:31:54 2014 -0500
@@ -0,0 +1,174 @@
+import sys
+import string
+import tempfile
+import subprocess
+from os import path
+
+# -----------------------------------------------------------------------------------
+
+def stop_err(msg):
+ sys.stderr.write(msg)
+ sys.exit()
+
+# -----------------------------------------------------------------------------------
+def gi_name_to_sorted_list(file_name, gi_col, name_col):
+ """ Suppose input file looks like this:
+ a 2
+ b 4
+ c 5
+ d 5
+ where column 1 is gi_col and column 0 is name_col
+ output of this function will look like this:
+ [[2, 'a'], [4, 'b'], [5, 'c'], [5, 'd']]
+ """
+
+ result = []
+ try:
+ F = open( file_name, 'r' )
+ try:
+ for line in F:
+ file_cols = string.split(line.rstrip(), '\t')
+ file_cols[gi_col] = int( file_cols[gi_col] )
+ result.append( [ file_cols[gi_col], file_cols[name_col] ] )
+ except:
+ print >>sys.stderr, 'Non numeric GI field...skipping'
+
+ except Exception, e:
+ stop_err('%s\n' % e)
+ F.close()
+ result.sort()
+ return result
+
+# -----------------------------------------------------------------------------------
+
+def collapse_repeating_gis( L ):
+ """ Accepts 2-d array of gi-key pairs such as this
+ L = [
+ [gi1, 'key1'],
+ [gi1, 'key2'],
+ [gi2','key3']
+ ]
+
+ Returns this:
+ [ [gi1, 'key1', 'key2'],
+ [gi2, 'key3' ]
+ ]
+
+ The first value in each sublist MUST be int
+ """
+ gi = []
+ i = 0
+ result = []
+
+ try:
+ for item in L:
+ if i == 0:
+ prev = item[0]
+
+ if prev != item[0]:
+ prev_L = []
+ prev_L.append( prev )
+ result.append( prev_L + gi )
+ prev = item[0]
+ gi =[]
+
+ gi.append( item[1] )
+ i += 1
+
+ except Exception, e:
+ stop_err('%s\n' % e)
+
+ prev_L = []
+ prev_L.append( prev )
+ result.append( prev_L + gi )
+ del(L)
+ return result
+
+# -----------------------------------------------------------------------------------
+
+def get_taxId( gi2tax_file, gi_name_list, out_file ):
+ """ Maps GI numbers from gi_name_list to TaxId identifiers from gi2tax_file and
+ prints result to out_file
+
+ gi2tax_file MUST be sorted on GI column
+
+ gi_name_list is a list that look slike this:
+ [[1,'a'], [2,'b','x'], [7,'c'], [10,'d'], [90,'f']]
+ where the first element of each sublist is a GI number
+ this list MUST also be sorted on GI
+
+ This function searches through 117,000,000 rows of gi2taxId file from NCBI
+ in approximately 4 minutes. This time is not dependent on the length of
+ gi_name_list
+ """
+
+ L = gi_name_list.pop(0)
+ my_gi = L[0]
+ F = open( out_file, 'w' )
+ gi = 0
+ for line in file( gi2tax_file ):
+ line = line.rstrip()
+ gi, taxId = string.split( line, '\t' )
+ gi = int( gi )
+
+ if gi > my_gi:
+ try:
+ while ( my_gi < gi ):
+ L = gi_name_list.pop(0)
+ my_gi = L[0]
+ except:
+ break
+
+ if gi == my_gi:
+ for i in range( 1,len( L ) ):
+ print >>F, '%s\t%s\t%d' % (L[i], taxId, gi)
+ try:
+ L = gi_name_list.pop(0)
+ my_gi = L[0]
+ except:
+ break
+
+# -----------------------------------------------------------------------------------
+
+
+try:
+ in_f = sys.argv[1] # input file with GIs
+ gi_col = int( sys.argv[2] ) - 1 # column in input containing GIs
+ name_col = int( sys.argv[3] ) - 1 # column containing sequence names
+ out_f = sys.argv[4] # output file
+ tool_data = sys.argv[5]
+except:
+ stop_err('Check arguments\n')
+
+# GI2TAX point to a file produced by concatenation of:
+# ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.zip
+# and
+# ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.zip
+# a sorting using this command:
+# sort -n -k 1
+
+GI2TAX = path.join( tool_data, 'taxonomy', 'gi_taxid_sorted.txt' )
+
+# NAME_FILE and NODE_FILE point to names.dmg and nodes.dmg
+# files contained within:
+# ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
+
+NAME_FILE = path.join( tool_data, 'taxonomy', 'names.dmp' )
+NODE_FILE = path.join( tool_data, 'taxonomy', 'nodes.dmp' )
+
+g2n = gi_name_to_sorted_list(in_f, gi_col, name_col)
+
+if len(g2n) == 0:
+ stop_err('No valid GI-containing fields. Please, check your column assignments.\n')
+
+tb_F = tempfile.NamedTemporaryFile('w')
+
+get_taxId( GI2TAX, collapse_repeating_gis( g2n ), tb_F.name )
+
+try:
+ tb_cmd = 'taxBuilder %s %s %s %s' % ( NAME_FILE, NODE_FILE, tb_F.name, out_f )
+ retcode = subprocess.call( tb_cmd, shell=True )
+ if retcode < 0:
+ print >>sys.stderr, "Execution of taxBuilder terminated by signal", -retcode
+except OSError, e:
+ print >>sys.stderr, "Execution of taxBuilder2tree failed:", e
diff -r 000000000000 -r 1358db75fd49 gi2taxonomy.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gi2taxonomy.xml Thu Jan 23 12:31:54 2014 -0500
@@ -0,0 +1,102 @@
+
+
+
+ taxonomy
+
+ gi2taxonomy.py $input $giField $idField $out_file1 ${GALAXY_DATA_INDEX_DIR}
+
+
+
+
+
+
+
+
+
+ taxBuilder
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Use *Filter and Sort->Filter* to restrict output of this tool to desired taxonomic ranks. You can also use *Text Manipulation->Cut* to remove unwanted columns from the output.
+
+------
+
+**What it does**
+
+Fetches taxonomic information for a list of GI numbers (sequences identifiers used by the National Center for Biotechnology Information http://www.ncbi.nlm.nih.gov).
+
+-------
+
+**Example**
+
+Suppose you have BLAST output that looks like this::
+
+ +-----------------------+----------+----------+-----------------+------------+------+--------+
+ | queryId | targetGI | identity | alignmentLength | mismatches | gaps | score |
+ +-----------------------+----------+----------+-----------------+------------+------+--------+
+ | 1L_EYKX4VC01BXWX1_265 | 1430919 | 90.09 | 212 | 15 | 6 | 252.00 |
+ +-----------------------+----------+----------+-----------------+------------+------+--------+
+
+and you want to obtain full taxonomic representation for GIs listed in *targetGI* column. If you set parameters as shown here:
+
+.. image:: ${static_path}/images/fetchTax.png
+
+
+the tool will generate the following output (you may need to scroll sideways to see the entire line)::
+
+ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
+ 1L_EYKX4VC01BXWX1_265 9606 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Euarchontoglires Primates Haplorrhini Hominoidea Hominidae n n n Homo n Homo sapiens n 1430919
+
+In other words the tool printed *Name column*, *taxonomy Id*, appended 22 columns containing taxonomic ranks from Superkingdom to Subspecies and added *GI* as the last column. Below is a formal definition of the output columns::
+
+ Column Definition
+ ------- ------------------------------------------
+ 1 Name (specified by 'Name column' dropdown)
+ 2 GI (specified by 'GI column' dropdown)
+ 3 root
+ 4 superkingdom
+ 5 kingdom
+ 6 subkingdom
+ 7 superphylum
+ 8 phylum
+ 9 subphylum
+ 10 superclass
+ 11 class
+ 12 subclass
+ 13 superorder
+ 14 order
+ 15 suborder
+ 16 superfamily
+ 17 family
+ 18 subfamily
+ 19 tribe
+ 20 subtribe
+ 21 genus
+ 22 subgenus
+ 23 species
+ 24 subspecies
+
+------
+
+.. class:: warningmark
+
+**Why do I have these "n" things?**
+
+Be aware that the NCBI taxonomy (ftp://ftp.ncbi.nih.gov/pub/taxonomy/) this tool relies upon is incomplete. This means that for many species one or more ranks are absent and represented as "**n**". In the above example *subkingdom*, *superphylum* etc. are missing.
+
+
+
+
+
+
diff -r 000000000000 -r 1358db75fd49 test-data/taxonomy2gi-input.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/taxonomy2gi-input.tabular Thu Jan 23 12:31:54 2014 -0500
@@ -0,0 +1,4 @@
+2 9913
+4 9646
+15 9915
+16 9771
diff -r 000000000000 -r 1358db75fd49 test-data/taxonomy2gi-output.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/taxonomy2gi-output.tabular Thu Jan 23 12:31:54 2014 -0500
@@ -0,0 +1,4 @@
+9913 9913 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Laurasiatheria n Ruminantia n Bovidae Bovinae n n Bos n Bos taurus n 2
+9646 9646 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Laurasiatheria Carnivora Caniformia n Ursidae n n n Ailuropoda n Ailuropoda melanoleuca n 4
+9915 9915 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Laurasiatheria n Ruminantia n Bovidae Bovinae n n Bos n Bos indicus n 15
+9771 9771 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Laurasiatheria Cetacea Mysticeti n Balaenopteridae n n n Balaenoptera n Balaenoptera musculus n 16
\ No newline at end of file
diff -r 000000000000 -r 1358db75fd49 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Thu Jan 23 12:31:54 2014 -0500
@@ -0,0 +1,6 @@
+
+
+
+
+
+