| 0 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 """ | 
|  | 4 Read metaphaln output summarizing taxonomic distribution and format in PhyloXML format | 
|  | 5 | 
|  | 6 usage: %prog metaphlan.txt phylo.xml | 
|  | 7 """ | 
|  | 8 | 
|  | 9 import sys | 
|  | 10 | 
|  | 11 # Metaphlan output looks like: | 
|  | 12 #   k__Bacteria   99.07618 | 
|  | 13 #   k__Archaea  0.92382 | 
|  | 14 #   k__Bacteria|p__Proteobacteria   82.50732 | 
|  | 15 #   k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria    81.64905 | 
|  | 16 | 
|  | 17 rank_map = { 'k__': 'kingdom', 'p__': 'phylum', 'c__': 'class', 'o__': 'order', 'f__': 'family', 'g__': 'genus', 's__': 'species' } | 
|  | 18 | 
|  | 19 class Node( object ): | 
|  | 20     """Node in a taxonomy""" | 
|  | 21     def __init__( self, rank=None, name=None ): | 
|  | 22         self.rank = rank | 
|  | 23         self.name = name | 
|  | 24         self.value = None | 
|  | 25         self.children = dict() | 
|  | 26     @staticmethod | 
|  | 27     def from_metaphlan_file( file ): | 
|  | 28         """ | 
|  | 29         Build tree from metaphlan output | 
|  | 30         """ | 
|  | 31         root = Node() | 
|  | 32         for line in file: | 
|  | 33             taxa, abundance = line.split() | 
|  | 34             parts = taxa.split( "|" ) | 
|  | 35             root.add( parts, abundance ) | 
|  | 36         return root | 
|  | 37     def add( self, parts, value ): | 
|  | 38         """ | 
|  | 39         Parts is a list of node names, recursively add nodes until we reach | 
|  | 40         the last part, and then attach the value to that node. | 
|  | 41         """ | 
|  | 42         if len( parts ) == 0: | 
|  | 43             self.value = value | 
|  | 44         else: | 
|  | 45             next_part = parts.pop(0) | 
|  | 46             rank = rank_map[ next_part[:3] ] | 
|  | 47             name = next_part[3:] | 
|  | 48             if name not in self.children: | 
|  | 49                 self.children[name] = Node( rank, name ) | 
|  | 50             self.children[name].add( parts, value ) | 
|  | 51     def __str__( self ): | 
|  | 52         if self.children: | 
|  | 53             return "(" + ",".join( str( child ) for child in self.children.itervalues() ) + "):" + self.name | 
|  | 54         else: | 
|  | 55             return self.name | 
|  | 56     def to_phyloxml( self, out ): | 
|  | 57         print >>out, "<clade>" | 
|  | 58         if self.name: | 
|  | 59             print >>out, "<name>%s</name>" % self.name | 
|  | 60             print >>out, "<taxonomy><scientific_name>%s</scientific_name><rank>%s</rank></taxonomy>" % ( self.name, self.rank ) | 
|  | 61         if self.value: | 
|  | 62             print >>out, "<property datatype='xsd:float' ref='metaphlan:abundance' applies_to='node'>%s</property>" % self.value | 
|  | 63             ## print >>out, "<confidence type='abundance'>%s</confidence>" % self.value | 
|  | 64         for child in self.children.itervalues(): | 
|  | 65             child.to_phyloxml( out ) | 
|  | 66         print >>out, "</clade>" | 
|  | 67 | 
|  | 68 out = open( sys.argv[2], 'w' ) | 
|  | 69 | 
|  | 70 print >>out, '<phyloxml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.phyloxml.org" xsi:schemaLocation="http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd">' | 
|  | 71 print >>out, '<phylogeny rooted="true">' | 
|  | 72 | 
|  | 73 Node.from_metaphlan_file( open( sys.argv[1] ) ).to_phyloxml( out ) | 
|  | 74 | 
|  | 75 print >>out, '</phylogeny>' | 
|  | 76 print >>out, '</phyloxml>' |