diff mol2gspan.py @ 1:5f97004c7f57 draft

Uploaded
author bgruening
date Tue, 15 Apr 2014 12:44:17 -0400
parents 18eb78773d87
children
line wrap: on
line diff
--- a/mol2gspan.py	Tue Oct 29 11:15:59 2013 -0400
+++ b/mol2gspan.py	Tue Apr 15 12:44:17 2014 -0400
@@ -4,45 +4,53 @@
     Converts a SD-file to a GSPAN file.
 """
 
-
-import os, sys
+import os
+import sys
 import argparse
+import openbabel
+import pybel
 
 def main( args ):
 
-    begin = False
-    iid = 0
-    graph_counter = 1
+    for infile in args.infile:
+        file_extension = args.format or os.path.splitext( infile )[-1].lstrip('.')
+
+        if not args.format and file_extension not in ['smi', 'sdf', 'inchi', 'mol']:
+            sys.exit('Could not guess the format from the file extension please specify with the --format option.')
+
+        molecules = pybel.readfile(file_extension, infile)
+        for mol in molecules:
+            args.outfile.write( 't # id %s\n' % mol.title.strip() )
+            for atom in openbabel.OBMolAtomIter( mol.OBMol):
+                label = atom.GetAtomicNum()
+                vertex_index = atom.GetIdx()
+                args.outfile.write('v %s %s\n' % (vertex_index, label))
 
-    for line in args.infile:
-        if line.rstrip():
-            if line.strip().endswith('END'):
-                begin = False
-            elif line.strip() == '$$$$':
-                graph_counter += 1
-                iid = 0
-            else:
-                # found header line, like:  21 21  0  0  0  0  0  0  0  0999 V2000
-                if len(line.split()) >= 5 and line.split()[-1] == 'V2000':
-                    args.outfile.write('t # id %s\n' % graph_counter)
-                    begin=True
-                    continue
-                # connection or coordinate/atom table
-                if len(line.split()) >= 4 and begin:
-                    # coordinate/atom table
-                    if not line.startswith('M'):
-                        if line.split()[3].isalpha() or line.split()[3] == '*':
-                            args.outfile.write( 'v %s %s \n' % (iid, line.split()[3]) )
-                            iid += 1
-                        else:
-                            #connection table
-                            id, node, edge, trash = line.split(None, 3)
-                            args.outfile.write( 'e %s %s %s\n' % ( int(id) - 1 , int(node) -1, edge ) )
+            for bond in openbabel.OBMolBondIter( mol.OBMol):
+                src_index = bond.GetBeginAtomIdx()
+                dest_index = bond.GetEndAtomIdx()
+                assert(src_index > 0)
+                assert(dest_index > 0)
+                if bond.IsAromatic():
+                    label = 'a'
+                elif bond.IsSingle():
+                    label = 's'
+                elif bond.IsDouble():
+                    label = 'd'
+                elif bond.IsTriple():
+                    label = 't'
+                atom1 = bond.GetBeginAtom()
+                atom2 = bond.GetEndAtom()
+                args.outfile.write('e %s %s %s\n' % (src_index, dest_index, label))
+
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--infile', nargs='?', type=argparse.FileType('r'),
-        default=sys.stdin, help="Specify one or more input files")
+    parser.add_argument('-i', '--infile', nargs='*',
+        help="Specify one or more input files")
+    parser.add_argument('-f', '--format',
+        help="Format of the input file.")
     parser.add_argument('--outfile', type=argparse.FileType('w'),
         default=sys.stdout, help="Specify one output file")
     args = parser.parse_args()