comparison molecules.py @ 4:1a070566e9c6

Uploaded
author bgruening
date Sat, 11 May 2013 17:09:49 -0400
parents ad5ecf08508a
children
comparison
equal deleted inserted replaced
3:ad5ecf08508a 4:1a070566e9c6
4 import logging 4 import logging
5 from galaxy.datatypes.sniff import get_headers, get_test_fname 5 from galaxy.datatypes.sniff import get_headers, get_test_fname
6 from galaxy.datatypes.data import get_file_peek 6 from galaxy.datatypes.data import get_file_peek
7 from galaxy.datatypes.tabular import Tabular 7 from galaxy.datatypes.tabular import Tabular
8 from galaxy.datatypes.binary import Binary 8 from galaxy.datatypes.binary import Binary
9 from galaxy.datatypes.xml import GenericXml
9 import subprocess 10 import subprocess
10 import os 11 import os
11 #import pybel 12 #import pybel
12 #import openbabel 13 #import openbabel
13 #openbabel.obErrorLog.StopLogging() 14 #openbabel.obErrorLog.StopLogging()
181 elif split_params['split_mode'] == 'to_size': 182 elif split_params['split_mode'] == 'to_size':
182 chunk_size = int(split_params['split_size']) 183 chunk_size = int(split_params['split_size'])
183 else: 184 else:
184 raise Exception('Unsupported split mode %s' % split_params['split_mode']) 185 raise Exception('Unsupported split mode %s' % split_params['split_mode'])
185 186
186 def _read_sdf_records( filename ): 187 def _read_mol2_records( filename ):
187 lines = [] 188 lines = []
188 start = True 189 start = True
189 with open(filename) as handle: 190 with open(filename) as handle:
190 for line in handle: 191 for line in handle:
191 if line.startswith("@<TRIPOS>MOLECULE"): 192 if line.startswith("@<TRIPOS>MOLECULE"):
202 part_file = open(part_path, 'w') 203 part_file = open(part_path, 'w')
203 part_file.writelines( accumulated_lines ) 204 part_file.writelines( accumulated_lines )
204 part_file.close() 205 part_file.close()
205 206
206 try: 207 try:
207 sdf_records = _read_sdf_records( input_files[0] ) 208 mol2_records = _read_mol2_records( input_files[0] )
208 sdf_lines_accumulated = [] 209 mol2_lines_accumulated = []
209 for counter, sdf_record in enumerate( sdf_records, start = 1): 210 for counter, mol2_record in enumerate( mol2_records, start = 1):
210 sdf_lines_accumulated.extend( sdf_record ) 211 mol2_lines_accumulated.extend( mol2_record )
211 if counter % chunk_size == 0: 212 if counter % chunk_size == 0:
212 _write_part_mol2_file( sdf_lines_accumulated ) 213 _write_part_mol2_file( mol2_lines_accumulated )
213 sdf_lines_accumulated = [] 214 mol2_lines_accumulated = []
214 if sdf_lines_accumulated: 215 if mol2_lines_accumulated:
215 _write_part_mol2_file( sdf_lines_accumulated ) 216 _write_part_mol2_file( mol2_lines_accumulated )
216 except Exception, e: 217 except Exception, e:
217 log.error('Unable to split files: %s' % str(e)) 218 log.error('Unable to split files: %s' % str(e))
218 raise 219 raise
219 split = classmethod(split) 220 split = classmethod(split)
220 221
221 222
222 223
223 class FPS( GenericMolFile ): 224 class FPS( GenericMolFile ):
224 """ 225 """
225 chemfp fingerprint file: http://code.google.com/p/chem-fingerprints/wiki/FPS 226 chemfp fingerprint file: http://code.google.com/p/chem-fingerprints/wiki/FPS
226 """ 227 """
227 file_ext = "fps" 228 file_ext = "fps"
228 def sniff( self, filename ): 229 def sniff( self, filename ):
229 header = get_headers( filename, sep='\t', count=1 ) 230 header = get_headers( filename, sep='\t', count=1 )
230 if header[0][0].strip() == '#FPS1': 231 if header[0][0].strip() == '#FPS1':
234 235
235 def set_meta( self, dataset, **kwd ): 236 def set_meta( self, dataset, **kwd ):
236 """ 237 """
237 Set the number of lines of data in dataset. 238 Set the number of lines of data in dataset.
238 """ 239 """
239 dataset.metadata.number_of_molecules = count_special_lines('^#', dataset.file_name, invert = True)#self.count_data_lines(dataset) 240 dataset.metadata.number_of_molecules = count_special_lines('^#', dataset.file_name, invert = True)
240 241
241 242
242 def split( cls, input_datasets, subdir_generator_function, split_params): 243 def split( cls, input_datasets, subdir_generator_function, split_params):
243 """ 244 """
244 Split the input files by fingerprint records. 245 Split the input files by fingerprint records.
390 391
391 def set_meta( self, dataset, **kwd ): 392 def set_meta( self, dataset, **kwd ):
392 """ 393 """
393 Set the number of lines of data in dataset. 394 Set the number of lines of data in dataset.
394 """ 395 """
395 dataset.metadata.number_of_molecules = count_special_lines('\"ligand id\"', dataset.file_name, invert = True)#self.count_data_lines(dataset) 396 dataset.metadata.number_of_molecules = count_special_lines('\"ligand id\"', dataset.file_name, invert = True)
396 397
397 398
398 class PHAR( GenericMolFile ): 399 class PHAR( GenericMolFile ):
400 """
401 Pharmacophore database format from silicos-it.
402 """
399 file_ext = "phar" 403 file_ext = "phar"
400 def set_peek( self, dataset, is_multi_byte=False ): 404 def set_peek( self, dataset, is_multi_byte=False ):
401 if not dataset.dataset.purged: 405 if not dataset.dataset.purged:
402 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) 406 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
403 dataset.blurb = "pharmacophore" 407 dataset.blurb = "pharmacophore"
405 dataset.peek = 'file does not exist' 409 dataset.peek = 'file does not exist'
406 dataset.blurb = 'file purged from disk' 410 dataset.blurb = 'file purged from disk'
407 411
408 412
409 class PDB( GenericMolFile ): 413 class PDB( GenericMolFile ):
414 """
415 Protein Databank format.
416 http://www.wwpdb.org/documentation/format33/v3.3.html
417 """
410 file_ext = "pdb" 418 file_ext = "pdb"
411 def sniff( self, filename ): 419 def sniff( self, filename ):
412 headers = get_headers( filename, sep=' ', count=300 ) 420 headers = get_headers( filename, sep=' ', count=300 )
413 h = t = c = s = k = e = False 421 h = t = c = s = k = e = False
414 for line in headers: 422 for line in headers:
558 else: 566 else:
559 return False 567 return False
560 ''' 568 '''
561 569
562 570
571
572 class CML( GenericXml ):
573 """
574 Chemical Markup Language
575 http://cml.sourceforge.net/
576 """
577 file_ext = "cml"
578 MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )
579
580
581 def set_meta( self, dataset, **kwd ):
582 """
583 Set the number of lines of data in dataset.
584 """
585 dataset.metadata.number_of_molecules = count_special_lines( '^\s*<molecule', dataset.file_name )
586
587 def set_peek( self, dataset, is_multi_byte=False ):
588 if not dataset.dataset.purged:
589 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
590 if (dataset.metadata.number_of_molecules == 1):
591 dataset.blurb = "1 molecule"
592 else:
593 dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
594 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
595 else:
596 dataset.peek = 'file does not exist'
597 dataset.blurb = 'file purged from disk'
598
599 def sniff( self, filename ):
600 """
601 Try to guess if the file is a CML file.
602 TODO: add true positive test, need to submit a CML example
603
604 >>> fname = get_test_fname( 'interval.interval' )
605 >>> CML().sniff( fname )
606 False
607 """
608 handle = open(filename)
609 line = handle.readline()
610 if line.strip() != '<?xml version="1.0"?>':
611 handle.close()
612 return False
613 line = handle.readline()
614 if line.strip().find('http://www.xml-cml.org/schema') == -1:
615 handle.close()
616 return False
617 handle.close()
618 return True
619
620
621 def split( cls, input_datasets, subdir_generator_function, split_params):
622 """
623 Split the input files by molecule records.
624 """
625 if split_params is None:
626 return None
627
628 if len(input_datasets) > 1:
629 raise Exception("CML-file splitting does not support multiple files")
630 input_files = [ds.file_name for ds in input_datasets]
631
632 chunk_size = None
633 if split_params['split_mode'] == 'number_of_parts':
634 raise Exception('Split mode "%s" is currently not implemented for CML-files.' % split_params['split_mode'])
635 elif split_params['split_mode'] == 'to_size':
636 chunk_size = int(split_params['split_size'])
637 else:
638 raise Exception('Unsupported split mode %s' % split_params['split_mode'])
639
640 def _read_cml_records( filename ):
641 lines = []
642 with open(filename) as handle:
643 for line in handle:
644 if line.lstrip().startswith('<?xml version="1.0"?>') or \
645 line.lstrip().startswith('<cml xmlns="http://www.xml-cml.org/schema') or \
646 line.lstrip().startswith('</cml>'):
647 continue
648 lines.append( line )
649 if line.lstrip().startswith('</molecule>'):
650 yield lines
651 lines = []
652
653 header_lines = ['<?xml version="1.0"?>\n', '<cml xmlns="http://www.xml-cml.org/schema">\n']
654 footer_line = ['</cml>\n']
655 def _write_part_cml_file( accumulated_lines ):
656 part_dir = subdir_generator_function()
657 part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
658 part_file = open(part_path, 'w')
659 part_file.writelines( header_lines )
660 part_file.writelines( accumulated_lines )
661 part_file.writelines( footer_line )
662 part_file.close()
663
664 try:
665 cml_records = _read_cml_records( input_files[0] )
666 cml_lines_accumulated = []
667 for counter, cml_record in enumerate( cml_records, start = 1):
668 cml_lines_accumulated.extend( cml_record )
669 if counter % chunk_size == 0:
670 _write_part_cml_file( cml_lines_accumulated )
671 cml_lines_accumulated = []
672 if cml_lines_accumulated:
673 _write_part_cml_file( cml_lines_accumulated )
674 except Exception, e:
675 log.error('Unable to split files: %s' % str(e))
676 raise
677 split = classmethod(split)
678
679
680 def merge(split_files, output_file):
681 """
682 Merging CML files.
683 """
684 if len(split_files) == 1:
685 #For one file only, use base class method (move/copy)
686 return Text.merge(split_files, output_file)
687 if not split_files:
688 raise ValueError("Given no CML files, %r, to merge into %s" \
689 % (split_files, output_file))
690 with open(output_file, "w") as out:
691 for filename in split_files:
692 with open( filename ) as handle:
693 header = handle.readline()
694 if not header:
695 raise ValueError("CML file %s was empty" % f)
696 if not header.lstrip().startswith('<?xml version="1.0"?>'):
697 out.write(header)
698 raise ValueError("%s is not a valid XML file!" % f)
699 line = handle.readline()
700 header += line
701 if not line.lstrip().startswith('<cml xmlns="http://www.xml-cml.org/schema'):
702 out.write(header)
703 raise ValueError("%s is not a CML file!" % f)
704 molecule_found = False
705 for line in handle.readlines():
706 # we found two required header lines, the next line should start with <molecule >
707 if line.lstrip().startswith('</cml>'):
708 continue
709 if line.lstrip().startswith('<molecule'):
710 molecule_found = True
711 if molecule_found:
712 out.write( line )
713 out.write("</cml>\n")
714 merge = staticmethod(merge)
715
716
563 if __name__ == '__main__': 717 if __name__ == '__main__':
564 """ 718 """
565 TODO: We need to figure out, how to put example files under /lib/galaxy/datatypes/test/ from a toolshed, so that doctest can work properly. 719 TODO: We need to figure out, how to put example files under /lib/galaxy/datatypes/test/ from a toolshed, so that doctest can work properly.
566 """ 720 """
567 inchi = get_test_fname('drugbank_drugs.inchi') 721 inchi = get_test_fname('drugbank_drugs.inchi')
568 smiles = get_test_fname('drugbank_drugs.smi') 722 smiles = get_test_fname('drugbank_drugs.smi')
569 sdf = get_test_fname('drugbank_drugs.sdf') 723 sdf = get_test_fname('drugbank_drugs.sdf')
570 fps = get_test_fname('50_chemfp_fingerprints_FPS1.fps') 724 fps = get_test_fname('50_chemfp_fingerprints_FPS1.fps')
571 pdb = get_test_fname('2zbz.pdb') 725 pdb = get_test_fname('2zbz.pdb')
572 726 cml = get_test_fname('/home/bag/Downloads/approved.cml')
727
728 print 'CML test'
729 print CML().sniff(cml), 'cml'
730 print CML().sniff(inchi)
731 print CML().sniff(pdb)
732 CML().split()
733 """
573 print 'SMILES test' 734 print 'SMILES test'
574 print SMILES().sniff(smiles), 'smi' 735 print SMILES().sniff(smiles), 'smi'
575 print SMILES().sniff(inchi) 736 print SMILES().sniff(inchi)
576 print SMILES().sniff(pdb) 737 print SMILES().sniff(pdb)
577 738 """
578 print 'InChI test' 739 print 'InChI test'
579 print InChI().sniff(smiles) 740 print InChI().sniff(smiles)
580 print InChI().sniff(sdf) 741 print InChI().sniff(sdf)
581 print InChI().sniff(inchi), 'inchi' 742 print InChI().sniff(inchi), 'inchi'
582 743