/lib/galaxy/datatypes/xml.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 118 lines · 106 code · 4 blank · 8 comment · 2 complexity · ed8adc0fcd5cf77747dd859be6a3edc3 MD5 · raw file

  1. """
  2. XML format classes
  3. """
  4. import data
  5. import logging
  6. from galaxy.datatypes.sniff import *
  7. import dataproviders
  8. log = logging.getLogger(__name__)
  9. @dataproviders.decorators.has_dataproviders
  10. class GenericXml( data.Text ):
  11. """Base format class for any XML file."""
  12. file_ext = "xml"
  13. def set_peek( self, dataset, is_multi_byte=False ):
  14. """Set the peek and blurb text"""
  15. if not dataset.dataset.purged:
  16. dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
  17. dataset.blurb = 'XML data'
  18. else:
  19. dataset.peek = 'file does not exist'
  20. dataset.blurb = 'file purged from disk'
  21. def sniff( self, filename ):
  22. """
  23. Determines whether the file is XML or not
  24. >>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
  25. >>> GenericXml().sniff( fname )
  26. True
  27. >>> fname = get_test_fname( 'interval.interval' )
  28. >>> GenericXml().sniff( fname )
  29. False
  30. """
  31. #TODO - Use a context manager on Python 2.5+ to close handle
  32. handle = open(filename)
  33. line = handle.readline()
  34. handle.close()
  35. #TODO - Is there a more robust way to do this?
  36. return line.startswith('<?xml ')
  37. def merge(split_files, output_file):
  38. """Merging multiple XML files is non-trivial and must be done in subclasses."""
  39. if len(split_files) > 1:
  40. raise NotImplementedError("Merging multiple XML files is non-trivial and must be implemented for each XML type")
  41. #For one file only, use base class method (move/copy)
  42. data.Text.merge(split_files, output_file)
  43. merge = staticmethod(merge)
  44. @dataproviders.decorators.dataprovider_factory( 'xml', dataproviders.hierarchy.XMLDataProvider.settings )
  45. def xml_dataprovider( self, dataset, **settings ):
  46. dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
  47. return dataproviders.hierarchy.XMLDataProvider( dataset_source, **settings )
  48. class MEMEXml( GenericXml ):
  49. """MEME XML Output data"""
  50. file_ext = "memexml"
  51. def set_peek( self, dataset, is_multi_byte=False ):
  52. """Set the peek and blurb text"""
  53. if not dataset.dataset.purged:
  54. dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
  55. dataset.blurb = 'MEME XML data'
  56. else:
  57. dataset.peek = 'file does not exist'
  58. dataset.blurb = 'file purged from disk'
  59. def sniff( self, filename ):
  60. return False
  61. class CisML( GenericXml ):
  62. """CisML XML data""" #see: http://www.ncbi.nlm.nih.gov/pubmed/15001475
  63. file_ext = "cisml"
  64. def set_peek( self, dataset, is_multi_byte=False ):
  65. """Set the peek and blurb text"""
  66. if not dataset.dataset.purged:
  67. dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
  68. dataset.blurb = 'CisML data'
  69. else:
  70. dataset.peek = 'file does not exist'
  71. dataset.blurb = 'file purged from disk'
  72. def sniff( self, filename ):
  73. return False
  74. class Phyloxml( GenericXml ):
  75. """Format for defining phyloxml data http://www.phyloxml.org/"""
  76. file_ext = "phyloxml"
  77. def set_peek( self, dataset, is_multi_byte=False ):
  78. """Set the peek and blurb text"""
  79. if not dataset.dataset.purged:
  80. dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
  81. dataset.blurb = 'Phyloxml data'
  82. else:
  83. dataset.peek = 'file does not exist'
  84. dataset.blurb = 'file purged from disk'
  85. def sniff( self, filename ):
  86. """"Checking for keyword - 'phyloxml' always in lowercase in the first few lines"""
  87. f = open( filename, "r" )
  88. firstlines = "".join( f.readlines(5) )
  89. f.close()
  90. if "phyloxml" in firstlines:
  91. return True
  92. return False
  93. def get_visualizations( self, dataset ):
  94. """
  95. Returns a list of visualizations for datatype.
  96. """
  97. return [ 'phyloviz' ]