/lib/galaxy/datatypes/data.py

https://bitbucket.org/h_morita_dbcls/galaxy-central

import logging, os, sys, time, tempfile
from galaxy import util
from galaxy.util.odict import odict
from galaxy.util.bunch import Bunch
from cgi import escape
import metadata
import zipfile
from metadata import MetadataElement #import directly to maintain ease of use in Datatype class definitions

log = logging.getLogger(__name__)

# Valid first column and strand column values for bed and other formats
col1_startswith = ['chr', 'chl', 'groupun', 'reftig_', 'scaffold', 'super_', 'vcho']
valid_strand = ['+', '-', '.']

class DataMeta( type ):
    """
    Metaclass for Data class.  Sets up metadata spec.
    """
    def __init__( cls, name, bases, dict_ ):
        cls.metadata_spec = metadata.MetadataSpecCollection()
        for base in bases: #loop through bases (class/types) of cls
            if hasattr( base, "metadata_spec" ): #base of class Data (object) has no metadata
                cls.metadata_spec.update( base.metadata_spec ) #add contents of metadata spec of base class to cls
        metadata.Statement.process( cls )
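
# Illustrative doctest-style sketch of what DataMeta buys subclasses
# (hypothetical class name, mirroring the DataTest doctest in Data below):
# >>> class DataTest2( Data ):
# ...     MetadataElement( name="test2" )
# ...
# >>> DataTest2.metadata_spec.test2.name    # its own element
# 'test2'
# >>> DataTest2.metadata_spec.dbkey.name    # inherited from Data via DataMeta
# 'dbkey'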
class Data( object ):
    """
    Base class for all datatypes.  Implements basic interfaces as well
    as class methods for metadata.

    >>> class DataTest( Data ):
    ...     MetadataElement( name="test" )
    ...
    >>> DataTest.metadata_spec.test.name
    'test'
    >>> DataTest.metadata_spec.test.desc
    'test'
    >>> type( DataTest.metadata_spec.test.param )
    <class 'galaxy.datatypes.metadata.MetadataParameter'>
    """
    __metaclass__ = DataMeta
    # Add metadata elements
    MetadataElement( name="dbkey", desc="Database/Build", default="?", param=metadata.DBKeyParameter, multiple=False, no_value="?" )
    # Stores the set of display applications, and viewing methods, supported by this datatype
    supported_display_apps = {}
    # If False, the peek is regenerated whenever a dataset of this type is copied
    copy_safe_peek = True
    # The dataset contains binary data --> do not space_to_tab or convert newlines, etc.
    # Allow binary file uploads of this type when True.
    is_binary = True
    # Allow user to change between this datatype and others. If False, this datatype
    # cannot be changed from or into.
    allow_datatype_change = True
    # Composite datatypes
    composite_type = None
    composite_files = odict()
    primary_file_name = 'index'
    # A per datatype setting (inherited): max file size (in bytes) for setting optional metadata
    _max_optional_metadata_filesize = None

    def __init__(self, **kwd):
        """Initialize the datatype"""
        object.__init__(self, **kwd)
        self.supported_display_apps = self.supported_display_apps.copy()
        self.composite_files = self.composite_files.copy()
        self.display_applications = odict()
    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # open() returns a file object, so use its write/close methods
        # rather than the os-level fd calls
        fp = open(dataset.file_name, 'wb')
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            fp.write(chunk)
        fp.close()
    def set_raw_data(self, dataset, data):
        """Saves the data to disk"""
        fp = open(dataset.file_name, 'wb')
        fp.write(data)
        fp.close()
    def get_raw_data( self, dataset ):
        """Returns the full data. To stream it open the file_name and read/write as needed"""
        try:
            return file(dataset.file_name, 'rb').read(-1)
        except IOError, e:
            log.exception('%s reading a file that does not exist %s' % (self.__class__.__name__, dataset.file_name))
            return ''
    def groom_dataset_content( self, file_name ):
        """This function is called on an output dataset file after the content is initially generated."""
        pass
    def init_meta( self, dataset, copy_from=None ):
        # Metadata should be left mostly uninitialized.  Dataset will
        # handle returning default values when metadata is not set.
        # copy_from allows metadata to be passed in that will be
        # copied (although this seems ambiguous, see
        # Dataset.set_metadata -- it always copies the rhs in order to
        # flag the object as modified for SQLAlchemy).
        if copy_from:
            dataset.metadata = copy_from.metadata
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Unimplemented method, allows guessing of metadata from contents of file"""
        return True
    def missing_meta( self, dataset, check = [], skip = [] ):
        """
        Checks for empty metadata values.  Returns True if non-optional metadata is missing.
        Specifying a list of 'check' names will only check those names provided; when used, optionality is ignored.
        Specifying a list of 'skip' names excludes those names from the check, so a missing value
        for a skipped name does not cause a True return.
        """
        if check:
            to_check = [ ( to_check, dataset.metadata.get( to_check ) ) for to_check in check ]
        else:
            to_check = dataset.metadata.items()
        for key, value in to_check:
            if key in skip or ( not check and dataset.metadata.spec[key].get( "optional" ) ):
                continue #we skip the check for optional and non-requested values here
            if not value:
                return True
        return False
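    # Illustrative sketch of typical missing_meta() calls (the dataset
    # object here is hypothetical; datatypes are normally reached through
    # a dataset's .datatype attribute):
    # >>> data.datatype.missing_meta( data )                       # any required value empty?
    # >>> data.datatype.missing_meta( data, check=[ 'dbkey' ] )    # only dbkey; optionality ignored
    # >>> data.datatype.missing_meta( data, skip=[ 'data_lines' ] )  # ignore data_lines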
    def set_max_optional_metadata_filesize( self, max_value ):
        try:
            max_value = int( max_value )
        except:
            return
        self.__class__._max_optional_metadata_filesize = max_value
    def get_max_optional_metadata_filesize( self ):
        rval = self.__class__._max_optional_metadata_filesize
        if rval is None:
            return -1
        return rval
    max_optional_metadata_filesize = property( get_max_optional_metadata_filesize, set_max_optional_metadata_filesize )
    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = ''
            dataset.blurb = 'data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
    def display_peek( self, dataset ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines = data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                if type( line ) is unicode:
                    out.append( '<tr><td>%s</td></tr>' % escape( line ) )
                else:
                    out.append( '<tr><td>%s</td></tr>' % escape( unicode( line, 'utf-8' ) ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            out = "Can't create peek %s" % str( exc )
        return out
    def display_name(self, dataset):
        """Returns formatted html of dataset name"""
        try:
            if type( dataset.name ) is unicode:
                return escape( dataset.name )
            else:
                return escape( unicode( dataset.name, 'utf-8' ) )
        except:
            return "name unavailable"
    def display_info(self, dataset):
        """Returns formatted html of dataset info"""
        try:
            # Change new line chars to html
            info = escape( dataset.info )
            if info.find( '\r\n' ) >= 0:
                info = info.replace( '\r\n', '<br/>' )
            if info.find( '\r' ) >= 0:
                info = info.replace( '\r', '<br/>' )
            if info.find( '\n' ) >= 0:
                info = info.replace( '\n', '<br/>' )
            # Convert to unicode to display non-ascii characters.
            if type( info ) is not unicode:
                info = unicode( info, 'utf-8' )
            return info
        except:
            return "info unavailable"
    def validate(self, dataset):
        """Unimplemented validate, return no exceptions"""
        return list()
    def repair_methods(self, dataset):
        """Unimplemented method, returns dict with method/option for repairing errors"""
        return None
    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'application/octet-stream'
    def add_display_app( self, app_id, label, file_function, links_function ):
        """
        Adds a display app to the datatype.
        app_id is a unique id
        label is the primary display label, e.g., display at 'UCSC'
        file_function is a string containing the name of the function that returns a properly formatted display
        links_function is a string containing the name of the function that returns a list of (link_name, link)
        """
        self.supported_display_apps = self.supported_display_apps.copy()
        self.supported_display_apps[app_id] = {'label': label, 'file_function': file_function, 'links_function': links_function}
    def remove_display_app( self, app_id ):
        """Removes a display app from the datatype"""
        self.supported_display_apps = self.supported_display_apps.copy()
        try:
            del self.supported_display_apps[app_id]
        except:
            log.exception('Tried to remove display app %s from datatype %s, but this display app is not declared.' % ( app_id, self.__class__.__name__ ) )
    def clear_display_apps( self ):
        self.supported_display_apps = {}
    def add_display_application( self, display_application ):
        """New style display applications"""
        assert display_application.id not in self.display_applications, 'Attempted to add a display application twice'
        self.display_applications[ display_application.id ] = display_application
    def get_display_application( self, key, default = None ):
        return self.display_applications.get( key, default )
    def get_display_applications_by_dataset( self, dataset, trans ):
        rval = odict()
        for key, value in self.display_applications.iteritems():
            value = value.filter_by_dataset( dataset, trans )
            if value.links:
                rval[key] = value
        return rval
    def get_display_types(self):
        """Returns display types available"""
        return self.supported_display_apps.keys()
    def get_display_label(self, type):
        """Returns primary label for display app"""
        try:
            return self.supported_display_apps[type]['label']
        except:
            return 'unknown'
    def as_display_type(self, dataset, type, **kwd):
        """Returns modified file contents for a particular display type"""
        try:
            if type in self.get_display_types():
                return getattr( self, self.supported_display_apps[type]['file_function'] )( dataset, **kwd )
        except:
            log.exception('Function %s is referred to in datatype %s for displaying as type %s, but is not accessible' % (self.supported_display_apps[type]['file_function'], self.__class__.__name__, type) )
        return "This display type (%s) is not implemented for this datatype (%s)." % ( type, dataset.ext )
    def get_display_links( self, dataset, type, app, base_url, target_frame='_blank', **kwd ):
        """
        Returns a list of tuples of (name, link) for a particular display type.  No check on
        'access' permissions is done here - if you can view the dataset, you can also save it
        or send it to a destination outside of Galaxy, so Galaxy security restrictions do not
        apply anyway.
        """
        try:
            if type in self.get_display_types():
                return target_frame, getattr( self, self.supported_display_apps[type]['links_function'] )( dataset, type, app, base_url, **kwd )
        except:
            log.exception( 'Function %s is referred to in datatype %s for generating links for type %s, but is not accessible' \
                           % ( self.supported_display_apps[type]['links_function'], self.__class__.__name__, type ) )
        return []
    def get_converter_types(self, original_dataset, datatypes_registry):
        """Returns available converters by type for this dataset"""
        return datatypes_registry.get_converters_by_datatype(original_dataset.ext)
    def find_conversion_destination( self, dataset, accepted_formats, datatypes_registry, **kwd ):
        """Returns ( target_ext, existing converted dataset )"""
        return datatypes_registry.find_conversion_destination_for_dataset_by_extensions( dataset, accepted_formats, **kwd )
    def convert_dataset(self, trans, original_dataset, target_type, return_output = False, visible = True, deps=None):
        """This function adds a job to the queue to convert a dataset to another type. Returns a message about success/failure."""
        converter = trans.app.datatypes_registry.get_converter_by_target_type( original_dataset.ext, target_type )
        if converter is None:
            raise Exception( "A converter does not exist for %s to %s." % ( original_dataset.ext, target_type ) )
        # Generate parameter dictionary
        params = {}
        # Determine input parameter name and add to params
        input_name = 'input1'
        for key, value in converter.inputs.items():
            if deps and value.name in deps:
                params[value.name] = deps[value.name]
            elif value.type == 'data':
                input_name = key
        params[input_name] = original_dataset
        # Run converter; the job is dispatched through the queue
        converted_dataset = converter.execute( trans, incoming = params, set_output_hid = visible )[1]
        if len(params) > 0:
            trans.log_event( "Converter params: %s" % (str(params)), tool_id=converter.id )
        if not visible:
            for name, value in converted_dataset.iteritems():
                value.visible = False
        if return_output:
            return converted_dataset
        return "The file conversion of %s on data %s has been added to the Queue." % (converter.name, original_dataset.hid)
    # We need to clear associated files before we set metadata, so that
    # implicitly converted datasets are deleted and no longer available
    # 'while' metadata is being set, not just after.
    # We'll also clear after setting metadata, for backwards compatibility.
    def after_setting_metadata( self, dataset ):
        """This function is called on the dataset after metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
    def before_setting_metadata( self, dataset ):
        """This function is called on the dataset before metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
    def __new_composite_file( self, name, optional = False, mimetype = None, description = None, substitute_name_with_metadata = None, is_binary = False, space_to_tab = False, **kwds ):
        kwds[ 'name' ] = name
        kwds[ 'optional' ] = optional
        kwds[ 'mimetype' ] = mimetype
        kwds[ 'description' ] = description
        kwds[ 'substitute_name_with_metadata' ] = substitute_name_with_metadata
        kwds[ 'is_binary' ] = is_binary
        kwds[ 'space_to_tab' ] = space_to_tab
        return Bunch( **kwds )
    def add_composite_file( self, name, **kwds ):
        #self.composite_files = self.composite_files.copy()
        self.composite_files[ name ] = self.__new_composite_file( name, **kwds )
    def __substitute_composite_key( self, key, composite_file, dataset = None ):
        if composite_file.substitute_name_with_metadata:
            if dataset:
                meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
            else:
                meta_value = self.metadata_spec[ composite_file.substitute_name_with_metadata ].default
            return key % meta_value
        return key
    @property
    def writable_files( self, dataset = None ):
        files = odict()
        if self.composite_type != 'auto_primary_file':
            files[ self.primary_file_name ] = self.__new_composite_file( self.primary_file_name )
        for key, value in self.get_composite_files( dataset = dataset ).iteritems():
            files[ key ] = value
        return files
    def get_composite_files( self, dataset = None ):
        def substitute_composite_key( key, composite_file ):
            if composite_file.substitute_name_with_metadata:
                if dataset:
                    meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
                else:
                    meta_value = self.metadata_spec[ composite_file.substitute_name_with_metadata ].default
                return key % meta_value
            return key
        files = odict()
        for key, value in self.composite_files.iteritems():
            files[ substitute_composite_key( key, value ) ] = value
        return files
    def generate_auto_primary_file( self, dataset = None ):
        raise Exception( "generate_auto_primary_file is not implemented for this datatype." )
    @property
    def has_resolution(self):
        return False
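
# Illustrative sketch of a composite datatype built on the hooks above.
# The class and file names are hypothetical, not part of this module:
#
#   class HtmlReport( Data ):
#       composite_type = 'basic'
#       file_ext = 'htmlreport'
#       def __init__( self, **kwd ):
#           Data.__init__( self, **kwd )
#           # One required and one optional component file; writable_files
#           # also includes the primary file ('index') automatically.
#           self.add_composite_file( 'report.html', mimetype='text/html' )
#           self.add_composite_file( 'style.css', optional=True )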
class Text( Data ):
    file_ext = 'txt'

    # Add metadata elements
    MetadataElement( name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0 )

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        # clean up the temp file
        os.remove( temp_name )
    def set_raw_data(self, dataset, data):
        """Saves the data to disk"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove( temp_name )
    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'
    def set_meta( self, dataset, **kwd ):
        """
        Set the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and not line.startswith( '#' ):
                data_lines += 1
        dataset.metadata.data_lines = data_lines
    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s lines" % util.commaify( str( dataset.metadata.data_lines ) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    dataset.blurb = "? lines"
            else:
                dataset.blurb = "%s lines" % util.commaify( str( line_count ) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

class Newick( Text ):
    pass
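
# Illustrative sketch of a minimal Text subclass (hypothetical format name).
# Subclasses inherit data_lines counting from Text.set_meta() and the
# newline-normalizing upload path, so most only need file_ext plus any
# format-specific metadata or peek/blurb tweaks:
#
#   class Fictional( Text ):
#       file_ext = 'fict'
#       def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
#           Text.set_peek( self, dataset, line_count=line_count, is_multi_byte=is_multi_byte )
#           if not dataset.dataset.purged:
#               dataset.blurb = 'fictional data, ' + dataset.blurb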
# ------------- Utility methods --------------

def get_test_fname( fname ):
    """Returns test data filename"""
    path, name = os.path.split(__file__)
    full_path = os.path.join( path, 'test', fname )
    return full_path

def nice_size(size):
    """
    Returns a readably formatted string with the size

    >>> nice_size(100)
    '100.0 bytes'
    >>> nice_size(10000)
    '9.8 Kb'
    >>> nice_size(1000000)
    '976.6 Kb'
    >>> nice_size(100000000)
    '95.4 Mb'
    """
    words = [ 'bytes', 'Kb', 'Mb', 'Gb' ]
    try:
        size = float( size )
    except:
        return '??? bytes'
    for ind, word in enumerate(words):
        step = 1024 ** (ind + 1)
        if step > size:
            size = size / float(1024 ** ind)
            out = "%.1f %s" % (size, word)
            return out
    return '??? bytes'
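
# Worked example of the arithmetic above: for size=10000, the 'bytes' step
# (1024**1 = 1024) is not > 10000, but the 'Kb' step (1024**2 = 1048576) is,
# so the function returns 10000 / 1024.0 = 9.765..., formatted as '9.8 Kb'.
# Sizes of 1024**4 bytes (1 Tb) or more fall through the loop and yield
# '??? bytes'.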
def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5 ):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n'
    """
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open( file_name, "U" )
    while count <= LINE_COUNT:
        line = temp.readline( WIDTH )
        if line and not is_multi_byte and not data_checked:
            # See if we have a compressed or binary file
            if line[0:2] == util.gzip_magic:
                file_type = 'gzipped'
                break
            else:
                for char in line:
                    if ord( char ) > 128:
                        file_type = 'binary'
                        break
            data_checked = True
            if file_type in [ 'gzipped', 'binary' ]:
                break
        lines.append( line )
        count += 1
    temp.close()
    if file_type in [ 'gzipped', 'binary' ]:
        text = "%s file" % file_type
    else:
        try:
            text = unicode( '\n'.join( lines ), 'utf-8' )
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text
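
# Illustrative usage sketch of the utilities above (the file name is
# hypothetical; get_test_fname() resolves names relative to this module's
# 'test' directory):
#
#   peek = get_file_peek( get_test_fname( 'some_file.txt' ) )
#   # returns 'gzipped file' or 'binary file' for non-text input, otherwise
#   # up to LINE_COUNT lines truncated to WIDTH characters each
#   blurb = nice_size( os.path.getsize( get_test_fname( 'some_file.txt' ) ) )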