
/lib/galaxy/datatypes/data.py

https://bitbucket.org/cistrome/cistrome-harvard/
  1. import logging
  2. import metadata
  3. import mimetypes
  4. import os
  5. import shutil
  6. import sys
  7. import tempfile
  8. import zipfile
  9. from cgi import escape
  10. from inspect import isclass
  11. from galaxy import util
  12. from galaxy.datatypes.metadata import MetadataElement #import directly to maintain ease of use in Datatype class definitions
  13. from galaxy.util import inflector
  14. from galaxy.util.bunch import Bunch
  15. from galaxy.util.odict import odict
  16. from galaxy.util.sanitize_html import sanitize_html
  17. import dataproviders
  18. from galaxy import eggs
  19. eggs.require( "Paste" )
  20. import paste
  21. XSS_VULNERABLE_MIME_TYPES = [
  22. 'image/svg+xml', # Unfiltered by Galaxy and may contain JS that would be executed by some browsers.
  23. 'application/xml', # Some browsers will evaluate SVG-embedded JS in such XML documents.
  24. ]
  25. DEFAULT_MIME_TYPE = 'text/plain' # Vulnerable mime types will be replaced with this.
  26. log = logging.getLogger(__name__)
  27. comptypes=[] # Is this being used anywhere, why was this here? -JohnC
  28. try:
  29. import zlib
  30. comptypes.append( 'zip' )
  31. except ImportError:
  32. pass
  33. # Valid first column and strand column values for bed, other formats
  34. col1_startswith = ['chr', 'chl', 'groupun', 'reftig_', 'scaffold', 'super_', 'vcho']
  35. valid_strand = ['+', '-', '.']
  36. class DataMeta( type ):
  37. """
  38. Metaclass for Data class. Sets up metadata spec.
  39. """
  40. def __init__( cls, name, bases, dict_ ):
  41. cls.metadata_spec = metadata.MetadataSpecCollection()
  42. for base in bases: #loop through bases (class/types) of cls
  43. if hasattr( base, "metadata_spec" ): #base of class Data (object) has no metadata
  44. cls.metadata_spec.update( base.metadata_spec ) #add contents of metadata spec of base class to cls
  45. metadata.Statement.process( cls )
  46. @dataproviders.decorators.has_dataproviders
  47. class Data( object ):
  48. """
  49. Base class for all datatypes. Implements basic interfaces as well
  50. as class methods for metadata.
  51. >>> class DataTest( Data ):
  52. ... MetadataElement( name="test" )
  53. ...
  54. >>> DataTest.metadata_spec.test.name
  55. 'test'
  56. >>> DataTest.metadata_spec.test.desc
  57. 'test'
  58. >>> type( DataTest.metadata_spec.test.param )
  59. <class 'galaxy.datatypes.metadata.MetadataParameter'>
  60. """
  61. # Data is not chunkable by default.
  62. CHUNKABLE = False
  63. #: dictionary of metadata fields for this datatype::
  64. metadata_spec = None
  65. __metaclass__ = DataMeta
  66. # Add metadata elements
  67. MetadataElement( name="dbkey", desc="Database/Build", default="?", param=metadata.DBKeyParameter, multiple=False, no_value="?" )
  68. # Stores the set of display applications, and viewing methods, supported by this datatype
  69. supported_display_apps = {}
  70. # If False, the peek is regenerated whenever a dataset of this type is copied
  71. copy_safe_peek = True
  72. # The dataset contains binary data --> do not space_to_tab or convert newlines, etc.
  73. # Allow binary file uploads of this type when True.
  74. is_binary = True
  75. # Allow user to change between this datatype and others. If False, this datatype
  76. # cannot be changed from or into.
  77. allow_datatype_change = True
  78. #Composite datatypes
  79. composite_type = None
  80. composite_files = odict()
  81. primary_file_name = 'index'
  82. #A per datatype setting (inherited): max file size (in bytes) for setting optional metadata
  83. _max_optional_metadata_filesize = None
  84. # Trackster track type.
  85. track_type = None
  86. # Data sources.
  87. data_sources = {}
  88. def __init__(self, **kwd):
  89. """Initialize the datatype"""
  90. object.__init__(self, **kwd)
  91. self.supported_display_apps = self.supported_display_apps.copy()
  92. self.composite_files = self.composite_files.copy()
  93. self.display_applications = odict()
  94. def write_from_stream(self, dataset, stream):
  95. """Writes data from a stream"""
  96. fd = open(dataset.file_name, 'wb')
  97. while 1:
  98. chunk = stream.read(1048576)
  99. if not chunk:
  100. break
  101. fd.write(chunk)
  102. fd.close()
  103. def set_raw_data(self, dataset, data):
  104. """Saves the data on the disc"""
  105. fd = open(dataset.file_name, 'wb')
  106. fd.write(data)
  107. fd.close()
  108. def get_raw_data( self, dataset ):
  109. """Returns the full data. To stream it open the file_name and read/write as needed"""
  110. try:
  111. return file(dataset.file_name, 'rb').read(-1)
  112. except OSError, e:
  113. log.exception('%s reading a file that does not exist %s' % (self.__class__.__name__, dataset.file_name))
  114. return ''
  115. def dataset_content_needs_grooming( self, file_name ):
  116. """This function is called on an output dataset file after the content is initially generated."""
  117. return False
  118. def groom_dataset_content( self, file_name ):
  119. """This function is called on an output dataset file if dataset_content_needs_grooming returns True."""
  120. pass
  121. def init_meta( self, dataset, copy_from=None ):
  122. # Metadata should be left mostly uninitialized. Dataset will
  123. # handle returning default values when metadata is not set.
  124. # copy_from allows metadata to be passed in that will be
  125. # copied. (although this seems ambiguous, see
  126. # Dataset.set_metadata. It always copies the rhs in order to
  127. # flag the object as modified for SQLAlchemy.)
  128. if copy_from:
  129. dataset.metadata = copy_from.metadata
  130. def set_meta( self, dataset, overwrite = True, **kwd ):
  131. """Unimplemented method, allows guessing of metadata from contents of file"""
  132. return True
  133. def missing_meta( self, dataset, check = [], skip = [] ):
  134. """
  135. Checks for empty metadata values. Returns True if non-optional metadata is missing.
  136. Specifying a list of 'check' values will only check those names provided; when used, optionality is ignored.
  137. Specifying a list of 'skip' items will ignore those names, so a missing value for a skipped item does not count.
  138. """
  139. if check:
  140. to_check = [ ( to_check, dataset.metadata.get( to_check ) ) for to_check in check ]
  141. else:
  142. to_check = dataset.metadata.items()
  143. for key, value in to_check:
  144. if key in skip or ( not check and dataset.metadata.spec[key].get( "optional" ) ):
  145. continue #we skip check for optional and nonrequested values here
  146. if not value:
  147. return True
  148. return False
  149. def set_max_optional_metadata_filesize( self, max_value ):
  150. try:
  151. max_value = int( max_value )
  152. except:
  153. return
  154. self.__class__._max_optional_metadata_filesize = max_value
  155. def get_max_optional_metadata_filesize( self ):
  156. rval = self.__class__._max_optional_metadata_filesize
  157. if rval is None:
  158. return -1
  159. return rval
  160. max_optional_metadata_filesize = property( get_max_optional_metadata_filesize, set_max_optional_metadata_filesize )
  161. def set_peek( self, dataset, is_multi_byte=False ):
  162. """Set the peek and blurb text"""
  163. if not dataset.dataset.purged:
  164. dataset.peek = ''
  165. dataset.blurb = 'data'
  166. else:
  167. dataset.peek = 'file does not exist'
  168. dataset.blurb = 'file purged from disk'
  169. def display_peek(self, dataset ):
  170. """Create HTML table, used for displaying peek"""
  171. out = ['<table cellspacing="0" cellpadding="3">']
  172. try:
  173. if not dataset.peek:
  174. dataset.set_peek()
  175. data = dataset.peek
  176. lines = data.splitlines()
  177. for line in lines:
  178. line = line.strip()
  179. if not line:
  180. continue
  181. if type( line ) is unicode:
  182. out.append( '<tr><td>%s</td></tr>' % escape( line ) )
  183. else:
  184. out.append( '<tr><td>%s</td></tr>' % escape( unicode( line, 'utf-8' ) ) )
  185. out.append( '</table>' )
  186. out = "".join( out )
  187. except Exception, exc:
  188. out = "Can't create peek %s" % str( exc )
  189. return out
  190. def _archive_main_file(self, archive, display_name, data_filename):
  191. """Called from _archive_composite_dataset to add central file to archive.
  192. Unless subclassed, this will add the main dataset file (argument data_filename)
  193. to the archive, as an HTML file with its filename derived from the dataset name
  194. (argument display_name).
  195. Returns a tuple of boolean, string, string: (error, msg, messagetype)
  196. """
  197. error, msg, messagetype = False, "", ""
  198. archname = '%s.html' % display_name # fake the real nature of the html file
  199. try:
  200. archive.add(data_filename, archname)
  201. except IOError:
  202. error = True
  203. log.exception("Unable to add composite parent %s to temporary library download archive" % data_filename)
  204. msg = "Unable to create archive for download, please report this error"
  205. messagetype = "error"
  206. return error, msg, messagetype
  207. def _archive_composite_dataset( self, trans, data=None, **kwd ):
  208. # save a composite object into a compressed archive for downloading
  209. params = util.Params( kwd )
  210. valid_chars = '.,^_-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
  211. outfname = data.name[0:150]
  212. outfname = ''.join(c in valid_chars and c or '_' for c in outfname)
  213. if (params.do_action == None):
  214. params.do_action = 'zip' # default
  215. msg = util.restore_text( params.get( 'msg', '' ) )
  216. messagetype = params.get( 'messagetype', 'done' )
  217. if not data:
  218. msg = "You must select at least one dataset"
  219. messagetype = 'error'
  220. else:
  221. error = False
  222. try:
  223. if (params.do_action == 'zip'):
  224. # Can't use mkstemp - the file must not exist first
  225. tmpd = tempfile.mkdtemp()
  226. util.umask_fix_perms( tmpd, trans.app.config.umask, 0777, trans.app.config.gid )
  227. tmpf = os.path.join( tmpd, 'library_download.' + params.do_action )
  228. archive = zipfile.ZipFile( tmpf, 'w', zipfile.ZIP_DEFLATED, True )
  229. archive.add = lambda x, y: archive.write( x, y.encode('CP437') )
  230. elif params.do_action == 'tgz':
  231. archive = util.streamball.StreamBall( 'w|gz' )
  232. elif params.do_action == 'tbz':
  233. archive = util.streamball.StreamBall( 'w|bz2' )
  234. except (OSError, zipfile.BadZipfile):
  235. error = True
  236. log.exception( "Unable to create archive for download" )
  237. msg = "Unable to create archive for %s for download, please report this error" % outfname
  238. messagetype = 'error'
  239. if not error:
  240. current_user_roles = trans.get_current_user_roles()
  241. ext = data.extension
  242. path = data.file_name
  243. fname = os.path.split(path)[-1]
  244. efp = data.extra_files_path
  245. #Add any central file to the archive,
  246. display_name = os.path.splitext(outfname)[0]
  247. if not display_name.endswith(ext):
  248. display_name = '%s_%s' % (display_name, ext)
  249. error, msg, messagetype = self._archive_main_file(archive, display_name, path)
  250. if not error:
  251. #Add any child files to the archive,
  252. for root, dirs, files in os.walk(efp):
  253. for fname in files:
  254. fpath = os.path.join(root,fname)
  255. rpath = os.path.relpath(fpath,efp)
  256. try:
  257. archive.add( fpath,rpath )
  258. except IOError:
  259. error = True
  260. log.exception( "Unable to add %s to temporary library download archive" % rpath)
  261. msg = "Unable to create archive for download, please report this error"
  262. messagetype = 'error'
  263. continue
  264. if not error:
  265. if params.do_action == 'zip':
  266. archive.close()
  267. tmpfh = open( tmpf )
  268. # CANNOT clean up - unlink/rmdir was always failing because file handle retained to return - must rely on a cron job to clean up tmp
  269. trans.response.set_content_type( "application/x-zip-compressed" )
  270. trans.response.headers[ "Content-Disposition" ] = 'attachment; filename="%s.zip"' % outfname
  271. return tmpfh
  272. else:
  273. trans.response.set_content_type( "application/x-tar" )
  274. outext = 'tgz'
  275. if params.do_action == 'tbz':
  276. outext = 'tbz'
  277. trans.response.headers[ "Content-Disposition" ] = 'attachment; filename="%s.%s"' % (outfname,outext)
  278. archive.wsgi_status = trans.response.wsgi_status()
  279. archive.wsgi_headeritems = trans.response.wsgi_headeritems()
  280. return archive.stream
  281. return trans.show_error_message( msg )
  282. def _serve_raw(self, trans, dataset, to_ext):
  283. trans.response.headers['Content-Length'] = int( os.stat( dataset.file_name ).st_size )
  284. valid_chars = '.,^_-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
  285. fname = ''.join(c in valid_chars and c or '_' for c in dataset.name)[0:150]
  286. trans.response.set_content_type( "application/octet-stream" ) #force octet-stream so Safari doesn't append mime extensions to filename
  287. trans.response.headers["Content-Disposition"] = 'attachment; filename="Galaxy%s-[%s].%s"' % (dataset.hid, fname, to_ext)
  288. return open( dataset.file_name )
  289. def display_data(self, trans, data, preview=False, filename=None, to_ext=None, size=None, offset=None, **kwd):
  290. """ Old display method, for transition - though still used by API and
  291. test framework. Datatypes should be very careful if overriding this
  292. method and this interface between datatypes and Galaxy will likely
  293. change.
  294. TODO: Document alternatives to overriding this method (data
  295. providers?).
  296. """
  297. #Relocate all composite datatype display to a common location.
  298. composite_extensions = trans.app.datatypes_registry.get_composite_extensions( )
  299. composite_extensions.append('html') # for archiving composite datatypes
  300. #Prevent IE8 from sniffing content type since we're explicit about it. This prevents intentionally text/plain
  301. #content from being rendered in the browser
  302. trans.response.headers['X-Content-Type-Options'] = 'nosniff'
  303. if isinstance( data, basestring ):
  304. return data
  305. if filename and filename != "index":
  306. # For files in extra_files_path
  307. file_path = trans.app.object_store.get_filename(data.dataset, extra_dir='dataset_%s_files' % data.dataset.id, alt_name=filename)
  308. if os.path.exists( file_path ):
  309. if os.path.isdir( file_path ):
  310. return trans.show_error_message( "Directory listing is not allowed." ) #TODO: Reconsider allowing listing of directories?
  311. mime, encoding = mimetypes.guess_type( file_path )
  312. if not mime:
  313. try:
  314. mime = trans.app.datatypes_registry.get_mimetype_by_extension( file_path.split( "." )[-1] )
  315. except:
  316. mime = "text/plain"
  317. self._clean_and_set_mime_type( trans, mime )
  318. return open( file_path )
  319. else:
  320. return trans.show_error_message( "Could not find '%s' on the extra files path %s." % ( filename, file_path ) )
  321. self._clean_and_set_mime_type( trans, data.get_mime() )
  322. trans.log_event( "Display dataset id: %s" % str( data.id ) )
  323. from galaxy import datatypes #DBTODO REMOVE THIS AT REFACTOR
  324. if to_ext or isinstance(data.datatype, datatypes.binary.Binary): # Saving the file, or binary file
  325. if data.extension in composite_extensions:
  326. return self._archive_composite_dataset( trans, data, **kwd )
  327. else:
  328. trans.response.headers['Content-Length'] = int( os.stat( data.file_name ).st_size )
  329. if not to_ext:
  330. to_ext = data.extension
  331. valid_chars = '.,^_-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
  332. fname = ''.join(c in valid_chars and c or '_' for c in data.name)[0:150]
  333. trans.response.set_content_type( "application/octet-stream" ) #force octet-stream so Safari doesn't append mime extensions to filename
  334. trans.response.headers["Content-Disposition"] = 'attachment; filename="Galaxy%s-[%s].%s"' % (data.hid, fname, to_ext)
  335. return open( data.file_name )
  336. if not os.path.exists( data.file_name ):
  337. raise paste.httpexceptions.HTTPNotFound( "File Not Found (%s)." % data.file_name )
  338. max_peek_size = 1000000 # 1 MB
  339. if isinstance(data.datatype, datatypes.images.Html):
  340. max_peek_size = 10000000 # 10 MB for html
  341. preview = util.string_as_bool( preview )
  342. if not preview or isinstance(data.datatype, datatypes.images.Image) or os.stat( data.file_name ).st_size < max_peek_size:
  343. if trans.app.config.sanitize_all_html and trans.response.get_content_type() == "text/html":
  344. # Sanitize anytime we respond with plain text/html content.
  345. return sanitize_html(open( data.file_name ).read())
  346. return open( data.file_name )
  347. else:
  348. trans.response.set_content_type( "text/html" )
  349. return trans.stream_template_mako( "/dataset/large_file.mako",
  350. truncated_data = open( data.file_name ).read(max_peek_size),
  351. data = data)
  352. def display_name(self, dataset):
  353. """Returns formatted html of dataset name"""
  354. try:
  355. if type ( dataset.name ) is unicode:
  356. return escape( dataset.name )
  357. else:
  358. return escape( unicode( dataset.name, 'utf-8' ) )
  359. except:
  360. return "name unavailable"
  361. def display_info(self, dataset):
  362. """Returns formatted html of dataset info"""
  363. try:
  364. # Change new line chars to html
  365. info = escape( dataset.info )
  366. if info.find( '\r\n' ) >= 0:
  367. info = info.replace( '\r\n', '<br/>' )
  368. if info.find( '\r' ) >= 0:
  369. info = info.replace( '\r', '<br/>' )
  370. if info.find( '\n' ) >= 0:
  371. info = info.replace( '\n', '<br/>' )
  372. # Convert to unicode to display non-ascii characters.
  373. if type( info ) is not unicode:
  374. info = unicode( info, 'utf-8')
  375. return info
  376. except:
  377. return "info unavailable"
  378. def validate(self, dataset):
  379. """Unimplemented validate, return no exceptions"""
  380. return list()
  381. def repair_methods(self, dataset):
  382. """Unimplemented method, returns dict with method/option for repairing errors"""
  383. return None
  384. def get_mime(self):
  385. """Returns the mime type of the datatype"""
  386. return 'application/octet-stream'
  387. def add_display_app ( self, app_id, label, file_function, links_function ):
  388. """
  389. Adds a display app to the datatype.
  390. app_id is a unique id
  391. label is the primary display label, e.g., display at 'UCSC'
  392. file_function is a string containing the name of the function that returns a properly formatted display
  393. links_function is a string containing the name of the function that returns a list of (link_name,link)
  394. """
  395. self.supported_display_apps = self.supported_display_apps.copy()
  396. self.supported_display_apps[app_id] = {'label':label,'file_function':file_function,'links_function':links_function}
  397. def remove_display_app (self, app_id):
  398. """Removes a display app from the datatype"""
  399. self.supported_display_apps = self.supported_display_apps.copy()
  400. try:
  401. del self.supported_display_apps[app_id]
  402. except:
  403. log.exception('Tried to remove display app %s from datatype %s, but this display app is not declared.' % ( app_id, self.__class__.__name__ ) )
  404. def clear_display_apps( self ):
  405. self.supported_display_apps = {}
  406. def add_display_application( self, display_application ):
  407. """New style display applications"""
  408. assert display_application.id not in self.display_applications, 'Attempted to add a display application twice'
  409. self.display_applications[ display_application.id ] = display_application
  410. def get_display_application( self, key, default = None ):
  411. return self.display_applications.get( key, default )
  412. def get_display_applications_by_dataset( self, dataset, trans ):
  413. rval = odict()
  414. for key, value in self.display_applications.iteritems():
  415. value = value.filter_by_dataset( dataset, trans )
  416. if value.links:
  417. rval[key] = value
  418. return rval
  419. def get_display_types(self):
  420. """Returns display types available"""
  421. return self.supported_display_apps.keys()
  422. def get_display_label(self, type):
  423. """Returns primary label for display app"""
  424. try:
  425. return self.supported_display_apps[type]['label']
  426. except:
  427. return 'unknown'
  428. def as_display_type(self, dataset, type, **kwd):
  429. """Returns modified file contents for a particular display type """
  430. try:
  431. if type in self.get_display_types():
  432. return getattr (self, self.supported_display_apps[type]['file_function']) (dataset, **kwd)
  433. except:
  434. log.exception('Function %s is referred to in datatype %s for displaying as type %s, but is not accessible' % (self.supported_display_apps[type]['file_function'], self.__class__.__name__, type) )
  435. return "This display type (%s) is not implemented for this datatype (%s)." % ( type, dataset.ext)
  436. def get_display_links( self, dataset, type, app, base_url, target_frame='_blank', **kwd ):
  437. """
  438. Returns a list of tuples of (name, link) for a particular display type. No check on
  439. 'access' permissions is done here - if you can view the dataset, you can also save it
  440. or send it to a destination outside of Galaxy, so Galaxy security restrictions do not
  441. apply anyway.
  442. """
  443. try:
  444. if app.config.enable_old_display_applications and type in self.get_display_types():
  445. return target_frame, getattr ( self, self.supported_display_apps[type]['links_function'] ) ( dataset, type, app, base_url, **kwd )
  446. except:
  447. log.exception( 'Function %s is referred to in datatype %s for generating links for type %s, but is not accessible' \
  448. % ( self.supported_display_apps[type]['links_function'], self.__class__.__name__, type ) )
  449. return target_frame, []
  450. def get_converter_types(self, original_dataset, datatypes_registry):
  451. """Returns available converters by type for this dataset"""
  452. return datatypes_registry.get_converters_by_datatype(original_dataset.ext)
  453. def find_conversion_destination( self, dataset, accepted_formats, datatypes_registry, **kwd ):
  454. """Returns ( target_ext, existing converted dataset )"""
  455. return datatypes_registry.find_conversion_destination_for_dataset_by_extensions( dataset, accepted_formats, **kwd )
  456. def convert_dataset(self, trans, original_dataset, target_type, return_output=False, visible=True, deps=None, set_output_history=True):
  457. """This function adds a job to the queue to convert a dataset to another type. Returns a message about success/failure."""
  458. converter = trans.app.datatypes_registry.get_converter_by_target_type( original_dataset.ext, target_type )
  459. if converter is None:
  460. raise Exception( "A converter does not exist for %s to %s." % ( original_dataset.ext, target_type ) )
  461. #Generate parameter dictionary
  462. params = {}
  463. #determine input parameter name and add to params
  464. input_name = 'input1'
  465. for key, value in converter.inputs.items():
  466. if deps and value.name in deps:
  467. params[value.name] = deps[value.name]
  468. elif value.type == 'data':
  469. input_name = key
  470. params[input_name] = original_dataset
  471. #Run converter, job is dispatched through Queue
  472. converted_dataset = converter.execute( trans, incoming=params, set_output_hid=visible, set_output_history=set_output_history)[1]
  473. if len(params) > 0:
  474. trans.log_event( "Converter params: %s" % (str(params)), tool_id=converter.id )
  475. if not visible:
  476. for name, value in converted_dataset.iteritems():
  477. value.visible = False
  478. if return_output:
  479. return converted_dataset
  480. return "The file conversion of %s on data %s has been added to the Queue." % (converter.name, original_dataset.hid)
  481. #We need to clear associated files before we set metadata,
  482. #so that, as soon as metadata starts to be set, implicitly converted datasets (for example) are deleted and no longer available 'while' metadata is being set, not just after.
  483. #We'll also clear after setting metadata, for backwards compatibility.
  484. def after_setting_metadata( self, dataset ):
  485. """This function is called on the dataset after metadata is set."""
  486. dataset.clear_associated_files( metadata_safe = True )
  487. def before_setting_metadata( self, dataset ):
  488. """This function is called on the dataset before metadata is set."""
  489. dataset.clear_associated_files( metadata_safe = True )
  490. def __new_composite_file( self, name, optional = False, mimetype = None, description = None, substitute_name_with_metadata = None, is_binary = False, to_posix_lines = True, space_to_tab = False, **kwds ):
  491. kwds[ 'name' ] = name
  492. kwds[ 'optional' ] = optional
  493. kwds[ 'mimetype' ] = mimetype
  494. kwds[ 'description' ] = description
  495. kwds[ 'substitute_name_with_metadata' ] = substitute_name_with_metadata
  496. kwds[ 'is_binary' ] = is_binary
  497. kwds[ 'to_posix_lines' ] = to_posix_lines
  498. kwds[ 'space_to_tab' ] = space_to_tab
  499. return Bunch( **kwds )
  500. def add_composite_file( self, name, **kwds ):
  501. #self.composite_files = self.composite_files.copy()
  502. self.composite_files[ name ] = self.__new_composite_file( name, **kwds )
  503. def __substitute_composite_key( self, key, composite_file, dataset = None ):
  504. if composite_file.substitute_name_with_metadata:
  505. if dataset:
  506. meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
  507. else:
  508. meta_value = self.spec[composite_file.substitute_name_with_metadata].default
  509. return key % meta_value
  510. return key
  511. @property
  512. def writable_files( self, dataset = None ):
  513. files = odict()
  514. if self.composite_type != 'auto_primary_file':
  515. files[ self.primary_file_name ] = self.__new_composite_file( self.primary_file_name )
  516. for key, value in self.get_composite_files( dataset = dataset ).iteritems():
  517. files[ key ] = value
  518. return files
  519. def get_composite_files( self, dataset = None ):
  520. def substitute_composite_key( key, composite_file ):
  521. if composite_file.substitute_name_with_metadata:
  522. if dataset:
  523. meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
  524. else:
  525. meta_value = self.metadata_spec[ composite_file.substitute_name_with_metadata ].default
  526. return key % meta_value
  527. return key
  528. files = odict()
  529. for key, value in self.composite_files.iteritems():
  530. files[ substitute_composite_key( key, value ) ] = value
  531. return files
  532. def generate_auto_primary_file( self, dataset = None ):
  533. raise Exception( "generate_auto_primary_file is not implemented for this datatype." )
  534. @property
  535. def has_resolution(self):
  536. return False
  537. def matches_any( self, target_datatypes ):
  538. """
  539. Check if this datatype is of any of the target_datatypes or is
  540. a subtype thereof.
  541. """
  542. datatype_classes = tuple( [ datatype if isclass( datatype ) else datatype.__class__ for datatype in target_datatypes ] )
  543. return isinstance( self, datatype_classes )
  544. def merge( split_files, output_file):
  545. """
  546. Merging files with shutil.copyfileobj() will not hit the
  547. max argument limitation of cat. gz and bz2 files also work.
  548. """
  549. if not split_files:
  550. raise ValueError('Asked to merge zero files as %s' % output_file)
  551. elif len(split_files) == 1:
  552. shutil.copyfileobj(open(split_files[0], 'rb'), open(output_file, 'wb'))
  553. else:
  554. fdst = open(output_file, 'wb')
  555. for fsrc in split_files:
  556. shutil.copyfileobj(open(fsrc, 'rb'), fdst)
  557. fdst.close()
  558. merge = staticmethod(merge)
  559. def get_visualizations( self, dataset ):
  560. """
  561. Returns a list of visualizations for datatype.
  562. """
  563. if self.track_type:
  564. return [ 'trackster', 'circster' ]
  565. return []
  566. # ------------- Dataproviders
  567. def has_dataprovider( self, data_format ):
  568. """
  569. Returns True if `data_format` is available in `dataproviders`.
  570. """
  571. return ( data_format in self.dataproviders )
  572. def dataprovider( self, dataset, data_format, **settings ):
  573. """
  574. Base dataprovider factory for all datatypes that returns the proper provider
  575. for the given `data_format` or raises a `NoProviderAvailable`.
  576. """
  577. if self.has_dataprovider( data_format ):
  578. return self.dataproviders[ data_format ]( self, dataset, **settings )
  579. raise dataproviders.exceptions.NoProviderAvailable( self, data_format )
  580. @dataproviders.decorators.dataprovider_factory( 'base' )
  581. def base_dataprovider( self, dataset, **settings ):
  582. dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
  583. return dataproviders.base.DataProvider( dataset_source, **settings )
  584. @dataproviders.decorators.dataprovider_factory( 'chunk', dataproviders.chunk.ChunkDataProvider.settings )
  585. def chunk_dataprovider( self, dataset, **settings ):
  586. dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
  587. return dataproviders.chunk.ChunkDataProvider( dataset_source, **settings )
  588. @dataproviders.decorators.dataprovider_factory( 'chunk64', dataproviders.chunk.Base64ChunkDataProvider.settings )
  589. def chunk64_dataprovider( self, dataset, **settings ):
  590. dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
  591. return dataproviders.chunk.Base64ChunkDataProvider( dataset_source, **settings )
  592. def _clean_and_set_mime_type(self, trans, mime):
  593. if mime.lower() in XSS_VULNERABLE_MIME_TYPES:
  594. if not getattr( trans.app.config, "serve_xss_vulnerable_mimetypes", True ):
  595. mime = DEFAULT_MIME_TYPE
  596. trans.response.set_content_type( mime )
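Editor's note: the metadata machinery above is easiest to see with a tiny subclass. The sketch below is an illustrative addition, not part of data.py; the class name ExampleData and its 'sequences' element are made up. It shows how a MetadataElement declared in a class body is collected by the DataMeta metaclass into metadata_spec (alongside the inherited dbkey element) and how matches_any() resolves subtype checks, mirroring the doctest in the Data docstring.

class ExampleData( Data ):
    """Hypothetical datatype used only to illustrate the metadata spec machinery (editor's sketch)."""
    file_ext = 'example'
    # Collected by the DataMeta metaclass into ExampleData.metadata_spec,
    # next to the 'dbkey' element inherited from Data.
    MetadataElement( name="sequences", default=0, desc="Number of sequences",
                     readonly=True, optional=True, no_value=0 )

# ExampleData.metadata_spec.sequences.desc  -> 'Number of sequences'
# ExampleData.metadata_spec.dbkey.name      -> 'dbkey' (merged from the base class spec)
# ExampleData().matches_any( [ Data ] )     -> True (accepts classes or instances)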
  597. @dataproviders.decorators.has_dataproviders
  598. class Text( Data ):
  599. file_ext = 'txt'
  600. line_class = 'line'
  601. """Add metadata elements"""
  602. MetadataElement( name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0 )
  603. def write_from_stream(self, dataset, stream):
  604. """Writes data from a stream"""
  605. # write it twice for now
  606. fd, temp_name = tempfile.mkstemp()
  607. while 1:
  608. chunk = stream.read(1048576)
  609. if not chunk:
  610. break
  611. os.write(fd, chunk)
  612. os.close(fd)
  613. # rewrite the file with unix newlines
  614. fp = open(dataset.file_name, 'wt')
  615. for line in file(temp_name, "U"):
  616. line = line.strip() + '\n'
  617. fp.write(line)
  618. fp.close()
  619. def set_raw_data(self, dataset, data):
  620. """Saves the data on the disc"""
  621. fd, temp_name = tempfile.mkstemp()
  622. os.write(fd, data)
  623. os.close(fd)
  624. # rewrite the file with unix newlines
  625. fp = open(dataset.file_name, 'wt')
  626. for line in file(temp_name, "U"):
  627. line = line.strip() + '\n'
  628. fp.write(line)
  629. fp.close()
  630. os.remove( temp_name )
  631. def get_mime(self):
  632. """Returns the mime type of the datatype"""
  633. return 'text/plain'
  634. def set_meta( self, dataset, **kwd ):
  635. """
  636. Set the number of lines of data in dataset.
  637. """
  638. dataset.metadata.data_lines = self.count_data_lines(dataset)
  639. def estimate_file_lines( self, dataset ):
  640. """
  641. Perform a rough estimate by extrapolating number of lines from a small read.
  642. """
  643. sample_size = 1048576
  644. dataset_fh = open( dataset.file_name )
  645. dataset_read = dataset_fh.read(sample_size)
  646. dataset_fh.close()
  647. sample_lines = dataset_read.count('\n')
  648. est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
  649. return est_lines
  650. def count_data_lines(self, dataset):
  651. """
  652. Count the number of lines of data in dataset,
  653. skipping all blank lines and comments.
  654. """
  655. data_lines = 0
  656. for line in file( dataset.file_name ):
  657. line = line.strip()
  658. if line and not line.startswith( '#' ):
  659. data_lines += 1
  660. return data_lines
  661. def set_peek( self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[] ):
  662. """
  663. Set the peek. This method is used by various subclasses of Text.
  664. """
  665. if not dataset.dataset.purged:
  666. # The file must exist on disk for the get_file_peek() method
  667. dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, skipchars=skipchars )
  668. if line_count is None:
  669. # See if line_count is stored in the metadata
  670. if dataset.metadata.data_lines:
  671. dataset.blurb = "%s %s" % ( util.commaify( str(dataset.metadata.data_lines) ), inflector.cond_plural(dataset.metadata.data_lines, self.line_class) )
  672. else:
  673. # Number of lines is not known ( this should not happen ), and auto-detect is
  674. # needed to set metadata
  675. # This can happen when the file is larger than max_optional_metadata_filesize.
  676. if int(dataset.get_size()) <= 1048576:
  677. #Small dataset, recount all lines and reset peek afterward.
  678. lc = self.count_data_lines(dataset)
  679. dataset.metadata.data_lines = lc
  680. dataset.blurb = "%s %s" % ( util.commaify( str(lc) ), inflector.cond_plural(lc, self.line_class) )
  681. else:
  682. est_lines = self.estimate_file_lines(dataset)
  683. dataset.blurb = "~%s %s" % ( util.commaify(util.roundify(str(est_lines))), inflector.cond_plural(est_lines, self.line_class) )
  684. else:
  685. dataset.blurb = "%s %s" % ( util.commaify( str(line_count) ), inflector.cond_plural(line_count, self.line_class) )
  686. else:
  687. dataset.peek = 'file does not exist'
  688. dataset.blurb = 'file purged from disk'
  689. def split( cls, input_datasets, subdir_generator_function, split_params):
  690. """
  691. Split the input files by line.
  692. """
  693. if split_params is None:
  694. return
  695. if len(input_datasets) > 1:
  696. raise Exception("Text file splitting does not support multiple files")
  697. input_files = [ds.file_name for ds in input_datasets]
  698. lines_per_file = None
  699. chunk_size = None
  700. if split_params['split_mode'] == 'number_of_parts':
  701. lines_per_file = []
  702. # Computing the length is expensive!
  703. def _file_len(fname):
  704. i = 0
  705. f = open(fname)
  706. for i, l in enumerate(f):
  707. pass
  708. f.close()
  709. return i + 1
  710. length = _file_len(input_files[0])
  711. parts = int(split_params['split_size'])
  712. if length < parts:
  713. parts = length
  714. len_each, remainder = divmod(length, parts)
  715. while length > 0:
  716. chunk = len_each
  717. if remainder > 0:
  718. chunk += 1
  719. lines_per_file.append(chunk)
  720. remainder -= 1
  721. length -= chunk
  722. elif split_params['split_mode'] == 'to_size':
  723. chunk_size = int(split_params['split_size'])
  724. else:
  725. raise Exception('Unsupported split mode %s' % split_params['split_mode'])
  726. f = open(input_files[0], 'rt')
  727. try:
  728. chunk_idx = 0
  729. file_done = False
  730. part_file = None
  731. while not file_done:
  732. if lines_per_file is None:
  733. this_chunk_size = chunk_size
  734. elif chunk_idx < len(lines_per_file):
  735. this_chunk_size = lines_per_file[chunk_idx]
  736. chunk_idx += 1
  737. lines_remaining = this_chunk_size
  738. part_file = None
  739. while lines_remaining > 0:
  740. a_line = f.readline()
  741. if a_line == '':
  742. file_done = True
  743. break
  744. if part_file is None:
  745. part_dir = subdir_generator_function()
  746. part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
  747. part_file = open(part_path, 'w')
  748. part_file.write(a_line)
  749. lines_remaining -= 1
  750. if part_file is not None:
  751. part_file.close()
  752. except Exception, e:
  753. log.error('Unable to split files: %s' % str(e))
  754. f.close()
  755. if part_file is not None:
  756. part_file.close()
  757. raise
  758. f.close()
  759. split = classmethod(split)
  760. # ------------- Dataproviders
  761. @dataproviders.decorators.dataprovider_factory( 'line', dataproviders.line.FilteredLineDataProvider.settings )
  762. def line_dataprovider( self, dataset, **settings ):
  763. """
  764. Returns an iterator over the dataset's lines (that have been `strip`ed)
  765. optionally excluding blank lines and lines that start with a comment character.
  766. """
  767. dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
  768. return dataproviders.line.FilteredLineDataProvider( dataset_source, **settings )
  769. @dataproviders.decorators.dataprovider_factory( 'regex-line', dataproviders.line.RegexLineDataProvider.settings )
  770. def regex_line_dataprovider( self, dataset, **settings ):
  771. """
  772. Returns an iterator over the dataset's lines
  773. optionally including/excluding lines that match one or more regex filters.
  774. """
  775. dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
  776. return dataproviders.line.RegexLineDataProvider( dataset_source, **settings )
  777. class GenericAsn1( Text ):
  778. """Class for generic ASN.1 text format"""
  779. file_ext = 'asn1'
  780. class LineCount( Text ):
  781. """
  782. Dataset contains a single line with a single integer that denotes the
  783. line count for a related dataset. Used for custom builds.
  784. """
  785. pass
  786. class Newick( Text ):
  787. """New Hampshire/Newick Format"""
  788. file_ext = "nhx"
  789. def __init__(self, **kwd):
  790. """Initialize the Newick datatype"""
  791. Text.__init__( self, **kwd )
  792. def init_meta( self, dataset, copy_from=None ):
  793. Text.init_meta( self, dataset, copy_from=copy_from )
  794. def sniff( self, filename ):
  795. """Return False; the Newick format is too general to be sniffed."""
  796. return False
  797. def get_visualizations( self, dataset ):
  798. """
  799. Returns a list of visualizations for datatype.
  800. """
  801. return [ 'phyloviz' ]
  802. class Nexus( Text ):
  803. """Nexus format as used by PAUP, MrBayes, etc."""
  804. file_ext = "nex"
  805. def __init__(self, **kwd):
  806. """Initialize the Nexus datatype"""
  807. Text.__init__( self, **kwd )
  808. def init_meta( self, dataset, copy_from=None ):
  809. Text.init_meta( self, dataset, copy_from=copy_from )
  810. def sniff( self, filename ):
  811. """All Nexus files put '#NEXUS' on their first line."""
  812. f = open( filename, "r" )
  813. firstline = f.readline().upper()
  814. f.close()
  815. if "#NEXUS" in firstline:
  816. return True
  817. else:
  818. return False
  819. def get_visualizations( self, dataset ):
  820. """
  821. Returns a list of visualizations for datatype.
  822. """
  823. return [ 'phyloviz' ]
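Editor's note: Nexus.sniff() keys on a '#NEXUS' first line, while Newick.sniff() always returns False, so telling the two apart means trying each candidate in turn. The helper below is an illustrative sketch of that pattern, not part of data.py (Galaxy's real sniffing lives in galaxy.datatypes.sniff and the datatypes registry); the function name and the plain-text fallback are assumptions.

def _guess_tree_format( filename ):
    """Illustrative only: try each candidate datatype's sniff() and fall back to plain text."""
    for candidate in ( Nexus(), Newick() ):
        if candidate.sniff( filename ):
            return candidate.file_ext
    return Text.file_ext  # 'txt' when nothing matches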
  824. # ------------- Utility methods --------------
  825. # nice_size used to be here, but to resolve cyclical dependencies it's been
  826. # moved to galaxy.util. It belongs there anyway since it's used outside
  827. # datatypes.
  828. nice_size = util.nice_size
  829. def get_test_fname( fname ):
  830. """Returns test data filename"""
  831. path, name = os.path.split(__file__)
  832. full_path = os.path.join( path, 'test', fname )
  833. return full_path
  834. def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=[] ):
  835. """
  836. Returns the first LINE_COUNT lines wrapped to WIDTH
  837. ## >>> fname = get_test_fname('4.bed')
  838. ## >>> get_file_peek(fname)
  839. ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n'
  840. """
  841. # Set size for file.readline() to a negative number to force it to
  842. # read until either a newline or EOF. Needed for datasets with very
  843. # long lines.
  844. if WIDTH == 'unlimited':
  845. WIDTH = -1
  846. lines = []
  847. count = 0
  848. file_type = None
  849. data_checked = False
  850. temp = open( file_name, "U" )
  851. while count <= LINE_COUNT:
  852. line = temp.readline( WIDTH )
  853. if line and not is_multi_byte and not data_checked:
  854. # See if we have a compressed or binary file
  855. if line[0:2] == util.gzip_magic:
  856. file_type = 'gzipped'
  857. break
  858. else:
  859. for char in line:
  860. if ord( char ) > 128:
  861. file_type = 'binary'
  862. break
  863. data_checked = True
  864. if file_type in [ 'gzipped', 'binary' ]:
  865. break
  866. skip_line = False
  867. for skipchar in skipchars:
  868. if line.startswith( skipchar ):
  869. skip_line = True
  870. break
  871. if not skip_line:
  872. lines.append( line )
  873. count += 1
  874. temp.close()
  875. if file_type in [ 'gzipped', 'binary' ]:
  876. text = "%s file" % file_type
  877. else:
  878. try:
  879. text = util.unicodify( '\n'.join( lines ) )
  880. except UnicodeDecodeError:
  881. text = "binary/unknown file"
  882. return text
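Editor's note: a small demonstration of the module-level helpers above. The block below is an illustrative addition guarded by __main__, so importing the module is unchanged; it simply peeks at this source file with get_file_peek() and assumes a working Galaxy environment, since the imports at the top of the file require one.

if __name__ == '__main__':
    # Illustrative only: print the first few width-limited lines of this file.
    peek = get_file_peek( __file__, WIDTH=80, LINE_COUNT=3 )
    for peek_line in peek.splitlines():
        print peek_line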