/lib/galaxy/datatypes/data.py
import logging, os, sys, time, tempfile
from galaxy import util
from galaxy.util.odict import odict
from galaxy.util.bunch import Bunch
from cgi import escape
import metadata
import zipfile
from metadata import MetadataElement #import directly to maintain ease of use in Datatype class definitions

log = logging.getLogger(__name__)

# Valid first column and strand column values for bed and other formats
col1_startswith = ['chr', 'chl', 'groupun', 'reftig_', 'scaffold', 'super_', 'vcho']
valid_strand = ['+', '-', '.']

class DataMeta( type ):
    """
    Metaclass for Data class.  Sets up metadata spec.
    """
    def __init__( cls, name, bases, dict_ ):
        cls.metadata_spec = metadata.MetadataSpecCollection()
        for base in bases: #loop through bases (class/types) of cls
            if hasattr( base, "metadata_spec" ): #base of class Data (object) has no metadata
                cls.metadata_spec.update( base.metadata_spec ) #add contents of metadata spec of base class to cls
        metadata.Statement.process( cls )

class Data( object ):
    """
    Base class for all datatypes.  Implements basic interfaces as well
    as class methods for metadata.

    >>> class DataTest( Data ):
    ...     MetadataElement( name="test" )
    ...
    >>> DataTest.metadata_spec.test.name
    'test'
    >>> DataTest.metadata_spec.test.desc
    'test'
    >>> type( DataTest.metadata_spec.test.param )
    <class 'galaxy.datatypes.metadata.MetadataParameter'>

    """
    __metaclass__ = DataMeta
    # Add metadata elements
    MetadataElement( name="dbkey", desc="Database/Build", default="?", param=metadata.DBKeyParameter, multiple=False, no_value="?" )
    # Stores the set of display applications, and viewing methods, supported by this datatype
    supported_display_apps = {}
    # If False, the peek is regenerated whenever a dataset of this type is copied
    copy_safe_peek = True
    # The dataset contains binary data --> do not space_to_tab or convert newlines, etc.
    # Allow binary file uploads of this type when True.
    is_binary = True
    # Allow user to change between this datatype and others.  If False, this datatype
    # cannot be changed from or into.
    allow_datatype_change = True
    #Composite datatypes
    composite_type = None
    composite_files = odict()
    primary_file_name = 'index'
    #A per datatype setting (inherited): max file size (in bytes) for setting optional metadata
    _max_optional_metadata_filesize = None

    def __init__(self, **kwd):
        """Initialize the datatype"""
        object.__init__(self, **kwd)
        self.supported_display_apps = self.supported_display_apps.copy()
        self.composite_files = self.composite_files.copy()
        self.display_applications = odict()
    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        fp = open(dataset.file_name, 'wb')
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            fp.write(chunk)
        fp.close()
    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fp = open(dataset.file_name, 'wb')
        fp.write(data)
        fp.close()
    def get_raw_data( self, dataset ):
        """Returns the full data.  To stream it open the file_name and read/write as needed"""
        try:
            return file(dataset.file_name, 'rb').read(-1)
        except IOError, e:
            log.exception('%s reading a file that does not exist %s' % (self.__class__.__name__, dataset.file_name))
            return ''
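    # Illustrative sketch (not part of the class): dataset objects are supplied
    # by Galaxy; anything with a writable 'file_name' path would work the same
    # way, e.g.
    #
    #   upload = open( '/tmp/incoming.dat', 'rb' )      # hypothetical source stream
    #   datatype.write_from_stream( dataset, upload )   # chunked copy to dataset.file_name
    #   raw = datatype.get_raw_data( dataset )          # whole file as a string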
    def groom_dataset_content( self, file_name ):
        """This function is called on an output dataset file after the content is initially generated."""
        pass
    def init_meta( self, dataset, copy_from=None ):
        # Metadata should be left mostly uninitialized.  Dataset will
        # handle returning default values when metadata is not set.
        # copy_from allows metadata to be passed in that will be copied
        # (although this seems ambiguous, see Dataset.set_metadata; it
        # always copies the rhs in order to flag the object as modified
        # for SQLAlchemy).
        if copy_from:
            dataset.metadata = copy_from.metadata
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Unimplemented method, allows guessing of metadata from contents of file"""
        return True
    def missing_meta( self, dataset, check = [], skip = [] ):
        """
        Checks for empty metadata values.  Returns True if non-optional metadata is missing.
        Specifying a list of 'check' values will only check those names provided; when used, optionality is ignored.
        Specifying a list of 'skip' items will exclude those names from the check, so missing values among them are ignored.
        """
        if check:
            to_check = [ ( to_check, dataset.metadata.get( to_check ) ) for to_check in check ]
        else:
            to_check = dataset.metadata.items()
        for key, value in to_check:
            if key in skip or ( not check and dataset.metadata.spec[key].get( "optional" ) ):
                continue #we skip check for optional and nonrequested values here
            if not value:
                return True
        return False
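    # Illustrative sketch (metadata names are examples, not declared here):
    # given a dataset whose spec declares 'dbkey' (optional) and a required
    # 'columns' element, one would expect roughly:
    #
    #   datatype.missing_meta( dataset )                      # True if 'columns' is unset
    #   datatype.missing_meta( dataset, check=[ 'dbkey' ] )   # optionality ignored; True if 'dbkey' is unset
    #   datatype.missing_meta( dataset, skip=[ 'columns' ] )  # 'columns' is never examined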
info""" 173 try: 174 # Change new line chars to html 175 info = escape( dataset.info ) 176 if info.find( '\r\n' ) >= 0: 177 info = info.replace( '\r\n', '<br/>' ) 178 if info.find( '\r' ) >= 0: 179 info = info.replace( '\r', '<br/>' ) 180 if info.find( '\n' ) >= 0: 181 info = info.replace( '\n', '<br/>' ) 182 183 # Convert to unicode to display non-ascii characters. 184 if type( info ) is not unicode: 185 info = unicode( info, 'utf-8') 186 187 return info 188 except: 189 return "info unavailable" 190 def validate(self, dataset): 191 """Unimplemented validate, return no exceptions""" 192 return list() 193 def repair_methods(self, dataset): 194 """Unimplemented method, returns dict with method/option for repairing errors""" 195 return None 196 def get_mime(self): 197 """Returns the mime type of the datatype""" 198 return 'application/octet-stream' 199 def add_display_app ( self, app_id, label, file_function, links_function ): 200 """ 201 Adds a display app to the datatype. 202 app_id is a unique id 203 label is the primary display label, e.g., display at 'UCSC' 204 file_function is a string containing the name of the function that returns a properly formatted display 205 links_function is a string containing the name of the function that returns a list of (link_name,link) 206 """ 207 self.supported_display_apps = self.supported_display_apps.copy() 208 self.supported_display_apps[app_id] = {'label':label,'file_function':file_function,'links_function':links_function} 209 def remove_display_app (self, app_id): 210 """Removes a display app from the datatype""" 211 self.supported_display_apps = self.supported_display_apps.copy() 212 try: 213 del self.supported_display_apps[app_id] 214 except: 215 log.exception('Tried to remove display app %s from datatype %s, but this display app is not declared.' % ( type, self.__class__.__name__ ) ) 216 def clear_display_apps( self ): 217 self.supported_display_apps = {} 218 def add_display_application( self, display_application ): 219 """New style display applications""" 220 assert display_application.id not in self.display_applications, 'Attempted to add a display application twice' 221 self.display_applications[ display_application.id ] = display_application 222 def get_display_application( self, key, default = None ): 223 return self.display_applications.get( key, default ) 224 def get_display_applications_by_dataset( self, dataset, trans ): 225 rval = odict() 226 for key, value in self.display_applications.iteritems(): 227 value = value.filter_by_dataset( dataset, trans ) 228 if value.links: 229 rval[key] = value 230 return rval 231 def get_display_types(self): 232 """Returns display types available""" 233 return self.supported_display_apps.keys() 234 def get_display_label(self, type): 235 """Returns primary label for display app""" 236 try: 237 return self.supported_display_apps[type]['label'] 238 except: 239 return 'unknown' 240 def as_display_type(self, dataset, type, **kwd): 241 """Returns modified file contents for a particular display type """ 242 try: 243 if type in self.get_display_types(): 244 return getattr (self, self.supported_display_apps[type]['file_function']) (dataset, **kwd) 245 except: 246 log.exception('Function %s is referred to in datatype %s for displaying as type %s, but is not accessible' % (self.supported_display_apps[type]['file_function'], self.__class__.__name__, type) ) 247 return "This display type (%s) is not implemented for this datatype (%s)." 
    def get_converter_types(self, original_dataset, datatypes_registry):
        """Returns available converters by type for this dataset"""
        return datatypes_registry.get_converters_by_datatype(original_dataset.ext)
    def find_conversion_destination( self, dataset, accepted_formats, datatypes_registry, **kwd ):
        """Returns ( target_ext, existing converted dataset )"""
        return datatypes_registry.find_conversion_destination_for_dataset_by_extensions( dataset, accepted_formats, **kwd )
    def convert_dataset(self, trans, original_dataset, target_type, return_output = False, visible = True, deps=None):
        """This function adds a job to the queue to convert a dataset to another type.  Returns a message about success/failure."""
        converter = trans.app.datatypes_registry.get_converter_by_target_type( original_dataset.ext, target_type )

        if converter is None:
            raise Exception( "A converter does not exist for %s to %s." % ( original_dataset.ext, target_type ) )
        #Generate parameter dictionary
        params = {}
        #determine input parameter name and add to params
        input_name = 'input1'
        for key, value in converter.inputs.items():
            if deps and value.name in deps:
                params[value.name] = deps[value.name]
            elif value.type == 'data':
                input_name = key

        params[input_name] = original_dataset
        #Run converter, job is dispatched through Queue
        converted_dataset = converter.execute( trans, incoming = params, set_output_hid = visible )[1]
        if len(params) > 0:
            trans.log_event( "Converter params: %s" % (str(params)), tool_id=converter.id )
        if not visible:
            for name, value in converted_dataset.iteritems():
                value.visible = False
        if return_output:
            return converted_dataset
        return "The file conversion of %s on data %s has been added to the Queue." % (converter.name, original_dataset.hid)
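    # Illustrative sketch (target extension is an example): converting a dataset
    # queues a job on the converter tool and, by default, returns a message;
    # with return_output=True the converter's output datasets come back instead:
    #
    #   msg = datatype.convert_dataset( trans, dataset, 'bed' )
    #   outputs = datatype.convert_dataset( trans, dataset, 'bed', return_output=True, visible=False )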
    #We need to clear associated files before we set metadata so that, e.g.,
    #implicitly converted datasets are deleted and no longer available while
    #metadata is being set, not just after.
    #We'll also clear after setting metadata, for backwards compatibility.
    def after_setting_metadata( self, dataset ):
        """This function is called on the dataset after metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
    def before_setting_metadata( self, dataset ):
        """This function is called on the dataset before metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
    def __new_composite_file( self, name, optional = False, mimetype = None, description = None, substitute_name_with_metadata = None, is_binary = False, space_to_tab = False, **kwds ):
        kwds[ 'name' ] = name
        kwds[ 'optional' ] = optional
        kwds[ 'mimetype' ] = mimetype
        kwds[ 'description' ] = description
        kwds[ 'substitute_name_with_metadata' ] = substitute_name_with_metadata
        kwds[ 'is_binary' ] = is_binary
        kwds[ 'space_to_tab' ] = space_to_tab
        return Bunch( **kwds )
    def add_composite_file( self, name, **kwds ):
        #self.composite_files = self.composite_files.copy()
        self.composite_files[ name ] = self.__new_composite_file( name, **kwds )
    def __substitute_composite_key( self, key, composite_file, dataset = None ):
        if composite_file.substitute_name_with_metadata:
            if dataset:
                meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
            else:
                meta_value = self.metadata_spec[ composite_file.substitute_name_with_metadata ].default
            return key % meta_value
        return key
    @property
    def writable_files( self ):
        files = odict()
        if self.composite_type != 'auto_primary_file':
            files[ self.primary_file_name ] = self.__new_composite_file( self.primary_file_name )
        for key, value in self.get_composite_files().iteritems():
            files[ key ] = value
        return files
    def get_composite_files( self, dataset = None ):
        def substitute_composite_key( key, composite_file ):
            if composite_file.substitute_name_with_metadata:
                if dataset:
                    meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
                else:
                    meta_value = self.metadata_spec[ composite_file.substitute_name_with_metadata ].default
                return key % meta_value
            return key
        files = odict()
        for key, value in self.composite_files.iteritems():
            files[ substitute_composite_key( key, value ) ] = value
        return files
    def generate_auto_primary_file( self, dataset = None ):
        raise Exception( "generate_auto_primary_file is not implemented for this datatype." )
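    # Illustrative sketch (hypothetical datatype and metadata name): a composite
    # datatype declares its member files once, optionally keying a file name on
    # a metadata value via a '%s' placeholder:
    #
    #   class MyComposite( Data ):
    #       composite_type = 'basic'
    #       def __init__( self, **kwd ):
    #           Data.__init__( self, **kwd )
    #           self.add_composite_file( 'summary.txt' )
    #           self.add_composite_file( '%s.dat', substitute_name_with_metadata='base_name', optional=True )
    #
    # get_composite_files( dataset ) substitutes dataset.metadata.base_name into
    # the '%s.dat' key; writable_files additionally includes the primary file
    # unless composite_type is 'auto_primary_file'.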
    @property
    def has_resolution(self):
        return False

class Text( Data ):
    file_ext = 'txt'

    # Add metadata elements
    MetadataElement( name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0 )

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove( temp_name )
    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove( temp_name )
    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'
    def set_meta( self, dataset, **kwd ):
        """
        Set the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and not line.startswith( '#' ):
                data_lines += 1
        dataset.metadata.data_lines = data_lines
    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s lines" % util.commaify( str( dataset.metadata.data_lines ) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    dataset.blurb = "? lines"
            else:
                dataset.blurb = "%s lines" % util.commaify( str( line_count ) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

class Newick( Text ):
    pass
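# Illustrative sketch (hypothetical datatype, not part of this module): most
# line-oriented formats simply subclass Text and refine the metadata spec or
# the peek, in the same way Newick does above, e.g.
#
#   class MyRecords( Text ):
#       file_ext = 'myrec'
#       MetadataElement( name="records", default=0, desc="Number of records",
#                        readonly=True, visible=False, optional=True, no_value=0 )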
# ------------- Utility methods --------------

def get_test_fname( fname ):
    """Returns test data filename"""
    path, name = os.path.split(__file__)
    full_path = os.path.join( path, 'test', fname )
    return full_path

def nice_size(size):
    """
    Returns a readably formatted string with the size

    >>> nice_size(100)
    '100.0 bytes'
    >>> nice_size(10000)
    '9.8 Kb'
    >>> nice_size(1000000)
    '976.6 Kb'
    >>> nice_size(100000000)
    '95.4 Mb'
    """
    words = [ 'bytes', 'Kb', 'Mb', 'Gb' ]
    try:
        size = float( size )
    except:
        return '??? bytes'
    for ind, word in enumerate(words):
        step = 1024 ** (ind + 1)
        if step > size:
            size = size / float(1024 ** ind)
            out = "%.1f %s" % (size, word)
            return out
    return '??? bytes'

def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5 ):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n'
    """
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open( file_name, "U" )
    while count <= LINE_COUNT:
        line = temp.readline( WIDTH )
        if line and not is_multi_byte and not data_checked:
            # See if we have a compressed or binary file
            if line[0:2] == util.gzip_magic:
                file_type = 'gzipped'
                break
            else:
                for char in line:
                    if ord( char ) > 128:
                        file_type = 'binary'
                        break
                data_checked = True
            if file_type in [ 'gzipped', 'binary' ]:
                break
        lines.append( line )
        count += 1
    temp.close()
    if file_type in [ 'gzipped', 'binary' ]:
        text = "%s file" % file_type
    else:
        try:
            text = unicode( '\n'.join( lines ), 'utf-8' )
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text
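# Illustrative sketch (paths and outputs are hypothetical): peeking reads the
# first few lines, each truncated to WIDTH characters, and returns a unicode
# string, while gzipped or high-bit binary content collapses to a short label:
#
#   get_file_peek( '/tmp/example.txt' )      # first lines of the file
#   get_file_peek( '/tmp/example.txt.gz' )   # 'gzipped file'
#   get_file_peek( '/tmp/image.png' )        # 'binary file'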