/lib/galaxy/datatypes/sequence.py


  1. """
  2. Sequence classes
  3. """
  4. from . import data
  5. import gzip
  6. import json
  7. import logging
  8. import os
  9. import re
  10. import string
  11. from cgi import escape
  12. from galaxy import eggs, util
  13. from galaxy.datatypes import metadata
  14. from galaxy.datatypes.checkers import is_gzip
  15. from galaxy.datatypes.sniff import get_test_fname, get_headers
  16. from galaxy.datatypes.metadata import MetadataElement
  17. try:
  18. eggs.require( "bx-python" )
  19. import bx.align.maf
  20. except:
  21. pass
  22. log = logging.getLogger(__name__)

class SequenceSplitLocations( data.Text ):
    """
    Class storing information about a sequence file composed of multiple gzip files concatenated as
    one OR an uncompressed file. In the GZIP case, each sub-file's location is stored in start and end.

    The format of the file is JSON::

        { "sections" : [
            { "start" : "x", "end" : "y", "sequences" : "z" },
            ...
        ]}
    """
    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            try:
                parsed_data = json.load(open(dataset.file_name))
                # dataset.peek = json.dumps(data, sort_keys=True, indent=4)
                dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
                dataset.blurb = '%d sections' % len(parsed_data['sections'])
            except Exception, e:
                dataset.peek = 'Not FQTOC file'
                dataset.blurb = 'Not FQTOC file'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    file_ext = "fqtoc"

    def sniff( self, filename ):
        if os.path.getsize(filename) < 50000:
            try:
                data = json.load(open(filename))
                sections = data['sections']
                for section in sections:
                    if 'start' not in section or 'end' not in section or 'sequences' not in section:
                        return False
                return True
            except:
                pass
        return False

class Sequence( data.Text ):
    """Class describing a sequence"""

    """Add metadata elements"""
    MetadataElement( name="sequences", default=0, desc="Number of sequences", readonly=True, visible=False, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines in dataset.
        """
        data_lines = 0
        sequences = 0
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and line.startswith( '#' ):
                # We don't count comment lines for sequence data types
                continue
            if line and line.startswith( '>' ):
                sequences += 1
                data_lines += 1
            else:
                data_lines += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences

    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.sequences:
                dataset.blurb = "%s sequences" % util.commaify( str( dataset.metadata.sequences ) )
            else:
                dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def get_sequences_per_file(total_sequences, split_params):
        if split_params['split_mode'] == 'number_of_parts':
            # legacy basic mode - split into a specified number of parts
            parts = int(split_params['split_size'])
            sequences_per_file = [total_sequences / parts for i in range(parts)]
            for i in range(total_sequences % parts):
                sequences_per_file[i] += 1
        elif split_params['split_mode'] == 'to_size':
            # loop through the sections and calculate the number of sequences
            chunk_size = long(split_params['split_size'])
            rem = total_sequences % chunk_size
            sequences_per_file = [chunk_size for i in range(total_sequences / chunk_size)]
            # TODO: Should we invest the time in a better way to handle small remainders?
            if rem > 0:
                sequences_per_file.append(rem)
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
        return sequences_per_file
    get_sequences_per_file = staticmethod(get_sequences_per_file)
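    # A worked example with hypothetical numbers: for 10 total sequences,
    #   {'split_mode': 'number_of_parts', 'split_size': 3} -> [4, 3, 3]
    #     (10 / 3 == 3 per part, with the remainder of 1 spread over the first parts)
    #   {'split_mode': 'to_size', 'split_size': 4} -> [4, 4, 2]
    #     (full chunks of 4, plus the remainder as a final short chunk)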

    def do_slow_split( cls, input_datasets, subdir_generator_function, split_params):
        # count the sequences so we can split
        # TODO: if metadata is present, take the number of lines / 4
        if input_datasets[0].metadata is not None and input_datasets[0].metadata.sequences is not None:
            total_sequences = input_datasets[0].metadata.sequences
        else:
            input_file = input_datasets[0].file_name
            compress = is_gzip(input_file)
            if compress:
                # gzip is really slow before python 2.7!
                in_file = gzip.GzipFile(input_file, 'r')
            else:
                # TODO
                # if a file is not compressed, seek locations can be calculated and stored
                # ideally, this would be done in metadata
                # TODO
                # Add BufferedReader if python 2.7?
                in_file = open(input_file, 'rt')
            total_sequences = long(0)
            for i, line in enumerate(in_file):
                total_sequences += 1
            in_file.close()
            total_sequences /= 4
        sequences_per_file = cls.get_sequences_per_file(total_sequences, split_params)
        return cls.write_split_files(input_datasets, None, subdir_generator_function, sequences_per_file)
    do_slow_split = classmethod(do_slow_split)

    def do_fast_split( cls, input_datasets, toc_file_datasets, subdir_generator_function, split_params):
        data = json.load(open(toc_file_datasets[0].file_name))
        sections = data['sections']
        total_sequences = long(0)
        for section in sections:
            total_sequences += long(section['sequences'])
        sequences_per_file = cls.get_sequences_per_file(total_sequences, split_params)
        return cls.write_split_files(input_datasets, toc_file_datasets, subdir_generator_function, sequences_per_file)
    do_fast_split = classmethod(do_fast_split)

    def write_split_files(cls, input_datasets, toc_file_datasets, subdir_generator_function, sequences_per_file):
        directories = []

        def get_subdir(idx):
            if idx < len(directories):
                return directories[idx]
            dir = subdir_generator_function()
            directories.append(dir)
            return dir

        # we know how many splits and how many sequences in each. What remains is to write out instructions
        # for the splitting of all the input files. To decouple the format of those instructions from this
        # code, the exact format of those instructions is delegated to scripts
        start_sequence = 0
        for part_no in range(len(sequences_per_file)):
            dir = get_subdir(part_no)
            for ds_no in range(len(input_datasets)):
                ds = input_datasets[ds_no]
                base_name = os.path.basename(ds.file_name)
                part_path = os.path.join(dir, base_name)
                split_data = dict(class_name='%s.%s' % (cls.__module__, cls.__name__),
                                  output_name=part_path,
                                  input_name=ds.file_name,
                                  args=dict(start_sequence=start_sequence, num_sequences=sequences_per_file[part_no]))
                if toc_file_datasets is not None:
                    toc = toc_file_datasets[ds_no]
                    split_data['args']['toc_file'] = toc.file_name
                f = open(os.path.join(dir, 'split_info_%s.json' % base_name), 'w')
                json.dump(split_data, f)
                f.close()
            start_sequence += sequences_per_file[part_no]
        return directories
    write_split_files = classmethod(write_split_files)
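    # For illustration (hypothetical paths), the instruction file written for the
    # first part of a two-way FASTQ split would contain JSON along the lines of:
    # {
    #     "class_name": "galaxy.datatypes.sequence.Fastq",
    #     "output_name": "/tmp/part_0/reads.fastq",
    #     "input_name": "/data/reads.fastq",
    #     "args": {"start_sequence": 0, "num_sequences": 500}
    # }
    # with an extra "toc_file" entry under "args" when a FQTOC file is available.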

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """Split a generic sequence file (not sensible or possible, see subclasses)."""
        if split_params is None:
            return None
        raise NotImplementedError("Can't split generic sequence files")

class Alignment( data.Text ):
    """Class describing an alignment"""

    """Add metadata elements"""
    MetadataElement( name="species", desc="Species", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None )

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """Split a generic alignment file (not sensible or possible, see subclasses)."""
        if split_params is None:
            return None
        raise NotImplementedError("Can't split generic alignment files")

class Fasta( Sequence ):
    """Class representing a FASTA sequence"""
    file_ext = "fasta"

    def sniff( self, filename ):
        """
        Determines whether the file is in fasta format

        A sequence in FASTA format consists of a single-line description, followed by lines of sequence data.
        The first character of the description line is a greater-than (">") symbol in the first column.
        All lines should be shorter than 80 characters

        For complete details see http://www.ncbi.nlm.nih.gov/blast/fasta.shtml

        Rules for sniffing as True:

            We don't care about line length (other than empty lines).
            The first non-empty line must start with '>' and the very next line.strip() must have sequence data and not be a header.
            'Sequence data' here is loosely defined as non-empty lines which do not start with '>'.
            This will cause Color Space FASTA (csfasta) to be detected as True (they are, after all, still
            FASTA files - they have a header line followed by sequence data).
            Previously this method did some checking to determine if the sequence data had integers
            (presumably to differentiate between fasta and csfasta). That should instead be done through
            sniff order, where csfasta (which currently has a null sniff function) is detected first
            (stricter definition), followed sometime after by fasta.
            We will only check that the first purported sequence is correctly formatted.

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Fasta().sniff( fname )
        False
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Fasta().sniff( fname )
        True
        """
        try:
            fh = open( filename )
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:  # first non-empty line
                    if line.startswith( '>' ):
                        # The next line.strip() must not be '', nor startswith '>'
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        return True
                    else:
                        break  # we found a non-empty line, but it's not a fasta header
            fh.close()
        except:
            pass
        return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """Split a FASTA file sequence by sequence.

        Note that even if split_mode="number_of_parts", the actual number of
        sub-files produced may not match that requested by split_size.

        If split_mode="to_size" then split_size is treated as the number of
        FASTA records to put in each sub-file (not size in bytes).
        """
        if split_params is None:
            return
        if len(input_datasets) > 1:
            raise Exception("FASTA file splitting does not support multiple files")
        input_file = input_datasets[0].file_name

        # Counting chunk size as number of sequences.
        if 'split_mode' not in split_params:
            raise Exception('Tool does not define a split mode')
        elif split_params['split_mode'] == 'number_of_parts':
            split_size = int(split_params['split_size'])
            log.debug("Split %s into %i parts..." % (input_file, split_size))
            # if split_mode = number_of_parts, and split_size = 10, and
            # we know the number of sequences (say 1234), then divide
            # by ten, giving ten files of approx 123 sequences each.
            if input_datasets[0].metadata is not None \
                    and input_datasets[0].metadata.sequences:
                # Galaxy has already counted/estimated the number
                batch_size = 1 + input_datasets[0].metadata.sequences // split_size
                cls._count_split(input_file, batch_size, subdir_generator_function)
            else:
                # OK, if Galaxy hasn't counted them, it may be a big file.
                # We're not going to count the records which would be slow
                # and a waste of disk IO time - instead we'll split using
                # the file size.
                chunk_size = os.path.getsize(input_file) // split_size
                cls._size_split(input_file, chunk_size, subdir_generator_function)
        elif split_params['split_mode'] == 'to_size':
            # Split the input file into as many sub-files as required,
            # each containing to_size many sequences
            batch_size = int(split_params['split_size'])
            log.debug("Split %s into batches of %i records..." % (input_file, batch_size))
            cls._count_split(input_file, batch_size, subdir_generator_function)
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
    split = classmethod(split)
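    # For example, a tool wanting sub-files of roughly 1000 records each would
    # pass (hypothetical values) {'split_mode': 'to_size', 'split_size': 1000},
    # while {'split_mode': 'number_of_parts', 'split_size': 10} asks for ten parts.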

    def _size_split(cls, input_file, chunk_size, subdir_generator_function):
        """Split a FASTA file into chunks based on size on disk.

        This does of course preserve complete records - it only splits at the
        start of a new FASTA sequence record.
        """
        log.debug("Attempting to split FASTA file %s into chunks of %i bytes"
                  % (input_file, chunk_size))
        f = open(input_file, "rU")
        part_file = None
        try:
            # Note if the input FASTA file has no sequences, we will
            # produce just one sub-file which will be a copy of it.
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_file))
            part_file = open(part_path, 'w')
            log.debug("Writing %s part to %s" % (input_file, part_path))
            start_offset = 0
            while True:
                offset = f.tell()
                line = f.readline()
                if not line:
                    break
                if line[0] == ">" and offset - start_offset >= chunk_size:
                    # Start a new sub-file
                    part_file.close()
                    part_dir = subdir_generator_function()
                    part_path = os.path.join(part_dir, os.path.basename(input_file))
                    part_file = open(part_path, 'w')
                    log.debug("Writing %s part to %s" % (input_file, part_path))
                    start_offset = f.tell()
                part_file.write(line)
        except Exception, e:
            log.error('Unable to size split FASTA file: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()
    _size_split = classmethod(_size_split)

    def _count_split(cls, input_file, chunk_size, subdir_generator_function):
        """Split a FASTA file into chunks based on counting records."""
        log.debug("Attempting to split FASTA file %s into chunks of %i sequences"
                  % (input_file, chunk_size))
        f = open(input_file, "rU")
        part_file = None
        try:
            # Note if the input FASTA file has no sequences, we will
            # produce just one sub-file which will be a copy of it.
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_file))
            part_file = open(part_path, 'w')
            log.debug("Writing %s part to %s" % (input_file, part_path))
            rec_count = 0
            while True:
                line = f.readline()
                if not line:
                    break
                if line[0] == ">":
                    rec_count += 1
                    if rec_count > chunk_size:
                        # Start a new sub-file
                        part_file.close()
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_file))
                        part_file = open(part_path, 'w')
                        log.debug("Writing %s part to %s" % (input_file, part_path))
                        rec_count = 1
                part_file.write(line)
            part_file.close()
        except Exception, e:
            log.error('Unable to count split FASTA file: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()
    _count_split = classmethod(_count_split)

class csFasta( Sequence ):
    """ Class representing the SOLID Color-Space sequence ( csfasta ) """
    file_ext = "csfasta"

    def sniff( self, filename ):
        """
        Color-space sequence:
            >2_15_85_F3
            T213021013012303002332212012112221222112212222

        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> csFasta().sniff( fname )
        False
        >>> fname = get_test_fname( 'sequence.csfasta' )
        >>> csFasta().sniff( fname )
        True
        """
        try:
            fh = open( filename )
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line and not line.startswith( '#' ):  # first non-empty non-comment line
                    if line.startswith( '>' ):
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        elif line[0] not in string.ascii_uppercase:
                            return False
                        elif len( line ) > 1 and not re.search( '^[\d.]+$', line[1:] ):
                            return False
                        return True
                    else:
                        break  # we found a non-empty line, but it's not a header
            fh.close()
        except:
            pass
        return False

    def set_meta( self, dataset, **kwd ):
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        return Sequence.set_meta( self, dataset, **kwd )

class Fastq( Sequence ):
    """Class representing a generic FASTQ sequence"""
    file_ext = "fastq"

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines
        in dataset.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        data_lines = 0
        sequences = 0
        seq_counter = 0  # blocks should be 4 lines long
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and line.startswith( '#' ) and not sequences:
                # We don't count comment lines for sequence data types
                continue
            if line and line.startswith( '@' ):
                if seq_counter >= 4:
                    # count previous block
                    # blocks should be 4 lines long
                    sequences += 1
                    seq_counter = 1
                else:
                    # in case quality line starts with @
                    seq_counter += 1
                data_lines += 1
            else:
                data_lines += 1
                seq_counter += 1
        if seq_counter >= 4:
            # count final block
            sequences += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences
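    # For reference, one FASTQ record spans four lines, e.g.:
    #   @read_1
    #   GATTACA
    #   +
    #   IIIIIII
    # which is why the counter above treats every block of four lines as one sequence.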

    def sniff( self, filename ):
        """
        Determines whether the file is in generic fastq format
        For details, see http://maq.sourceforge.net/fastq.shtml

        Note: There are three kinds of FASTQ files, known as "Sanger" (sometimes called "Standard"), Solexa, and Illumina
              These differ in the representation of the quality scores

        >>> fname = get_test_fname( '1.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        >>> fname = get_test_fname( '2.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        """
        headers = get_headers( filename, None )
        bases_regexp = re.compile( "^[NGTAC]*" )
        # check that first block looks like a fastq block
        try:
            if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0]:
                # Check the sequence line, make sure it contains only G/C/A/T/N
                if not bases_regexp.match( headers[1][0] ):
                    return False
                return True
            return False
        except:
            return False

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """
        FASTQ files are split on cluster boundaries, in increments of 4 lines
        """
        if split_params is None:
            return None

        # first, see if there are any associated FQTOC files that will give us the split locations
        # if so, we don't need to read the files to do the splitting
        toc_file_datasets = []
        for ds in input_datasets:
            tmp_ds = ds
            fqtoc_file = None
            while fqtoc_file is None and tmp_ds is not None:
                fqtoc_file = tmp_ds.get_converted_files_by_type('fqtoc')
                tmp_ds = tmp_ds.copied_from_library_dataset_dataset_association
            if fqtoc_file is not None:
                toc_file_datasets.append(fqtoc_file)

        if len(toc_file_datasets) == len(input_datasets):
            return cls.do_fast_split(input_datasets, toc_file_datasets, subdir_generator_function, split_params)
        return cls.do_slow_split(input_datasets, subdir_generator_function, split_params)
    split = classmethod(split)

    def process_split_file(data):
        """
        This is called in the context of an external process launched by a Task (possibly not on the Galaxy machine)
        to create the input files for the Task. The parameters:
        data - a dict containing the contents of the split file
        """
        args = data['args']
        input_name = data['input_name']
        output_name = data['output_name']
        start_sequence = long(args['start_sequence'])
        sequence_count = long(args['num_sequences'])
        if 'toc_file' in args:
            toc_file = json.load(open(args['toc_file'], 'r'))
            commands = Sequence.get_split_commands_with_toc(input_name, output_name, toc_file, start_sequence, sequence_count)
        else:
            commands = Sequence.get_split_commands_sequential(is_gzip(input_name), input_name, output_name, start_sequence, sequence_count)
        for cmd in commands:
            if 0 != os.system(cmd):
                raise Exception("Executing '%s' failed" % cmd)
        return True
    process_split_file = staticmethod(process_split_file)
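    # A minimal sketch of the task-side call (hypothetical file name; the JSON
    # layout matches what write_split_files produces above):
    #     data = json.load(open('split_info_reads.fastq.json'))
    #     Fastq.process_split_file(data)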

class FastqSanger( Fastq ):
    """Class representing a FASTQ sequence ( the Sanger variant )"""
    file_ext = "fastqsanger"

class FastqSolexa( Fastq ):
    """Class representing a FASTQ sequence ( the Solexa variant )"""
    file_ext = "fastqsolexa"

class FastqIllumina( Fastq ):
    """Class representing a FASTQ sequence ( the Illumina 1.3+ variant )"""
    file_ext = "fastqillumina"

class FastqCSSanger( Fastq ):
    """Class representing a Color Space FASTQ sequence ( e.g a SOLiD variant )"""
    file_ext = "fastqcssanger"

class Maf( Alignment ):
    """Class describing a Maf alignment"""
    file_ext = "maf"

    # Readonly and optional, users can't unset it, but if it is not set, we are generally ok; if required use a metadata validator in the tool definition
    MetadataElement( name="blocks", default=0, desc="Number of blocks", readonly=True, optional=True, visible=False, no_value=0 )
    MetadataElement( name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
    MetadataElement( name="maf_index", desc="MAF Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )

    def init_meta( self, dataset, copy_from=None ):
        Alignment.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Parses and sets species, chromosomes, index from MAF file.
        """
        # these metadata values are not accessible by users, always overwrite
        # Imported here to avoid circular dependency
        from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
        indexes, species, species_chromosomes, blocks = build_maf_index_species_chromosomes( dataset.file_name )
        if indexes is None:
            return  # this is not a MAF file
        dataset.metadata.species = species
        dataset.metadata.blocks = blocks

        # write species chromosomes to a file
        chrom_file = dataset.metadata.species_chromosomes
        if not chrom_file:
            chrom_file = dataset.metadata.spec['species_chromosomes'].param.new_file( dataset = dataset )
        chrom_out = open( chrom_file.file_name, 'wb' )
        for spec, chroms in species_chromosomes.items():
            chrom_out.write( "%s\t%s\n" % ( spec, "\t".join( chroms ) ) )
        chrom_out.close()
        dataset.metadata.species_chromosomes = chrom_file

        index_file = dataset.metadata.maf_index
        if not index_file:
            index_file = dataset.metadata.spec['maf_index'].param.new_file( dataset = dataset )
        indexes.write( open( index_file.file_name, 'wb' ) )
        dataset.metadata.maf_index = index_file

    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.blocks:
                dataset.blurb = "%s blocks" % util.commaify( str( dataset.metadata.blocks ) )
            else:
                # Number of blocks is not known ( this should not happen ), and auto-detect is
                # needed to set metadata
                dataset.blurb = "? blocks"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return self.make_html_table( dataset )

    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append('<tr><th>Species:&nbsp;')
            for species in dataset.metadata.species:
                out.append( '%s&nbsp;' % species )
            out.append( '</th></tr>' )
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines = data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                out.append( '<tr><td>%s</td></tr>' % escape( line ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            out = "Can't create peek %s" % exc
        return out

    def sniff( self, filename ):
        """
        Determines whether the file is in maf format

        The .maf format is line-oriented. Each multiple alignment ends with a blank line.
        Each sequence in an alignment is on a single line, which can get quite long, but
        there is no length limit. Words in a line are delimited by any white space.
        Lines starting with # are considered to be comments. Lines starting with ## can
        be ignored by most programs, but contain meta-data of one form or another.

        The first line of a .maf file begins with ##maf. This word is followed by white-space-separated
        variable=value pairs. There should be no white space surrounding the "=".

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Maf().sniff( fname )
        True
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Maf().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        try:
            if len(headers) > 1 and headers[0][0] and headers[0][0] == "##maf":
                return True
            else:
                return False
        except:
            return False
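    # For reference, a MAF file typically opens like this (abbreviated example;
    # coordinates and sequences are illustrative):
    #   ##maf version=1
    #   a score=23262.0
    #   s hg18.chr7 27578828 38 + 158545518 AAAGGGAATGTTAACCAAATGA...
    #   s mm4.chr6  53215344 38 + 151104725 AAAGGGAATGTTAACCAAATGA...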

class MafCustomTrack( data.Text ):
    file_ext = "mafcustomtrack"

    MetadataElement( name="vp_chromosome", default='chr1', desc="Viewport Chromosome", readonly=True, optional=True, visible=False, no_value='' )
    MetadataElement( name="vp_start", default='1', desc="Viewport Start", readonly=True, optional=True, visible=False, no_value='' )
    MetadataElement( name="vp_end", default='100', desc="Viewport End", readonly=True, optional=True, visible=False, no_value='' )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Parses and sets viewport metadata from MAF file.
        """
        max_block_check = 10
        chrom = None
        forward_strand_start = float( 'inf' )
        forward_strand_end = 0
        try:
            maf_file = open( dataset.file_name )
            maf_file.readline()  # move past track line
            for i, block in enumerate( bx.align.maf.Reader( maf_file ) ):
                ref_comp = block.get_component_by_src_start( dataset.metadata.dbkey )
                if ref_comp:
                    ref_chrom = bx.align.maf.src_split( ref_comp.src )[-1]
                    if chrom is None:
                        chrom = ref_chrom
                    if chrom == ref_chrom:
                        forward_strand_start = min( forward_strand_start, ref_comp.forward_strand_start )
                        forward_strand_end = max( forward_strand_end, ref_comp.forward_strand_end )
                if i > max_block_check:
                    break
            if forward_strand_end > forward_strand_start:
                dataset.metadata.vp_chromosome = chrom
                dataset.metadata.vp_start = forward_strand_start
                dataset.metadata.vp_end = forward_strand_end
        except:
            pass

class Axt( data.Text ):
    """Class describing an axt alignment"""
    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
    # here simply for backward compatibility ( although it is still in the datatypes registry ). Subclassing
    # from data.Text eliminates managing metadata elements inherited from the Alignment class.
    file_ext = "axt"

    def sniff( self, filename ):
        """
        Determines whether the file is in axt format

        axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab
        at Penn State University.

        Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.
        Blocks are separated from one another by blank lines.

        The summary line contains chromosomal position and size information about the alignment. It
        consists of 9 required fields.

        The sequence lines contain the sequence of the primary assembly (line 2) and aligning assembly
        (line 3) with inserts. Repeats are indicated by lower-case letters.

        For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html

        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Axt().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Axt().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        if len(headers) < 4:
            return False
        for hdr in headers:
            if len(hdr) > 0 and hdr[0].startswith("##matrix=axt"):
                return True
            if len(hdr) > 0 and not hdr[0].startswith("#"):
                if len(hdr) != 9:
                    return False
                try:
                    map( int, [hdr[0], hdr[2], hdr[3], hdr[5], hdr[6], hdr[8]] )
                except:
                    return False
                if hdr[7] not in data.valid_strand:
                    return False
                else:
                    return True
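    # For reference, a single axt block looks like this (example adapted from the
    # UCSC axt documentation; coordinates are illustrative): a 9-field summary
    # line, then the two sequence lines.
    #   0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500
    #   TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
    #   TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA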

class Lav( data.Text ):
    """Class describing a LAV alignment"""
    file_ext = "lav"
    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
    # here simply for backward compatibility ( although it is still in the datatypes registry ). Subclassing
    # from data.Text eliminates managing metadata elements inherited from the Alignment class.

    def sniff( self, filename ):
        """
        Determines whether the file is in lav format

        LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ.
        The first line of a .lav file begins with #:lav.

        For complete details see http://www.bioperl.org/wiki/LAV_alignment_format

        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Lav().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Lav().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        try:
            if len(headers) > 1 and headers[0][0] and headers[0][0].startswith('#:lav'):
                return True
            else:
                return False
        except:
            return False