/nltk/toolbox.py
Python | 497 lines | 453 code | 7 blank | 37 comment | 4 complexity | 64fb98cf59e34ef4a629a39f4cfeda92 MD5 | raw file
Possible License(s): Apache-2.0
- # Natural Language Toolkit: Toolbox Reader
- #
- # Copyright (C) 2001-2012 NLTK Project
- # Author: Greg Aumann <greg_aumann@sil.org>
- # URL: <http://nltk.org>
- # For license information, see LICENSE.TXT
- """
- Module for reading, writing and manipulating
- Toolbox databases and settings files.
- """
- import os, re, codecs
- from StringIO import StringIO
- from xml.etree import ElementTree
- from xml.etree.ElementTree import TreeBuilder, Element, SubElement
- from nltk.data import PathPointer, ZipFilePathPointer, find
class StandardFormat(object):
    """
    Class for reading and processing standard format marker files and strings.
    """
    def __init__(self, filename=None, encoding=None):
        """
        :param filename: name of a standard format marker file to open
            immediately (optional)
        :param encoding: encoding used by the file; None for raw byte input
        """
        self._encoding = encoding
        if filename is not None:
            self.open(filename)

    def open(self, sfm_file):
        """
        Open a standard format marker file for sequential reading.

        :param sfm_file: name of the standard format marker input file
        :type sfm_file: str
        """
        if isinstance(sfm_file, PathPointer):
            # [xx] We don't use 'rU' mode here -- do we need to?
            #      (PathPointer.open doesn't take a mode option)
            self._file = sfm_file.open(self._encoding)
        else:
            self._file = codecs.open(sfm_file, 'rU', self._encoding)

    def open_string(self, s):
        """
        Open a standard format marker string for sequential reading.

        :param s: string to parse as a standard format marker input file
        :type s: str
        """
        self._file = StringIO(s)

    def raw_fields(self):
        """
        Return an iterator that returns the next field in a (marker, value)
        tuple. Linebreaks and trailing white space are preserved except
        for the final newline in each field.

        :rtype: iter(tuple(str, str))
        """
        join_string = '\n'
        line_regexp = r'^%s(?:\\(\S+)\s*)?(.*)$'
        # Discard a BOM in the first line.  Accept either the decoded BOM
        # character (input already decoded to unicode) or its raw UTF-8
        # byte sequence (undecoded byte input).  The previous pattern was
        # built from the UTF-8-*encoded* BOM only, so it never stripped
        # the BOM from lines that codecs.open() had already decoded.
        first_line_pat = re.compile(line_regexp % u'(?:\ufeff|\xef\xbb\xbf)?')
        line_pat = re.compile(line_regexp % '')
        # need to get first line outside the loop for correct handling
        # of the first marker if it spans multiple lines
        file_iter = iter(self._file)
        try:
            line = next(file_iter)
        except StopIteration:
            # empty file or string: no fields at all
            return
        mobj = re.match(first_line_pat, line)
        mkr, line_value = mobj.groups()
        value_lines = [line_value]
        self.line_num = 0
        for line in file_iter:
            self.line_num += 1
            mobj = re.match(line_pat, line)
            line_mkr, line_value = mobj.groups()
            if line_mkr:
                # a new marker closes the field accumulated so far
                yield (mkr, join_string.join(value_lines))
                mkr = line_mkr
                value_lines = [line_value]
            else:
                # continuation line of a multi-line field
                value_lines.append(line_value)
        self.line_num += 1
        yield (mkr, join_string.join(value_lines))

    def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
        """
        Return an iterator that returns the next field in a ``(marker, value)``
        tuple, where ``marker`` and ``value`` are unicode strings if an ``encoding``
        was specified in the ``fields()`` method. Otherwise they are non-unicode strings.

        :param strip: strip trailing whitespace from the last line of each field
        :type strip: bool
        :param unwrap: Convert newlines in a field to spaces.
        :type unwrap: bool
        :param encoding: Name of an encoding to use. If it is specified then
            the ``fields()`` method returns unicode strings rather than non
            unicode strings.
        :type encoding: str or None
        :param errors: Error handling scheme for codec. Same as the ``decode()``
            builtin string method.
        :type errors: str
        :param unicode_fields: Set of marker names whose values are UTF-8 encoded.
            Ignored if encoding is None. If the whole file is UTF-8 encoded set
            ``encoding='utf8'`` and leave ``unicode_fields`` with its default
            value of None.
        :type unicode_fields: sequence
        :rtype: iter(tuple(str, str))
        """
        if encoding is None and unicode_fields is not None:
            raise ValueError('unicode_fields is set but not encoding.')
        unwrap_pat = re.compile(r'\n+')
        for mkr, val in self.raw_fields():
            if encoding:
                # NOTE(review): the decode calls assume byte-string fields
                # (Python 2 str); already-decoded input should use
                # encoding=None.
                if unicode_fields is not None and mkr in unicode_fields:
                    val = val.decode('utf8', errors)
                else:
                    val = val.decode(encoding, errors)
                mkr = mkr.decode(encoding, errors)
            if unwrap:
                val = unwrap_pat.sub(' ', val)
            if strip:
                val = val.rstrip()
            yield (mkr, val)

    def close(self):
        """Close a previously opened standard format marker file or string."""
        self._file.close()
        try:
            del self.line_num
        except AttributeError:
            pass
class ToolboxData(StandardFormat):
    """Standard format reader that structures a file into header and records."""

    def parse(self, grammar=None, **kwargs):
        """
        Parse the opened file into an ElementTree; chunk-parse the records
        if a grammar is supplied.
        """
        if not grammar:
            return self._record_parse(**kwargs)
        return self._chunk_parse(grammar=grammar, **kwargs)

    def _record_parse(self, key=None, **kwargs):
        r"""
        Return an element tree structure corresponding to a toolbox data
        file with all markers at the same level.

        A Toolbox database such as::

            \_sh v3.0  400  Rotokas Dictionary
            \_DateStampHasFourDigitYear

            \lx kaa
            \ps V.A
            \ge gag
            \gp nek i pas

        ends up (ignoring the extra whitespace) with the same structure as
        this XML fragment after parsing by ElementTree::

            <toolbox_data>
                <header>
                    <_sh>v3.0  400  Rotokas Dictionary</_sh>
                    <_DateStampHasFourDigitYear/>
                </header>
                <record>
                    <lx>kaa</lx>
                    <ps>V.A</ps>
                    <ge>gag</ge>
                    <gp>nek i pas</gp>
                </record>
            </toolbox_data>

        :param key: Name of key marker at the start of each record. If set to
            None (the default value) the first marker that doesn't begin with
            an underscore is assumed to be the key.
        :type key: str
        :param kwargs: Keyword arguments passed to ``StandardFormat.fields()``
        :type kwargs: dict
        :rtype: ElementTree._ElementInterface
        :return: contents of toolbox data divided into header and records
        """
        builder = TreeBuilder()
        builder.start('toolbox_data', {})
        builder.start('header', {})
        in_records = False
        for mkr, value in self.fields(**kwargs):
            # The first marker not starting with an underscore becomes the
            # record key, unless one was supplied explicitly.
            if key is None and not in_records and mkr[0] != '_':
                key = mkr
            if mkr == key:
                # A key marker closes the header (first time) or the
                # previous record, and opens a new record.
                builder.end('record' if in_records else 'header')
                in_records = True
                builder.start('record', {})
            builder.start(mkr, {})
            builder.data(value)
            builder.end(mkr)
        builder.end('record' if in_records else 'header')
        builder.end('toolbox_data')
        return builder.close()

    def _tree2etree(self, parent):
        """Convert an ``nltk.tree.Tree`` into an equivalent ElementTree element."""
        from nltk.tree import Tree

        node = Element(parent.node)
        for child in parent:
            if isinstance(child, Tree):
                node.append(self._tree2etree(child))
            else:
                text, tag = child
                SubElement(node, tag).text = text
        return node

    def _chunk_parse(self, grammar=None, top_node='record', trace=0, **kwargs):
        """
        Return an element tree structure corresponding to a toolbox data
        file parsed according to the chunk grammar.

        :type grammar: str
        :param grammar: Contains the chunking rules used to parse the
            database.  See ``chunk.RegExp`` for documentation.
        :type top_node: str
        :param top_node: The node value that should be used for the
            top node of the chunk structure.
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.
        :type kwargs: dict
        :param kwargs: Keyword arguments passed to ``toolbox.StandardFormat.fields()``
        :rtype: ElementTree._ElementInterface
        """
        from nltk import chunk
        from nltk.tree import Tree

        chunker = chunk.RegexpParser(grammar, top_node=top_node, trace=trace)
        database = self.parse(**kwargs)
        result = Element('toolbox_data')
        result.append(database.find('header'))
        for record in database.findall('record'):
            chunked = chunker.parse([(elem.text, elem.tag) for elem in record])
            result.append(self._tree2etree(chunked))
        return result
# Matches a single non-whitespace character: a field "has a value" iff this
# pattern finds a match in its text.
_is_value = re.compile(r"\S")
def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None):
    """
    Return a string with a standard format representation of the toolbox
    data in tree (tree can be a toolbox database or a single record).

    :param tree: flat representation of toolbox data (whole database or single record)
    :type tree: ElementTree._ElementInterface
    :param encoding: Name of an encoding to use.
    :type encoding: str
    :param errors: Error handling scheme for codec. Same as the ``encode()``
        builtin string method.
    :type errors: str
    :param unicode_fields: marker names whose values are to be written
        UTF-8 encoded regardless of ``encoding``
    :type unicode_fields: dict(str) or set(str)
    :rtype: str
    """
    # A bare record is wrapped so both databases and records are handled.
    if tree.tag == 'record':
        root = Element('toolbox_data')
        root.append(tree)
        tree = root

    if tree.tag != 'toolbox_data':
        raise ValueError("not a toolbox_data element structure")
    if encoding is None and unicode_fields is not None:
        raise ValueError(
            "if encoding is not specified then neither should unicode_fields")
    l = []
    for rec in tree:
        # records are separated by a blank line; the leading one is
        # dropped by the final join (l[1:])
        l.append('\n')
        for field in rec:
            mkr = field.tag
            # A field created without text (e.g. by add_default_fields())
            # has .text == None; treat it like an empty value instead of
            # crashing on the whitespace test below.
            value = field.text or ''
            # emit "\mkr value" when the field has any non-whitespace
            # content, otherwise a bare "\mkr" line
            has_value = value and not value.isspace()
            if encoding is not None:
                if unicode_fields is not None and mkr in unicode_fields:
                    cur_encoding = 'utf8'
                else:
                    cur_encoding = encoding
                if has_value:
                    l.append((u"\\%s %s\n" % (mkr, value)).encode(cur_encoding, errors))
                else:
                    l.append((u"\\%s%s\n" % (mkr, value)).encode(cur_encoding, errors))
            else:
                if has_value:
                    l.append("\\%s %s\n" % (mkr, value))
                else:
                    l.append("\\%s%s\n" % (mkr, value))
    return ''.join(l[1:])
class ToolboxSettings(StandardFormat):
    """This class is the base class for settings files."""

    def __init__(self):
        super(ToolboxSettings, self).__init__()

    def parse(self, encoding=None, errors='strict', **kwargs):
        """
        Return the contents of toolbox settings file with a nested structure.

        :param encoding: encoding used by settings file
        :type encoding: str
        :param errors: Error handling scheme for codec. Same as ``decode()`` builtin method.
        :type errors: str
        :param kwargs: Keyword arguments passed to ``StandardFormat.fields()``
        :type kwargs: dict
        :rtype: ElementTree._ElementInterface
        """
        builder = TreeBuilder()
        for mkr, value in self.fields(encoding=encoding, errors=errors, **kwargs):
            # A marker starting with '+' opens a nested block, one starting
            # with '-' closes it; any other marker is a leaf field.
            prefix = mkr[0]
            if prefix == '+':
                builder.start(mkr[1:], {})
                builder.data(value)
            elif prefix == '-':
                builder.end(mkr[1:])
            else:
                builder.start(mkr, {})
                builder.data(value)
                builder.end(mkr)
        return builder.close()
def to_settings_string(tree, encoding=None, errors='strict', unicode_fields=None):
    """
    Serialize a settings ElementTree back into a standard format marker string.

    NOTE(review): ``encoding``, ``errors`` and ``unicode_fields`` are passed
    through to the recursive writer, which currently does not use them --
    the result is returned unencoded.  Confirm before relying on them.
    """
    parts = []
    _to_settings_string(tree.getroot(), parts, encoding=encoding,
                        errors=errors, unicode_fields=unicode_fields)
    return ''.join(parts)
- def _to_settings_string(node, l, **kwargs):
- # write XML to file
- tag = node.tag
- text = node.text
- if len(node) == 0:
- if text:
- l.append('\\%s %s\n' % (tag, text))
- else:
- l.append('\\%s\n' % tag)
- else:
- if text:
- l.append('\\+%s %s\n' % (tag, text))
- else:
- l.append('\\+%s\n' % tag)
- for n in node:
- _to_settings_string(n, l, **kwargs)
- l.append('\\-%s\n' % tag)
- return
def remove_blanks(elem):
    """
    Remove all elements and subelements with no text and no child elements.

    :param elem: toolbox data in an elementtree structure
    :type elem: ElementTree._ElementInterface
    """
    # Prune depth-first so that an element emptied by the recursion is
    # itself dropped by its parent.
    for child in elem:
        remove_blanks(child)
    elem[:] = [child for child in elem if child.text or len(child)]
def add_default_fields(elem, default_fields):
    """
    Add blank elements and subelements specified in default_fields.

    :param elem: toolbox data in an elementtree structure
    :type elem: ElementTree._ElementInterface
    :param default_fields: fields to add to each type of element and subelement
    :type default_fields: dict(tuple)
    """
    # Create any missing default fields for this element, then recurse
    # into every child (including the ones just created).
    for field_name in default_fields.get(elem.tag, ()):
        if elem.find(field_name) is None:
            SubElement(elem, field_name)
    for child in elem:
        add_default_fields(child, default_fields)
def sort_fields(elem, field_orders):
    """
    Sort the elements and subelements in order specified in field_orders.

    :param elem: toolbox data in an elementtree structure
    :type elem: ElementTree._ElementInterface
    :param field_orders: order of fields for each type of element and subelement
    :type field_orders: dict(tuple)
    """
    # Turn each ordered tuple of markers into a marker -> rank mapping.
    rank_maps = {}
    for tag, marker_order in field_orders.items():
        rank_maps[tag] = dict((marker, rank)
                              for rank, marker in enumerate(marker_order))
    _sort_fields(elem, rank_maps)
- def _sort_fields(elem, orders_dicts):
- """sort the children of elem"""
- try:
- order = orders_dicts[elem.tag]
- except KeyError:
- pass
- else:
- tmp = [((order.get(child.tag, 1e9), i), child) for i, child in enumerate(elem)]
- tmp.sort()
- elem[:] = [child for key, child in tmp]
- for child in elem:
- if len(child):
- _sort_fields(child, orders_dicts)
def add_blank_lines(tree, blanks_before, blanks_between):
    """
    Add blank lines before all elements and subelements specified in blank_before.

    :param tree: toolbox data in an elementtree structure
    :type tree: ElementTree._ElementInterface
    :param blanks_before: elements and subelements to add blank lines before
    :type blanks_before: dict(tuple)
    :param blanks_between: elements and subelements to add blank lines between
        runs of the same marker
    :type blanks_between: dict(tuple)
    """
    try:
        before = blanks_before[tree.tag]
        between = blanks_between[tree.tag]
    except KeyError:
        # no blank-line rules for this tag: just recurse into subtrees
        for elem in tree:
            if len(elem):
                add_blank_lines(elem, blanks_before, blanks_between)
    else:
        last_elem = None
        for elem in tree:
            tag = elem.tag
            if last_elem is not None and last_elem.tag != tag:
                # marker changed: blank line before the new kind of field.
                # (The blank line is appended to the text of the previous
                # element's last descendant.)
                if tag in before:
                    # list(node.iter()) replaces the deprecated
                    # getiterator(), which was removed in Python 3.9; it
                    # yields the node itself first, then all descendants
                    # in document order.
                    e = list(last_elem.iter())[-1]
                    e.text = (e.text or "") + "\n"
            else:
                # same marker as the previous field, or the very first
                # field.  Guard against last_elem being None -- the
                # original code dereferenced it here and crashed when the
                # first child's tag was listed in ``between``.
                if last_elem is not None and tag in between:
                    e = list(last_elem.iter())[-1]
                    e.text = (e.text or "") + "\n"
            if len(elem):
                add_blank_lines(elem, blanks_before, blanks_between)
            last_elem = elem
- def demo():
- from itertools import islice
- # zip_path = find('corpora/toolbox.zip')
- # lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
- file_path = find('corpora/toolbox/rotokas.dic')
- lexicon = ToolboxData(file_path).parse()
- print 'first field in fourth record:'
- print lexicon[3][0].tag
- print lexicon[3][0].text
- print '\nfields in sequential order:'
- for field in islice(lexicon.find('record'), 10):
- print field.tag, field.text
- print '\nlx fields:'
- for field in islice(lexicon.findall('record/lx'), 10):
- print field.text
- settings = ToolboxSettings()
- file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
- settings.open(file_path)
- # settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
- tree = settings.parse(unwrap=False, encoding='cp1252')
- print tree.find('expset/expMDF/rtfPageSetup/paperSize').text
- settings_tree = ElementTree(tree)
- print to_settings_string(settings_tree).encode('utf8')
- if __name__ == '__main__':
- demo()