PageRenderTime 51ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk-old/contrib/nltk_contrib/mit/rspeer/feature.py

http://nltk.googlecode.com/
Python | 750 lines | 587 code | 61 blank | 102 comment | 102 complexity | e17d30dc9e4b726e1d7f336a8ea7268e MD5 | raw file
Possible License(s): Apache-2.0, AGPL-1.0
from nltk.featurestructure import *
from nltk.cfg import Nonterminal, CFGProduction
# Standard-library modules are imported after the star import so that the
# names ``re`` and ``string`` always refer to the real modules.
import re
import string
  4. class Category(FeatureStructure, Nonterminal):
  5. """
  6. A C{Category} is a specialized feature structure, intended for use in
  7. parsing. It can act as a C{Nonterminal}.
  8. A C{Category} differs from a C{FeatureStructure} in these ways:
  9. - Categories may not be re-entrant.
  10. - Categories use value-based equality, while FeatureStructures use
  11. identity-based equality.
  12. - Strings in Categories are compared case-insensitively.
  13. - Categories have one feature marked as the 'head', which prints
  14. differently than other features if it has a value. For example,
  15. in the C{repr()} representation of a Category, the head goes to the
  16. left, on the outside of the brackets. Subclasses of C{Category}
  17. may change the feature name that is designated as the head, which is
  18. _head by default.
  19. - Subclasses of C{Category} may contain a list of I{required features},
  20. which are names of features whose value is None if unspecified. A
  21. Category lacking a feature that is required in it will not unify with
  22. any Category that has that feature. If a required feature's value is
  23. C{None}, it is considered to be not present. (Mixing different
  24. subclasses of C{Category} is probably a bad idea.)
  25. - C{True} and C{False} are allowed as values. A feature named C{foo}
  26. with a value of C{True} is simply expressed as C{+foo}. Similarly, if
  27. it is C{False}, it is expressed as C{-foo}.
  28. """
  29. headname = '_head'
  30. requiredFeatures = []
  31. def __init__(self, **features):
  32. self._features = features
  33. self._required = self.__class__.requiredFeatures
  34. for name in self._required:
  35. if not self._features.has_key(name):
  36. self._features[name] = None
  37. items = self._features.items()
  38. items.sort()
  39. self._hash = None
  40. self._frozen = False
  41. self._memorepr = None
  42. def required_features(self):
  43. "@return: A list of the names of all required features."
  44. return self._required
  45. def __cmp__(self, other):
  46. return cmp(repr(self), repr(other))
  47. def __div__(self, other):
  48. """
  49. @return: A new Category based on this one, with its C{/} feature set to
  50. C{other}.
  51. """
  52. temp = self.deepcopy()
  53. dict = temp._features
  54. dict['/'] = other
  55. return self.__class__(**dict)
  56. def __eq__(self, other):
  57. """
  58. @return: True if C{self} and C{other} assign the same value to
  59. to every feature. In particular, return true if
  60. C{self[M{p}]==other[M{p}]} for every feature path M{p} such
  61. that C{self[M{p}]} or C{other[M{p}]} is a base value (i.e.,
  62. not a nested Category).
  63. @rtype: C{bool}
  64. """
  65. # Get the result of equal_values, and make it a real boolean while
  66. # we're at it.
  67. if not other.__class__ == self.__class__: return False
  68. if hash(self) != hash(other): return False
  69. return (self.equal_values(other) == True)
  70. def __ne__(self, other):
  71. return not (self == other)
  72. def __hash__(self):
  73. if self._hash is not None: return self._hash
  74. items = self._features.items()
  75. items.sort()
  76. return hash(tuple(items))
  77. def freeze(self):
  78. """
  79. Freezing a Category memoizes its hash value, to make comparisons on it
  80. faster. After freezing, the Category and all its values are immutable.
  81. @return: self
  82. """
  83. for val in self._features.values():
  84. if isinstance(val, Category) and not val.frozen():
  85. val.freeze()
  86. self._hash = hash(self)
  87. self._memorepr = self._repr({}, {})
  88. self._frozen = True
  89. return self
  90. def frozen(self):
  91. """
  92. Returns whether this Category is frozen (immutable).
  93. @rtype: C{bool}
  94. """
  95. return self._frozen
  96. def __setitem__(self, name, value):
  97. if self._frozen: raise "Cannot modify a frozen Category"
  98. self._features[name] = value
  99. def symbol(self):
  100. """
  101. @return: the one-line string representation of the Category.
  102. @rtype: C{str}
  103. If you want the symbol for the head of the category, use C{head()}.
  104. """
  105. return repr(self)
  106. def head(self):
  107. """
  108. @return: The head of this category (the value shown outside the
  109. brackets in its string representation). If there is no head, returns
  110. None.
  111. @rtype: C{str} or C{None}
  112. """
  113. return self._features.get(self.__class__.headname)
  114. def deepcopy(self, memo=None):
  115. """
  116. @return: A deep copy of C{self}.
  117. """
  118. newcopy = self.__class__()
  119. features = newcopy._features
  120. # Fill out the features.
  121. for (fname, fval) in self._features.items():
  122. if isinstance(fval, FeatureStructure):
  123. features[fname] = fval.deepcopy()
  124. else:
  125. features[fname] = fval
  126. return newcopy
  127. def reentrances(self):
  128. return []
  129. def feature_names(self):
  130. """
  131. @return: a list of all features that have values.
  132. """
  133. return filter(lambda x: not (x in self._required and self[x] is None),
  134. self._features.keys())
  135. def get_feature(self, *args):
  136. try:
  137. return self.__getitem__(*args)
  138. except IndexError:
  139. return StarValue()
  140. def has_feature(self, name):
  141. return (name in self.feature_names())
  142. def remove_unbound_vars(self):
  143. selfcopy = self.deepcopy()
  144. selfcopy._remove_unbound_vars()
  145. return selfcopy
  146. def _remove_unbound_vars(self):
  147. for (fname, fval) in self._features.items():
  148. if isinstance(fval, FeatureVariable):
  149. del self._features[fname]
  150. elif isinstance(fval, Category):
  151. fval._remove_unbound_vars()
  152. # All this is unlikely to be necessary. All I've changed is to make
  153. # strings case-insensitive.
  154. def _destructively_unify(self, other, bindings, trace=False, depth=0):
  155. """
  156. Attempt to unify C{self} and C{other} by modifying them
  157. in-place. If the unification succeeds, then C{self} will
  158. contain the unified value, and the value of C{other} is
  159. undefined. If the unification fails, then a
  160. _UnificationFailureError is raised, and the values of C{self}
  161. and C{other} are undefined.
  162. """
  163. if trace:
  164. print ' '+'| '*depth+' /'+`self`
  165. print ' '+'| '*depth+'|\\'+ `other`
  166. for (fname, otherval) in other._features.items():
  167. if trace:
  168. trace_otherval = otherval
  169. trace_selfval_defined = self._features.has_key(fname)
  170. trace_selfval = self._features.get(fname)
  171. if self._features.has_key(fname):
  172. selfval = self._features[fname]
  173. # If selfval or otherval is a bound variable, then
  174. # replace it by the variable's bound value.
  175. if isinstance(selfval, FeatureVariable):
  176. selfval = bindings.lookup(selfval)
  177. if isinstance(otherval, FeatureVariable):
  178. otherval = bindings.lookup(otherval)
  179. if trace:
  180. print ' '+'| '*(depth+1)
  181. print ' '+'%s| Unify %s feature:'%('| '*(depth),fname)
  182. # Case 1: unify 2 feature structures (recursive case)
  183. if (isinstance(selfval, FeatureStructure) and
  184. isinstance(otherval, FeatureStructure)):
  185. selfval._destructively_unify(otherval, bindings,
  186. trace, depth+1)
  187. # Case 2: unify 2 variables
  188. elif (isinstance(selfval, FeatureVariable) and
  189. isinstance(otherval, FeatureVariable)):
  190. self._features[fname] = selfval.alias(otherval)
  191. # Case 3: unify a variable with a value
  192. elif isinstance(selfval, FeatureVariable):
  193. bindings.bind(selfval, otherval)
  194. elif isinstance(otherval, FeatureVariable):
  195. bindings.bind(otherval, selfval)
  196. # Case 4A: unify two strings.
  197. elif isinstance(selfval, str) and isinstance(otherval, str)\
  198. and selfval.upper() == otherval.upper(): pass
  199. # Case 4: unify 2 non-equal values (failure case)
  200. elif selfval != otherval:
  201. if trace: print ' '+'| '*depth + 'X <-- FAIL'
  202. raise FeatureStructure._UnificationFailureError()
  203. # Case 5: unify 2 equal values
  204. else: pass
  205. if trace and not isinstance(selfval, FeatureStructure):
  206. # apply_forwards to get reentrancy links right:
  207. if isinstance(trace_selfval, FeatureStructure):
  208. trace_selfval._apply_forwards({})
  209. if isinstance(trace_otherval, FeatureStructure):
  210. trace_otherval._apply_forwards({})
  211. print ' '+'%s| /%r' % ('| '*(depth), trace_selfval)
  212. print ' '+'%s| |\\%r' % ('| '*(depth), trace_otherval)
  213. print ' '+'%s| +-->%r' % ('| '*(depth),
  214. self._features[fname])
  215. # Case 5: copy from other
  216. else:
  217. self._features[fname] = otherval
  218. if trace:
  219. print ' '+'| '*depth+'|'
  220. print ' '+'| '*depth+'+-->'+`self`
  221. if len(bindings.bound_variables()) > 0:
  222. print ' '+'| '*depth+' '+`bindings`
  223. def __repr__(self):
  224. """
  225. @return: A string representation of this feature structure.
  226. """
  227. if self._memorepr is not None: return self._memorepr
  228. else: return self._repr({}, {})
  229. return self._memorepr
  230. def _repr(self, reentrances, reentrance_ids):
  231. segments = []
  232. items = self.feature_names()
  233. items.sort() # sorting note: keys are unique strings, so we'll
  234. # never fall through to comparing values.
  235. for fname in items:
  236. if fname == self.__class__.headname: continue
  237. fval = self[fname]
  238. if isinstance(fval, bool):
  239. if fval: segments.append('+%s' % fname)
  240. else: segments.append('-%s' % fname)
  241. elif not isinstance(fval, Category):
  242. segments.append('%s=%r' % (fname, fval))
  243. else:
  244. fval_repr = fval._repr(reentrances, reentrance_ids)
  245. segments.append('%s=%s' % (fname, fval_repr))
  246. head = self._features.get(self.__class__.headname)
  247. if head is None: head = ''
  248. if head and not len(segments): return head
  249. return '%s[%s]' % (head, ', '.join(segments))
  250. def _str(self, reentrances, reentrance_ids):
  251. # Special case:
  252. if len(self.feature_names()) == 0:
  253. return ['[]']
  254. if self.feature_names() == [self.__class__.headname]:
  255. return ['%s[]' % self[self.__class__.headname]]
  256. # What's the longest feature name? Use this to align names.
  257. maxfnamelen = max([len(k) for k in self.feature_names()])
  258. lines = []
  259. items = self.feature_names()
  260. items.sort() # sorting note: keys are unique strings, so we'll
  261. # never fall through to comparing values.
  262. if self.__class__.headname in items:
  263. items.remove(self.__class__.headname)
  264. # items.insert(0, self.__class__.headname)
  265. for fname in items:
  266. fval = self[fname]
  267. if not isinstance(fval, FeatureStructure):
  268. # It's not a nested feature structure -- just print it.
  269. lines.append('%s = %r' % (fname.ljust(maxfnamelen), fval))
  270. else:
  271. # It's a new feature structure. Separate it from
  272. # other values by a blank line.
  273. if lines and lines[-1] != '': lines.append('')
  274. # Recursively print the feature's value (fval).
  275. fval_lines = fval._str(reentrances, reentrance_ids)
  276. # Indent each line to make room for fname.
  277. fval_lines = [(' '*(maxfnamelen+3))+l for l in fval_lines]
  278. # Pick which line we'll display fname on.
  279. nameline = (len(fval_lines)-1)/2
  280. fval_lines[nameline] = (
  281. fname.ljust(maxfnamelen)+' ='+
  282. fval_lines[nameline][maxfnamelen+2:])
  283. # Add the feature structure to the output.
  284. lines += fval_lines
  285. # Separate FeatureStructures by a blank line.
  286. lines.append('')
  287. # Get rid of any excess blank lines.
  288. if lines[-1] == '': lines = lines[:-1]
  289. # Add brackets around everything.
  290. headline = (len(lines) - 1)/2
  291. if self.has_feature(self.__class__.headname):
  292. head = self[self.__class__.headname]
  293. else: head = ''
  294. maxlen = max([len(line) for line in lines])
  295. for l in range(len(lines)):
  296. line = lines[l]
  297. if l == headline:
  298. lines[l] = ('%s[ %s%s ]' % (head, line, ' '*(maxlen-len(line))))
  299. else:
  300. lines[l] = ('%s[ %s%s ]' % (' '*len(head), line, ' '*(maxlen-len(line))))
  301. return lines
  302. # Regular expressions for parsing.
  303. _PARSE_RE = {'name': re.compile(r'\s*([^\s\(\)"\'\-=,\[\]]+)\s*'),
  304. 'categorystart': re.compile(r'\s*([^\s\(\)"\'\-=,\[\]]*)\s*\['),
  305. 'bool': re.compile(r'\s*([-\+])'),
  306. 'ident': re.compile(r'\s*\((\d+)\)\s*'),
  307. 'arrow': re.compile(r'\s*->\s*'),
  308. 'assign': re.compile(r'\s*=\s*'),
  309. 'bracket': re.compile(r'\s*]\s*'),
  310. 'comma': re.compile(r'\s*,\s*'),
  311. 'none': re.compile(r'None(?=\s|\]|,)'),
  312. 'int': re.compile(r'-?\d+(?=\s|\]|,)'),
  313. 'var': re.compile(r'\?[a-zA-Z_][a-zA-Z0-9_]*'+'|'+
  314. r'\?<[a-zA-Z_][a-zA-Z0-9_]*'+
  315. r'(=[a-zA-Z_][a-zA-Z0-9_]*)*>'),
  316. 'symbol': re.compile(r'\w+'),
  317. 'disjunct': re.compile(r'\s*\|\s*'),
  318. 'whitespace': re.compile(r'\s*'),
  319. 'stringmarker': re.compile("['\"\\\\]")}
  320. def parse(cls, s):
  321. """
  322. Convert a string representation of a feature structure (as
  323. displayed by C{repr}) into a C{Category}. This parse
  324. imposes the following restrictions on the string
  325. representation:
  326. - Feature names cannot contain any of the following:
  327. whitespace, parenthases, quote marks, equals signs,
  328. dashes, plus signs, and square brackets.
  329. - Only the following basic feature value are supported:
  330. strings, integers, variables, C{None}, C{True}, C{False},
  331. and unquoted alphanumeric strings.
  332. - A feature named C{foo} with a value of C{True} or C{False} should
  333. be expressed as C{+foo} or C{-foo} respectively, not as
  334. C{foo=True} or C{foo=False}.
  335. """
  336. try:
  337. value, position = cls._parse(s, 0, {})
  338. except ValueError, e:
  339. estr = ('Error parsing field structure\n\n\t' +
  340. s + '\n\t' + ' '*e.args[1] + '^ ' +
  341. 'Expected %s\n' % e.args[0])
  342. raise ValueError, estr
  343. if position != len(s): raise ValueError()
  344. return value
  345. def _parse(cls, s, position=0, reentrances=None):
  346. """
  347. Helper function that parses a Category.
  348. @param s: The string to parse.
  349. @param position: The position in the string to start parsing.
  350. @param reentrances: A dictionary from reentrance ids to values.
  351. @return: A tuple (val, pos) of the feature structure created
  352. by parsing and the position where the parsed feature
  353. structure ends.
  354. """
  355. # A set of useful regular expressions (precompiled)
  356. _PARSE_RE = cls._PARSE_RE
  357. # Find the head, if there is one.
  358. match = _PARSE_RE['name'].match(s, position)
  359. if match is not None:
  360. head = match.group(1)
  361. position = match.end()
  362. else: head = None
  363. # Check that the name is followed by an open bracket.
  364. if position >= len(s) or s[position] != '[':
  365. return cls(**{cls.headname: head}), position
  366. position += 1
  367. # If it's immediately followed by a close bracket, then just
  368. # return an empty feature structure.
  369. match = _PARSE_RE['bracket'].match(s, position)
  370. if match is not None:
  371. if head is None: return cls(), match.end()
  372. else: return cls(**{cls.headname: head}), match.end()
  373. # Build a list of the features defined by the structure.
  374. # Each feature has one of the three following forms:
  375. # name = value
  376. # +name
  377. # -name
  378. features = {}
  379. if head is not None: features[cls.headname] = head
  380. while position < len(s):
  381. # Use these variables to hold info about the feature:
  382. name = target = val = None
  383. # Is this a shorthand boolean value?
  384. match = _PARSE_RE['bool'].match(s, position)
  385. if match is not None:
  386. if match.group(1) == '+': val = True
  387. else: val = False
  388. position = match.end()
  389. # Find the next feature's name.
  390. match = _PARSE_RE['name'].match(s, position)
  391. if match is None: raise ValueError('feature name', position)
  392. name = match.group(1)
  393. position = match.end()
  394. # If it's not a shorthand boolean, it must be an assignment.
  395. if val is None:
  396. match = _PARSE_RE['assign'].match(s, position)
  397. if match is None: raise ValueError('equals sign', position)
  398. position = match.end()
  399. val, position = cls._parseval(s, position, reentrances)
  400. features[name] = val
  401. # Check for a close bracket
  402. match = _PARSE_RE['bracket'].match(s, position)
  403. if match is not None:
  404. return cls(**features), match.end()
  405. # Otherwise, there should be a comma
  406. match = _PARSE_RE['comma'].match(s, position)
  407. if match is None: raise ValueError('comma', position)
  408. position = match.end()
  409. # We never saw a close bracket.
  410. raise ValueError('close bracket', position)
  411. def _parseval(cls, s, position, reentrances):
  412. """
  413. Helper function that parses a feature value. Currently
  414. supports: None, bools, integers, variables, strings, nested feature
  415. structures.
  416. @param s: The string to parse.
  417. @param position: The position in the string to start parsing.
  418. @param reentrances: A dictionary from reentrance ids to values.
  419. @return: A tuple (val, pos) of the value created by parsing
  420. and the position where the parsed value ends.
  421. """
  422. # A set of useful regular expressions (precompiled)
  423. _PARSE_RE = cls._PARSE_RE
  424. # End of string (error)
  425. if position == len(s): raise ValueError('value', position)
  426. # String value
  427. if s[position] in "'\"":
  428. start = position
  429. quotemark = s[position:position+1]
  430. position += 1
  431. while 1:
  432. match = _PARSE_RE['stringmarker'].search(s, position)
  433. if not match: raise ValueError('close quote', position)
  434. position = match.end()
  435. if match.group() == '\\': position += 1
  436. elif match.group() == quotemark:
  437. return eval(s[start:position]), position
  438. # Nested category
  439. if _PARSE_RE['categorystart'].match(s, position) is not None:
  440. return cls._parse(s, position, reentrances)
  441. # Variable
  442. match = _PARSE_RE['var'].match(s, position)
  443. if match is not None:
  444. return FeatureVariable.parse(match.group()), match.end()
  445. # None
  446. match = _PARSE_RE['none'].match(s, position)
  447. if match is not None:
  448. return None, match.end()
  449. # Integer value
  450. match = _PARSE_RE['int'].match(s, position)
  451. if match is not None:
  452. return int(match.group()), match.end()
  453. # Alphanumeric symbol (must be checked after integer)
  454. match = _PARSE_RE['symbol'].match(s, position)
  455. if match is not None:
  456. return cls(**{cls.headname: match.group()}), match.end()
  457. # We don't know how to parse this value.
  458. raise ValueError('value', position)
  459. def parse_rules(cls, s):
  460. """
  461. Parse a L{CFG} line involving C{Categories}. A line has this form:
  462. C{lhs -> rhs | rhs | ...}
  463. where C{lhs} is a Category, and each C{rhs} is a sequence of
  464. Categories.
  465. @returns: a list of C{CFGProductions}, one for each C{rhs}.
  466. """
  467. _PARSE_RE = cls._PARSE_RE
  468. position = 0
  469. try:
  470. lhs, position = cls._parse(s, position)
  471. except ValueError, e:
  472. estr = ('Error parsing field structure\n\n\t' +
  473. s + '\n\t' + ' '*e.args[1] + '^ ' +
  474. 'Expected %s\n' % e.args[0])
  475. raise ValueError, estr
  476. lhs.freeze()
  477. match = _PARSE_RE['arrow'].match(s, position)
  478. if match is None: raise ValueError('arrow', position)
  479. else: position = match.end()
  480. rules = []
  481. while position < len(s):
  482. rhs = []
  483. while position < len(s) and _PARSE_RE['disjunct'].match(s, position) is None:
  484. try:
  485. val, position = cls._parseval(s, position, {})
  486. except ValueError, e:
  487. estr = ('Error parsing field structure\n\n\t' +
  488. s + '\n\t' + ' '*e.args[1] + '^ ' +
  489. 'Expected %s\n' % e.args[0])
  490. raise ValueError, estr
  491. if isinstance(val, Category): val.freeze()
  492. rhs.append(val)
  493. position = _PARSE_RE['whitespace'].match(s, position).end()
  494. rules.append(CFGProduction(lhs, rhs))
  495. if position < len(s):
  496. match = _PARSE_RE['disjunct'].match(s, position)
  497. position = match.end()
  498. # Special case: if there's nothing after the arrow, it is one rule with
  499. # an empty RHS, instead of no rules.
  500. if len(rules) == 0: rules = [CFGProduction(lhs, ())]
  501. return rules
  502. _parseval=classmethod(_parseval)
  503. _parse=classmethod(_parse)
  504. parse=classmethod(parse)
  505. parse_rules=classmethod(parse_rules)
  506. class GrammarCategory(Category):
  507. """
  508. A class of C{Category} for use in parsing.
  509. The name of the head feature in a C{GrammarCategory} is C{pos} (for "part
  510. of speech"). There is one required feature, C{/}, which is intended to
  511. indicate a type of phrase that is missing from the grammatical structure.
  512. In addition, GrammarCategories are displayed and parse differently, to be
  513. consistent with NLP teaching materials: the value of the C{/} feature can
  514. be written with a slash after the right bracket, so that the string
  515. representation looks like: C{head[...]/value}.
  516. An example of a C{GrammarCategory} is C{VP[+fin]/NP}, for a verb phrase
  517. that is finite and has an omitted noun phrase inside it.
  518. """
  519. headname = 'pos'
  520. requiredFeatures = ['/']
  521. def _repr(self, reentrances, reentrance_ids):
  522. segments = []
  523. items = self.feature_names()
  524. items.sort() # sorting note: keys are unique strings, so we'll
  525. # never fall through to comparing values.
  526. for fname in items:
  527. if fname == self.__class__.headname or fname == '/': continue
  528. fval = self[fname]
  529. if isinstance(fval, bool):
  530. if fval: segments.append('+%s' % fname)
  531. else: segments.append('-%s' % fname)
  532. elif not isinstance(fval, Category):
  533. segments.append('%s=%r' % (fname, fval))
  534. else:
  535. fval_repr = fval._repr(reentrances, reentrance_ids)
  536. segments.append('%s=%s' % (fname, fval_repr))
  537. head = self._features.get(self.__class__.headname)
  538. if head is None: head = ''
  539. if not len(segments): features = ''
  540. else: features = "[%s]" % ', '.join(segments)
  541. slash = self._features.get('/')
  542. if slash is None: slash = ''
  543. else: slash = '/%r' % slash
  544. return '%s%s%s' % (head, features, slash)
  545. _PARSE_RE = {'name': re.compile(r'\s*([^\s\(\)"\'\-=,\[\]/]+)\s*'),
  546. 'categorystart': re.compile(r'\s*([^\s\(\)"\'\-=,\[\]/]*)\s*(\[|/)'),
  547. 'bool': re.compile(r'\s*([-\+])'),
  548. 'ident': re.compile(r'\s*\((\d+)\)\s*'),
  549. 'arrow': re.compile(r'\s*->\s*'),
  550. 'assign': re.compile(r'\s*=\s*'),
  551. 'bracket': re.compile(r'\s*]\s*'),
  552. 'comma': re.compile(r'\s*,\s*'),
  553. 'none': re.compile(r'None(?=\s|\]|,)'),
  554. 'int': re.compile(r'-?\d+(?=\s|\]|,)'),
  555. 'var': re.compile(r'\?[a-zA-Z_][a-zA-Z0-9_]*'+'|'+
  556. r'\?<[a-zA-Z_][a-zA-Z0-9_]*'+
  557. r'(=[a-zA-Z_][a-zA-Z0-9_]*)*>'),
  558. 'symbol': re.compile(r'\w+'),
  559. 'disjunct': re.compile(r'\s*\|\s*'),
  560. 'slash': re.compile(r'\s*/\s*'),
  561. 'whitespace': re.compile(r'\s*'),
  562. 'stringmarker': re.compile("['\"\\\\]")}
  563. def _parse(cls, s, position=0, reentrances=None):
  564. # A set of useful regular expressions (precompiled)
  565. _PARSE_RE = cls._PARSE_RE
  566. features = {}
  567. # Find the head, if there is one.
  568. match = _PARSE_RE['name'].match(s, position)
  569. if match is not None:
  570. features[cls.headname] = match.group(1)
  571. position = match.end()
  572. # If the name is followed by an open bracket, start looking for
  573. # features.
  574. if position < len(s) and s[position] == '[':
  575. position += 1
  576. # Build a list of the features defined by the structure.
  577. # Each feature has one of the three following forms:
  578. # name = value
  579. # +name
  580. # -name
  581. while True:
  582. if not position < len(s):
  583. raise ValueError('close bracket', position)
  584. # Use these variables to hold info about the feature:
  585. name = target = val = None
  586. # Check for a close bracket at the beginning
  587. match = _PARSE_RE['bracket'].match(s, position)
  588. if match is not None:
  589. position = match.end()
  590. # Get out and check for a slash value.
  591. break
  592. # Is this a shorthand boolean value?
  593. match = _PARSE_RE['bool'].match(s, position)
  594. if match is not None:
  595. if match.group(1) == '+': val = True
  596. else: val = False
  597. position = match.end()
  598. # Find the next feature's name.
  599. match = _PARSE_RE['name'].match(s, position)
  600. if match is None: raise ValueError('feature name', position)
  601. name = match.group(1)
  602. position = match.end()
  603. # If it's not a shorthand boolean, it must be an assignment.
  604. if val is None:
  605. match = _PARSE_RE['assign'].match(s, position)
  606. if match is None: raise ValueError('equals sign', position)
  607. position = match.end()
  608. val, position = cls._parseval(s, position, reentrances)
  609. features[name] = val
  610. # Check for a close bracket
  611. match = _PARSE_RE['bracket'].match(s, position)
  612. if match is not None:
  613. position = match.end()
  614. # Get out and check for a slash value.
  615. break
  616. # Otherwise, there should be a comma
  617. match = _PARSE_RE['comma'].match(s, position)
  618. if match is None: raise ValueError('comma', position)
  619. position = match.end()
  620. # Check for a slash value
  621. match = _PARSE_RE['slash'].match(s, position)
  622. if match is not None:
  623. position = match.end()
  624. slash, position = cls._parseval(s, position, 0)
  625. features['/'] = slash
  626. return cls(**features), position
  627. _parse = classmethod(_parse)
  628. # vim:ts=4:sts=4:et:nowrap: