PageRenderTime 37ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 0ms

/Products/CMFPlone/UnicodeSplitter/config.py

https://github.com/emanlove/Products.CMFPlone
Python | 55 lines | 26 code | 8 blank | 21 comment | 0 complexity | 2c440163867323988bda3b987b07750a MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause
  1. # -*- coding: utf-8 -*-
  2. """
  3. config.py
  4. Created by Manabu Terada, CMScom on 2009-08-08.
  5. """
  6. import re
  7. STOP_WORD = []
  8. ## Setting, adding langs.
  9. rangetable = dict(
  10. # ascii=u"a-zA-Z0-9_",
  11. # digit=u"\d",
  12. # U+AC00-D7AF Hangul Syllables ハングル音節文字
  13. hangul=u"\uAC00-\uD7AF",
  14. # U+30A0-30FF Katakana 片仮名
  15. # U+3040-309F Hiragana 平仮名
  16. # kana=u"\u3040-\u30FF",
  17. # hiragana=u"\u3040-\u309F\u30FC",
  18. # katakana=u"\u30A0-\u30FF",
  19. # U+4E00-9FFF CJK Unified Ideographs CJK統合漢字
  20. # U+3400-4DBF CJK Unified Ideographs Extension A CJK統合漢字拡張A
  21. # U+F900-FAFF CJK Compatibility Ideographs CJK互換漢字
  22. # ideo=u"\u4E00-\u9FFF\u3400-\u4DBF\uF900-\uFAFF",
  23. cj=u"\u3040-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\uF900-\uFAFF",
  24. thai=u"\u0E00-\u0E7F", # U+0E00-0E7F Thai タイ文字
  25. )
  26. ## End of setting.
  27. ## Splitting core.
  28. ps = rangetable.values()
  29. allp = u"".join(ps)
  30. glob_true = u"[^%s]([^%s]|[\*\?])*|" \
  31. % (allp, allp) + u"|".join([u"[%s]+" % (x, ) for x in ps])
  32. glob_false = u"[^%s]+|" % allp + u"|".join(u"[%s]+" % x for x in ps)
  33. rx_all = re.compile(ur"[%s]" % allp, re.UNICODE)
  34. rx_U = re.compile(r"\w+", re.UNICODE)
  35. rxGlob_U = re.compile(r"\w+[\w*?]*", re.UNICODE)
  36. rx_L = re.compile(r"\w+", re.LOCALE)
  37. rxGlob_L = re.compile(r"\w+[\w*?]*", re.LOCALE)
  38. # pattern = re.compile(u"[a-zA-Z0-9_]+|[\uac00-\ud7af]+|[\u4E00-\u9FFF\u3400-\u4dbf\uf900-\ufaff\u3040-\u30ff]+", re.UNICODE)
  39. # pattern_g = re.compile(u"[a-zA-Z0-9_]+[*?]*|[\u4E00-\u9FFF\u3400-\u4dbf\uf900-\ufaff\u3040-\u30ff\uac00-\ud7af]+[*?]*", re.UNICODE)
  40. pattern = re.compile(glob_false, re.UNICODE)
  41. pattern_g = re.compile(glob_true, re.UNICODE)