/plone/i18n/normalizer/__init__.py

https://github.com/plone/plone.i18n · Python · 215 lines · 109 code · 40 blank · 66 comment · 25 complexity · 83d573684e6c2eb759813d2271ffb27c MD5 · raw file

  1. from .base import baseNormalize
  2. from .interfaces import IFileNameNormalizer
  3. from .interfaces import IIDNormalizer
  4. from .interfaces import IURLNormalizer
  5. from zope.component import queryUtility
  6. from zope.interface import implementer
  7. import re
  8. # Define and compile static regexes
  9. FILENAME_REGEX = re.compile(r"^(.+)\.(\w{,4})$")
  10. IGNORE_REGEX = re.compile(r"['\"]")
  11. NON_WORD_REGEX = re.compile(r"[\W\-]+")
  12. DANGEROUS_CHARS_REGEX = re.compile(r"[!$%&()*+,/:;<=>?@\\^{|}\[\]~`]+")
  13. URL_DANGEROUS_CHARS_REGEX = re.compile(r"[!#$%&()*+,/:;<=>?@\\^{|}\[\]~`]+")
  14. MULTIPLE_DASHES_REGEX = re.compile(r"\-+")
  15. EXTRA_DASHES_REGEX = re.compile(r"(^\-+)|(\-+$)")
  16. UNDERSCORE_START_REGEX = re.compile(r"(^_+)(.*)$")
  17. LOCALE_SPLIT_REGEX = re.compile(r"[_-]")
  18. # Define static constraints
  19. MAX_LENGTH = 50
  20. MAX_FILENAME_LENGTH = 1023
  21. MAX_URL_LENGTH = 255
  22. def cropName(base, maxLength=MAX_LENGTH):
  23. baseLength = len(base)
  24. index = baseLength
  25. while index > maxLength:
  26. index = base.rfind("-", 0, index)
  27. if index == -1 and baseLength > maxLength:
  28. base = base[:maxLength]
  29. elif index > 0:
  30. base = base[:index]
  31. return base
  32. @implementer(IIDNormalizer)
  33. class IDNormalizer:
  34. """
  35. This normalizer can normalize any unicode string and returns a
  36. version that only contains of ASCII characters allowed in a typical
  37. scripting or programming language id, such as CSS class names or Python
  38. variable names for example.
  39. Let's make sure that this implementation actually fulfills the API.
  40. >>> from zope.interface.verify import verifyClass
  41. >>> verifyClass(IIDNormalizer, IDNormalizer)
  42. True
  43. """
  44. def normalize(self, text, locale=None, max_length=MAX_LENGTH):
  45. """
  46. Returns a normalized text. text has to be a unicode string and locale
  47. should be a normal locale, for example: 'pt-BR', 'sr@Latn' or 'de'
  48. """
  49. if locale is not None:
  50. # Try to get a normalizer for the locale
  51. util = queryUtility(IIDNormalizer, name=locale)
  52. parts = LOCALE_SPLIT_REGEX.split(locale)
  53. if util is None and len(parts) > 1:
  54. # Try to get a normalizer for the base language if we asked
  55. # for one for a language/country combination and found none
  56. util = queryUtility(IIDNormalizer, name=parts[0])
  57. # be defensive: if queryUtility() returns an instance of the same
  58. # normalizer class as this one, we'll loop forever until
  59. # "RuntimeError: maximum recursion depth exceeded" (ticket #11630)
  60. if util is not None and util.__class__ is not self.__class__:
  61. text = util.normalize(text, locale=locale)
  62. text = baseNormalize(text)
  63. # lowercase text
  64. text = text.lower()
  65. text = IGNORE_REGEX.sub("", text)
  66. text = NON_WORD_REGEX.sub("-", text)
  67. text = MULTIPLE_DASHES_REGEX.sub("-", text)
  68. text = EXTRA_DASHES_REGEX.sub("", text)
  69. return cropName(text, maxLength=max_length)
  70. @implementer(IFileNameNormalizer)
  71. class FileNameNormalizer:
  72. """
  73. This normalizer can normalize any unicode string and returns a version
  74. that only contains of ASCII characters allowed in a file name.
  75. Let's make sure that this implementation actually fulfills the API.
  76. >>> from zope.interface.verify import verifyClass
  77. >>> verifyClass(IFileNameNormalizer, FileNameNormalizer)
  78. True
  79. """
  80. def normalize(self, text, locale=None, max_length=MAX_FILENAME_LENGTH):
  81. """
  82. Returns a normalized text. text has to be a unicode string and locale
  83. should be a normal locale, for example: 'pt-BR', 'sr@Latn' or 'de'
  84. """
  85. if locale is not None:
  86. # Try to get a normalizer for the locale
  87. util = queryUtility(IFileNameNormalizer, name=locale)
  88. parts = LOCALE_SPLIT_REGEX.split(locale)
  89. if util is None and len(parts) > 1:
  90. # Try to get a normalizer for the base language if we asked
  91. # for one for a language/country combination and found none
  92. util = queryUtility(IFileNameNormalizer, name=parts[0])
  93. # be defensive: if queryUtility() returns an instance of the same
  94. # normalizer class as this one, we'll loop forever until
  95. # "RuntimeError: maximum recursion depth exceeded" (ticket #11630)
  96. if util is not None and util.__class__ is not self.__class__:
  97. text = util.normalize(text, locale=locale)
  98. # Preserve filename extensions
  99. text = baseNormalize(text)
  100. # Remove any leading underscores
  101. m = UNDERSCORE_START_REGEX.match(text)
  102. if m is not None:
  103. text = m.groups()[1]
  104. base = text
  105. ext = ""
  106. m = FILENAME_REGEX.match(text)
  107. if m is not None:
  108. base = m.groups()[0]
  109. ext = m.groups()[1]
  110. base = IGNORE_REGEX.sub("", base)
  111. base = DANGEROUS_CHARS_REGEX.sub("-", base)
  112. base = EXTRA_DASHES_REGEX.sub("", base)
  113. base = MULTIPLE_DASHES_REGEX.sub("-", base)
  114. base = cropName(base, maxLength=max_length)
  115. if ext != "":
  116. base = base + "." + ext
  117. return base
  118. @implementer(IURLNormalizer)
  119. class URLNormalizer:
  120. """
  121. This normalizer can normalize any unicode string and returns a URL-safe
  122. version that only contains of ASCII characters allowed in a URL.
  123. Let's make sure that this implementation actually fulfills the API.
  124. >>> from zope.interface.verify import verifyClass
  125. >>> verifyClass(IURLNormalizer, URLNormalizer)
  126. True
  127. """
  128. def normalize(self, text, locale=None, max_length=MAX_URL_LENGTH):
  129. """
  130. Returns a normalized text. text has to be a unicode string and locale
  131. should be a normal locale, for example: 'pt-BR', 'sr@Latn' or 'de'
  132. """
  133. if locale is not None:
  134. # Try to get a normalizer for the locale
  135. util = queryUtility(IURLNormalizer, name=locale)
  136. parts = LOCALE_SPLIT_REGEX.split(locale)
  137. if util is None and len(parts) > 1:
  138. # Try to get a normalizer for the base language if we asked
  139. # for one for a language/country combination and found none
  140. util = queryUtility(IURLNormalizer, name=parts[0])
  141. # be defensive: if queryUtility() returns an instance of the same
  142. # normalizer class as this one, we'll loop forever until
  143. # "RuntimeError: maximum recursion depth exceeded" (ticket #11630)
  144. if util is not None and util.__class__ is not self.__class__:
  145. text = util.normalize(text, locale=locale)
  146. text = baseNormalize(text)
  147. # Remove any leading underscores
  148. m = UNDERSCORE_START_REGEX.match(text)
  149. if m is not None:
  150. text = m.groups()[1]
  151. # lowercase text
  152. base = text.lower()
  153. ext = ""
  154. m = FILENAME_REGEX.match(base)
  155. if m is not None:
  156. base = m.groups()[0]
  157. ext = m.groups()[1]
  158. base = IGNORE_REGEX.sub("", base)
  159. base = NON_WORD_REGEX.sub("-", base)
  160. base = URL_DANGEROUS_CHARS_REGEX.sub("-", base)
  161. base = EXTRA_DASHES_REGEX.sub("", base)
  162. base = MULTIPLE_DASHES_REGEX.sub("-", base)
  163. base = cropName(base, maxLength=max_length)
  164. if ext != "":
  165. base = base + "." + ext
  166. return base
  167. idnormalizer = IDNormalizer()
  168. filenamenormalizer = FileNameNormalizer()
  169. urlnormalizer = URLNormalizer()