PageRenderTime 43ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/twitter_text/regex.py

http://github.com/dryan/twitter-text-py
Python | 280 lines | 222 code | 27 blank | 31 comment | 5 complexity | 231ecaa28d55a390ef57268b04b324c3 MD5 | raw file
Possible License(s): Apache-2.0
# encoding=utf-8
# A collection of regular expressions for parsing Tweet text. The regular expression
# list is frozen at load time to ensure immutability. These regular expressions are
# used throughout the Twitter classes. Special care has been taken to make
# sure these regular expressions work with Tweets in all languages.
import re, string
# Maps a pattern name to a compiled regex (or, for a few entries, raw data
# such as the list of invalid control characters). # :nodoc:
REGEXEN = {}
  8. def regex_range(start, end = None):
  9. if end:
  10. return u'%s-%s' % (unichr(start), unichr(end))
  11. else:
  12. return u'%s' % unichr(start)
  13. # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
  14. # to access both the list of characters and a pattern suitible for use with String#split
  15. # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
  16. UNICODE_SPACES = []
  17. for space in reduce(lambda x,y: x + y if type(y) == list else x + [y], [
  18. range(0x0009, 0x000D), # White_Space # Cc [5] <control-0009>..<control-000D>
  19. 0x0020, # White_Space # Zs SPACE
  20. 0x0085, # White_Space # Cc <control-0085>
  21. 0x00A0, # White_Space # Zs NO-BREAK SPACE
  22. 0x1680, # White_Space # Zs OGHAM SPACE MARK
  23. 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
  24. range(0x2000, 0x200A), # White_Space # Zs [11] EN QUAD..HAIR SPACE
  25. 0x2028, # White_Space # Zl LINE SEPARATOR
  26. 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
  27. 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
  28. 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
  29. 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
  30. ]):
  31. UNICODE_SPACES.append(unichr(space))
  32. REGEXEN['spaces'] = re.compile(ur''.join(UNICODE_SPACES))
# Characters not allowed in Tweets
INVALID_CHARACTERS = [
    0xFFFE, 0xFEFF,                          # BOM
    0xFFFF,                                  # Special
    0x202A, 0x202B, 0x202C, 0x202D, 0x202E,  # Directional change
]
# Stored as a list of one-character unicode strings, joined into character
# classes by the URL patterns below.
REGEXEN['invalid_control_characters'] = [unichr(x) for x in INVALID_CHARACTERS]
# A Twitter list name: a letter, then up to 24 word chars, hyphens, or
# Latin-1 supplement chars (\u escapes are processed inside py2 ur'' literals).
REGEXEN['list_name'] = re.compile(ur'^[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}$')
# Latin accented characters
# Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
# Also excludes 0xf7, the division sign
LATIN_ACCENTS = [
    regex_range(0x00c0, 0x00d6),
    regex_range(0x00d8, 0x00f6),
    regex_range(0x00f8, 0x00ff),
    regex_range(0x0100, 0x024f), # Latin Extended-A and Extended-B
    regex_range(0x0253, 0x0254), # selected IPA Extensions letters
    regex_range(0x0256, 0x0257),
    regex_range(0x0259),
    regex_range(0x025b),
    regex_range(0x0263),
    regex_range(0x0268),
    regex_range(0x026f),
    regex_range(0x0272),
    regex_range(0x0289),
    regex_range(0x028b),
    regex_range(0x02bb),         # modifier letter turned comma (Hawaiian 'okina)
    regex_range(0x0300, 0x036f), # Combining Diacritical Marks
    regex_range(0x1e00, 0x1eff), # Latin Extended Additional
]
# NOTE: the joined fragments are character-class *contents*; the compiled
# regex exists so later patterns can interpolate its .pattern string into
# their own [...] classes.
REGEXEN['latin_accents'] = re.compile(ur''.join(LATIN_ACCENTS), re.IGNORECASE | re.UNICODE)
LATIN_ACCENTS = u''.join(LATIN_ACCENTS)
# Right-to-left script ranges (character-class contents), used to detect
# RTL text in Tweets.
RTL_CHARACTERS = ''.join([
    regex_range(0x0600,0x06FF), # Arabic
    regex_range(0x0750,0x077F), # Arabic Supplement
    regex_range(0x0590,0x05FF), # Hebrew
    regex_range(0xFE70,0xFEFF)  # Arabic Presentation Forms-B
])
# Non-Latin scripts allowed in hashtag bodies (character-class contents).
NON_LATIN_HASHTAG_CHARS = ''.join([
    # Cyrillic (Russian, Ukrainian, etc.)
    regex_range(0x0400, 0x04ff), # Cyrillic
    regex_range(0x0500, 0x0527), # Cyrillic Supplement
    regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
    regex_range(0xa640, 0xa69f), # Cyrillic Extended B
    regex_range(0x0591, 0x05bf), # Hebrew
    regex_range(0x05c1, 0x05c2),
    regex_range(0x05c4, 0x05c5),
    regex_range(0x05c7),
    regex_range(0x05d0, 0x05ea),
    regex_range(0x05f0, 0x05f4),
    regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
    regex_range(0xfb2a, 0xfb36),
    regex_range(0xfb38, 0xfb3c),
    regex_range(0xfb3e),
    regex_range(0xfb40, 0xfb41),
    regex_range(0xfb43, 0xfb44),
    regex_range(0xfb46, 0xfb4f),
    regex_range(0x0610, 0x061a), # Arabic
    regex_range(0x0620, 0x065f),
    regex_range(0x066e, 0x06d3),
    regex_range(0x06d5, 0x06dc),
    regex_range(0x06de, 0x06e8),
    regex_range(0x06ea, 0x06ef),
    regex_range(0x06fa, 0x06fc),
    regex_range(0x06ff),
    regex_range(0x0750, 0x077f), # Arabic Supplement
    regex_range(0x08a0),         # Arabic Extended A
    regex_range(0x08a2, 0x08ac),
    regex_range(0x08e4, 0x08fe),
    regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A
    regex_range(0xfbd3, 0xfd3d),
    regex_range(0xfd50, 0xfd8f),
    regex_range(0xfd92, 0xfdc7),
    regex_range(0xfdf0, 0xfdfb),
    regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B
    regex_range(0xfe76, 0xfefc),
    regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner
    regex_range(0x0e01, 0x0e3a), # Thai
    regex_range(0x0e40, 0x0e4e), # Thai (vowels/tone marks) -- NOTE(review): the
                                 # original comment said "Hangul (Korean)" here,
                                 # but Hangul begins with the Jamo block below.
    regex_range(0x1100, 0x11ff), # Hangul Jamo
    regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo
    regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A
    regex_range(0xAC00, 0xD7AF), # Hangul Syllables
    regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B
    regex_range(0xFFA1, 0xFFDC)  # Half-width Hangul
])
# Chinese/Japanese characters allowed in hashtag bodies (character-class contents).
CJ_HASHTAG_CHARACTERS = ''.join([
    regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
    regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
    regex_range(0xFF10, 0xFF19), regex_range(0xFF21, 0xFF3A), regex_range(0xFF41, 0xFF5A), # Latin (full-width)
    regex_range(0x3041, 0x3096), regex_range(0x3099, 0x309E), # Hiragana
    regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A)
    regex_range(0x4E00, 0x9FFF), # Kanji (Unified)
])
# Codepoints above U+FFFF need a wide (UCS-4) Python 2 build; on a narrow
# build unichr() raises ValueError for them, so they are added best-effort.
try:
    CJ_HASHTAG_CHARACTERS = ''.join([
        CJ_HASHTAG_CHARACTERS,
        regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
        regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
        regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
        regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
    ])
except ValueError:
    # this is a narrow python build so these extended Kanji characters won't work
    pass
# Raw character-class fragments, interpolated into [...] classes below.
PUNCTUATION_CHARS = ur'!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
SPACE_CHARS = ur" \t\n\x0B\f\r"
CTRL_CHARS = ur"\x00-\x1F\x7F"
  141. # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
  142. HASHTAG_ALPHA = ur'[a-z_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
  143. HASHTAG_ALPHANUMERIC = ur'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
  144. HASHTAG_BOUNDARY = ur'\A|\z|\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
  145. HASHTAG = re.compile(ur'(%s)(#|#)(%s*%s%s*)' % (HASHTAG_BOUNDARY, HASHTAG_ALPHANUMERIC, HASHTAG_ALPHA, HASHTAG_ALPHANUMERIC), re.IGNORECASE)
  146. REGEXEN['valid_hashtag'] = HASHTAG
# Rejects a candidate hashtag immediately followed by another hash sign or '://'.
REGEXEN['end_hashtag_match'] = re.compile(ur'\A(?:[##]|:\/\/)', re.IGNORECASE | re.UNICODE)
# Entirely digits; used to drop all-numeric "hashtags".
REGEXEN['numeric_only'] = re.compile(ur'^[\d]+$')
# Char before a mention may not be a word char or one of !#$%&*@ (ASCII or
# full-width '@'); 'RT'/'RT:' prefixes are also accepted.
REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#\$%&*@@]|^|RT:?)')
# ASCII and full-width at signs.
REGEXEN['at_signs'] = re.compile(ur'[@@]')
REGEXEN['valid_mention_or_list'] = re.compile(
    ur'(%s)' % REGEXEN['valid_mention_preceding_chars'].pattern.decode('utf-8') + # preceding character (byte pattern decoded for the unicode template; Python 2 only)
    ur'(%s)' % REGEXEN['at_signs'].pattern + # at mark
    ur'([a-zA-Z0-9_]{1,20})' + # screen name
    ur'(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?' # list (optional)
)
# An @reply: optional leading whitespace, at sign, then the screen name.
REGEXEN['valid_reply'] = re.compile(ur'^(?:[%s])*%s([a-zA-Z0-9_]{1,20})' % (REGEXEN['spaces'].pattern, REGEXEN['at_signs'].pattern), re.IGNORECASE | re.UNICODE)
# Used in Extractor for final filtering: a mention may not be directly
# followed by another at sign, a latin accented char, or '://'.
REGEXEN['end_mention_match'] = re.compile(ur'\A(?:%s|[%s]|:\/\/)' % (REGEXEN['at_signs'].pattern, REGEXEN['latin_accents'].pattern), re.IGNORECASE | re.UNICODE)
# URL related hash regex collection
# Char before a URL may not be alphanumeric, '@', '$' or '#' (ASCII or
# full-width), nor one of the invalid control characters.
REGEXEN['valid_url_preceding_chars'] = re.compile(ur'(?:[^A-Z0-9@@$##%s]|^)' % ur''.join(REGEXEN['invalid_control_characters']), re.IGNORECASE | re.UNICODE)
# A protocol-less URL may not directly follow one of these characters.
REGEXEN['invalid_url_without_protocol_preceding_chars'] = re.compile(ur'[-_.\/]$')
# Any char usable in a domain label: not punctuation, whitespace or control.
DOMAIN_VALID_CHARS = ur'[^%s%s%s%s%s]' % (PUNCTUATION_CHARS, SPACE_CHARS, CTRL_CHARS, ur''.join(REGEXEN['invalid_control_characters']), ur''.join(UNICODE_SPACES))
# A subdomain label ('_' and '-' allowed internally) with its trailing dot.
REGEXEN['valid_subdomain'] = re.compile(ur'(?:(?:%s(?:[_-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
# A domain label proper ('-' allowed internally, '_' not) with its trailing dot.
REGEXEN['valid_domain_name'] = re.compile(ur'(?:(?:%s(?:[-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
  166. REGEXEN['valid_gTLD'] = re.compile(ur'(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公>益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
  167. REGEXEN['valid_ccTLD'] = re.compile(ur'(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|中加坡|湾|台灣|新香港|한국)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
# Punycode-encoded (xn--) label in place of a TLD.
REGEXEN['valid_punycode'] = re.compile(ur'(?:xn--[0-9a-z]+)', re.IGNORECASE | re.UNICODE)
# Full domain: optional subdomains + a domain name + (gTLD | ccTLD | punycode).
REGEXEN['valid_domain'] = re.compile(ur'(?:%s*%s(?:%s|%s|%s))' % (REGEXEN['valid_subdomain'].pattern, REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
# This is used in Extractor
REGEXEN['valid_ascii_domain'] = re.compile(ur'(?:(?:[A-Za-z0-9\-_]|[%s])+\.)+(?:%s|%s|%s)' % (REGEXEN['latin_accents'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
# This is used in Extractor for stricter t.co URL extraction
REGEXEN['valid_tco_url'] = re.compile(ur'^https?:\/\/t\.co\/[a-z0-9]+', re.IGNORECASE | re.UNICODE)
# This is used in Extractor to filter out unwanted URLs.
REGEXEN['invalid_short_domain'] = re.compile(ur'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_ccTLD'].pattern), re.IGNORECASE | re.UNICODE)
REGEXEN['valid_port_number'] = re.compile(ur'[0-9]+')
# '%%' escapes the % formatting operator; the class itself contains one '%'.
REGEXEN['valid_general_url_path_chars'] = re.compile(ur"[a-z0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % LATIN_ACCENTS, re.IGNORECASE | re.UNICODE)
# Allow URL paths to contain balanced parens
#  1. Used in Wikipedia URLs like /Primer_(film)
#  2. Used in IIS sessions like /S(dfd346)/
REGEXEN['valid_url_balanced_parens'] = re.compile(ur'\(%s+\)' % REGEXEN['valid_general_url_path_chars'].pattern, re.IGNORECASE | re.UNICODE)
# Valid end-of-path characters (so /foo. does not gobble the period).
#  1. Allow =&# for empty URL parameters and other URL-join artifacts
REGEXEN['valid_url_path_ending_chars'] = re.compile(ur'[a-z0-9=_#\/\+\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE)
  185. REGEXEN['valid_url_path'] = re.compile(ur'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+\/))' % (REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_balanced_parens'].pattern, REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_path_ending_chars'].pattern, REGEXEN['valid_general_url_path_chars'].pattern), re.IGNORECASE | re.UNICODE)
# Chars allowed anywhere in a query string, and the stricter set it may end on.
REGEXEN['valid_url_query_chars'] = re.compile(ur"[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]", re.IGNORECASE | re.UNICODE)
REGEXEN['valid_url_query_ending_chars'] = re.compile(ur'[a-z0-9_&=#\/]', re.IGNORECASE | re.UNICODE)
REGEXEN['valid_url'] = re.compile(ur'((%s)((https?:\/\/)?(%s)(?::(%s))?(/%s*)?(\?%s*%s)?))' % (
    REGEXEN['valid_url_preceding_chars'].pattern,
    REGEXEN['valid_domain'].pattern,
    REGEXEN['valid_port_number'].pattern,
    REGEXEN['valid_url_path'].pattern,
    REGEXEN['valid_url_query_chars'].pattern,
    REGEXEN['valid_url_query_ending_chars'].pattern
), re.IGNORECASE | re.UNICODE)
# Matches
#  $1 total match
#  $2 Preceding character
#  $3 URL
#  $4 Protocol (optional)
#  $5 Domain(s)
#  $6 Port number (optional)
#  $7 URL Path and anchor
#  $8 Query String
# A cashtag symbol: 1-6 letters, optionally '.' or '_' plus 1-2 more letters.
REGEXEN['cashtag'] = re.compile(ur'[a-z]{1,6}(?:[._][a-z]{1,2})?', re.IGNORECASE)
# Groups: (space-or-start)(dollar-sign variant)(cashtag body); lookahead
# requires end of string, whitespace or punctuation afterwards.
REGEXEN['valid_cashtag'] = re.compile(ur'(^|[%s])(\$|$|﹩)(%s)(?=$|\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE)
# These URL validation pattern strings are based on the ABNF from RFC 3986
REGEXEN['validate_url_unreserved'] = re.compile(ur'[a-z0-9\-._~]', re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_pct_encoded'] = re.compile(ur'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_sub_delims'] = re.compile(ur"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE)
# pchar = unreserved / pct-encoded / sub-delims / ':' / '@' (plus '|' here).
REGEXEN['validate_url_pchar'] = re.compile(ur'(?:%s|%s|%s|[:\|@])' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_scheme'] = re.compile(ur'(?:[a-z][a-z0-9+\-.]*)', re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_userinfo'] = re.compile(ur'(?:%s|%s|%s|:)*' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
# A decimal octet 0-255 with no leading zeros.
REGEXEN['validate_url_dec_octet'] = re.compile(ur'(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))', re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_ipv4'] = re.compile(ur'(?:%s(?:\.%s){3})' % (REGEXEN['validate_url_dec_octet'].pattern, REGEXEN['validate_url_dec_octet'].pattern), re.IGNORECASE | re.UNICODE)
# Punting on real IPv6 validation for now
REGEXEN['validate_url_ipv6'] = re.compile(ur'(?:\[[a-f0-9:\.]+\])', re.IGNORECASE | re.UNICODE)
# Also punting on IPvFuture for now
REGEXEN['validate_url_ip'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ipv4'].pattern, REGEXEN['validate_url_ipv6'].pattern), re.IGNORECASE | re.UNICODE)
# This is more strict than the rfc specifies
REGEXEN['validate_url_subdomain_segment'] = re.compile(ur'(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_domain_segment'] = re.compile(ur'(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_domain_tld'] = re.compile(ur'(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_domain'] = re.compile(ur'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_subdomain_segment'].pattern, REGEXEN['validate_url_domain_segment'].pattern, REGEXEN['validate_url_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_host'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_domain'].pattern), re.IGNORECASE | re.UNICODE)
# Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
REGEXEN['validate_url_unicode_subdomain_segment'] = re.compile(ur'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_unicode_domain_segment'] = re.compile(ur'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_unicode_domain_tld'] = re.compile(ur'(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_unicode_domain'] = re.compile(ur'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_unicode_subdomain_segment'].pattern, REGEXEN['validate_url_unicode_domain_segment'].pattern, REGEXEN['validate_url_unicode_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_unicode_host'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_unicode_domain'].pattern), re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_port'] = re.compile(ur'[0-9]{1,5}')
# authority = [ userinfo "@" ] host [ ":" port ], with capturing groups.
REGEXEN['validate_url_unicode_authority'] = re.compile(ur'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_unicode_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_authority'] = re.compile(ur'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_path'] = re.compile(ur'(/%s*)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_query'] = re.compile(ur'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_fragment'] = re.compile(ur'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
# Modified version of RFC 3986 Appendix B
REGEXEN['validate_url_unencoded'] = re.compile(ur'\A(?:([^:/?#]+)://)?([^/?#]*)([^?#]*)(?:\?([^#]*))?(?:\#(.*))?\Z', re.IGNORECASE | re.UNICODE)
# One-or-more right-to-left characters anywhere in the text.
REGEXEN['rtl_chars'] = re.compile(ur'[%s]' % RTL_CHARACTERS, re.IGNORECASE | re.UNICODE)