PageRenderTime 54ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/vendor/rails/activesupport/test/multibyte_handler_test.rb

https://github.com/bricooke/my-biz-expenses
Ruby | 274 lines | 234 code | 31 blank | 9 comment | 1 complexity | 880413e7a0207dd4dafe62dac8a7ea47 MD5 | raw file
Possible License(s): CC-BY-SA-3.0, BSD-3-Clause
  1. require File.dirname(__FILE__) + '/abstract_unit'
  2. $KCODE = 'UTF8'
  3. class String
  4. # Unicode Inspect returns the codepoints of the string in hex
  5. def ui
  6. "#{self} " + ("[%s]" % unpack("U*").map{|cp| cp.to_s(16) }.join(' '))
  7. end unless ''.respond_to?(:ui)
  8. end
  9. module UTF8HandlingTest
  10. def common_setup
  11. # This is an ASCII string with some russian strings and a ligature. It's nicely calibrated, because
  12. # slicing it at some specific bytes will kill your characters if you use standard Ruby routines.
  13. # It has both capital and standard letters, so that we can test case conversions easily.
  14. # It has 26 charactes and 28 when the ligature gets split during normalization.
  15. @string = "Abcd Блå ffi бла бла бла бла"
  16. @string_kd = "Abcd Блå ffi бла бла бла бла"
  17. @string_kc = "Abcd Блå ffi бла бла бла бла"
  18. @string_c = "Abcd Блå ffi бла бла бла бла"
  19. @string_d = "Abcd Блå ffi бла бла бла бла"
  20. @bytestring = "\270\236\010\210\245" # Not UTF-8
  21. # Characters from the character classes as described in UAX #29
  22. @character_from_class = {
  23. :l => 0x1100, :v => 0x1160, :t => 0x11A8, :lv => 0xAC00, :lvt => 0xAC01, :cr => 0x000D, :lf => 0x000A,
  24. :extend => 0x094D, :n => 0x64
  25. }
  26. end
  27. def test_utf8_recognition
  28. assert ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(@string),
  29. "Should recognize as a valid UTF-8 string"
  30. assert !ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(@bytestring), "This is bytestring, not UTF-8"
  31. end
  32. def test_simple_normalization
  33. # Normalization of DEVANAGARI LETTER QA breaks when composition exclusion isn't used correctly
  34. assert_equal [0x915, 0x93c].pack('U*').ui, [0x915, 0x93c].pack('U*').chars.normalize(:c).to_s.ui
  35. null_byte_str = "Test\0test"
  36. assert_equal '', @handler.normalize(''), "Empty string should not break things"
  37. assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :kc).ui, "Null byte should remain"
  38. assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :c).ui, "Null byte should remain"
  39. assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :d).ui, "Null byte should remain"
  40. assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :kd).ui, "Null byte should remain"
  41. assert_equal null_byte_str.ui, @handler.decompose(null_byte_str).ui, "Null byte should remain"
  42. assert_equal null_byte_str.ui, @handler.compose(null_byte_str).ui, "Null byte should remain"
  43. comp_str = [
  44. 44, # LATIN CAPITAL LETTER D
  45. 307, # COMBINING DOT ABOVE
  46. 328, # COMBINING OGONEK
  47. 323 # COMBINING DOT BELOW
  48. ].pack("U*")
  49. norm_str_KC = [44,105,106,328,323].pack("U*")
  50. norm_str_C = [44,307,328,323].pack("U*")
  51. norm_str_D = [44,307,110,780,78,769].pack("U*")
  52. norm_str_KD = [44,105,106,110,780,78,769].pack("U*")
  53. assert_equal norm_str_KC.ui, @handler.normalize(comp_str, :kc).ui, "Should normalize KC"
  54. assert_equal norm_str_C.ui, @handler.normalize(comp_str, :c).ui, "Should normalize C"
  55. assert_equal norm_str_D.ui, @handler.normalize(comp_str, :d).ui, "Should normalize D"
  56. assert_equal norm_str_KD.ui, @handler.normalize(comp_str, :kd).ui, "Should normalize KD"
  57. assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.normalize(@bytestring) }
  58. end
  59. # Test for the Public Review Issue #29, bad explaination of composition might lead to a
  60. # bad implementation: http://www.unicode.org/review/pr-29.html
  61. def test_normalization_C_pri_29
  62. [
  63. [0x0B47, 0x0300, 0x0B3E],
  64. [0x1100, 0x0300, 0x1161]
  65. ].map { |c| c.pack('U*') }.each do |c|
  66. assert_equal c.ui, @handler.normalize(c, :c).ui, "Composition is implemented incorrectly"
  67. end
  68. end
  69. def test_casefolding
  70. simple_str = "abCdef"
  71. simple_str_upcase = "ABCDEF"
  72. simple_str_downcase = "abcdef"
  73. assert_equal '', @handler.downcase(@handler.upcase('')), "Empty string should not break things"
  74. assert_equal simple_str_upcase, @handler.upcase(simple_str), "should upcase properly"
  75. assert_equal simple_str_downcase, @handler.downcase(simple_str), "should downcase properly"
  76. assert_equal simple_str_downcase, @handler.downcase(@handler.upcase(simple_str_downcase)), "upcase and downcase should be mirrors"
  77. rus_str = "аБвгд\0f"
  78. rus_str_upcase = "АБВГД\0F"
  79. rus_str_downcase = "абвгд\0f"
  80. assert_equal rus_str_upcase, @handler.upcase(rus_str), "should upcase properly honoring null-byte"
  81. assert_equal rus_str_downcase, @handler.downcase(rus_str), "should downcase properly honoring null-byte"
  82. jap_str = "の埋め込み化対応はほぼ完成"
  83. assert_equal jap_str, @handler.upcase(jap_str), "Japanse has no upcase, should remain unchanged"
  84. assert_equal jap_str, @handler.downcase(jap_str), "Japanse has no downcase, should remain unchanged"
  85. assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.upcase(@bytestring) }
  86. end
  87. def test_capitalize
  88. { 'аБвг аБвг' => 'Абвг абвг',
  89. 'аБвг АБВГ' => 'Абвг абвг',
  90. 'АБВГ АБВГ' => 'Абвг абвг',
  91. '' => '' }.each do |f,t|
  92. assert_equal t, @handler.capitalize(f), "Capitalize should work as expected"
  93. end
  94. assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.capitalize(@bytestring) }
  95. end
  96. def test_translate_offset
  97. str = "Блaå" # [2, 2, 1, 2] bytes
  98. assert_equal 0, @handler.translate_offset('', 0), "Offset for an empty string makes no sense, return 0"
  99. assert_equal 0, @handler.translate_offset(str, 0), "First character, first byte"
  100. assert_equal 0, @handler.translate_offset(str, 1), "First character, second byte"
  101. assert_equal 1, @handler.translate_offset(str, 2), "Second character, third byte"
  102. assert_equal 1, @handler.translate_offset(str, 3), "Second character, fourth byte"
  103. assert_equal 2, @handler.translate_offset(str, 4), "Third character, fifth byte"
  104. assert_equal 3, @handler.translate_offset(str, 5), "Fourth character, sixth byte"
  105. assert_equal 3, @handler.translate_offset(str, 6), "Fourth character, seventh byte"
  106. assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.translate_offset(@bytestring, 3) }
  107. end
  108. def test_insert
  109. assert_equal '', @handler.insert('', 0, ''), "Empty string should not break things"
  110. assert_equal "Abcd Блå ffiБУМ бла бла бла бла", @handler.insert(@string, 10, "БУМ"),
  111. "Text should be inserted at right codepoints"
  112. assert_equal "Abcd Блå ffiБУМ бла бла бла бла", @string, "Insert should be destructive"
  113. assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) do
  114. @handler.insert(@bytestring, 2, "\210")
  115. end
  116. end
  117. def test_reverse
  118. str = "wБлåa \n"
  119. rev = "\n aåлБw"
  120. assert_equal '', @handler.reverse(''), "Empty string shouldn't change"
  121. assert_equal rev.ui, @handler.reverse(str).ui, "Should reverse properly"
  122. assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.reverse(@bytestring) }
  123. end
  124. def test_size
  125. assert_equal 0, @handler.size(''), "Empty string has size 0"
  126. assert_equal 26, @handler.size(@string), "String length should be 26"
  127. assert_equal 26, @handler.length(@string), "String length method should be properly aliased"
  128. assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.size(@bytestring) }
  129. end
  130. def test_slice
  131. assert_equal 0x41, @handler.slice(@string, 0), "Singular characters should return codepoints"
  132. assert_equal 0xE5, @handler.slice(@string, 7), "Singular characters should return codepoints"
  133. assert_equal nil, @handler.slice('', -1..1), "Broken range should return nil"
  134. assert_equal '', @handler.slice('', 0..10), "Empty string should not break things"
  135. assert_equal "d Блå ffi", @handler.slice(@string, 3..9), "Unicode characters have to be returned"
  136. assert_equal "d Блå ffi", @handler.slice(@string, 3, 7), "Unicode characters have to be returned"
  137. assert_equal "A", @handler.slice(@string, 0, 1), "Slicing from an offset should return characters"
  138. assert_equal " Блå ffi ", @handler.slice(@string, 4..10), "Unicode characters have to be returned"
  139. assert_equal "", @handler.slice(@string, 7..6), "Range is empty, should return an empty string"
  140. assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.slice(@bytestring, 2..3) }
  141. end
  142. def test_grapheme_cluster_length
  143. assert_equal 0, @handler.g_length(''), "String should count 0 grapheme clusters"
  144. assert_equal 2, @handler.g_length([0x0924, 0x094D, 0x0930].pack('U*')), "String should count 2 grapheme clusters"
  145. assert_equal 1, @handler.g_length(string_from_classes(%w(cr lf))), "Don't cut between CR and LF"
  146. assert_equal 1, @handler.g_length(string_from_classes(%w(l l))), "Don't cut between L"
  147. assert_equal 1, @handler.g_length(string_from_classes(%w(l v))), "Don't cut between L and V"
  148. assert_equal 1, @handler.g_length(string_from_classes(%w(l lv))), "Don't cut between L and LV"
  149. assert_equal 1, @handler.g_length(string_from_classes(%w(l lvt))), "Don't cut between L and LVT"
  150. assert_equal 1, @handler.g_length(string_from_classes(%w(lv v))), "Don't cut between LV and V"
  151. assert_equal 1, @handler.g_length(string_from_classes(%w(lv t))), "Don't cut between LV and T"
  152. assert_equal 1, @handler.g_length(string_from_classes(%w(v v))), "Don't cut between V and V"
  153. assert_equal 1, @handler.g_length(string_from_classes(%w(v t))), "Don't cut between V and T"
  154. assert_equal 1, @handler.g_length(string_from_classes(%w(lvt t))), "Don't cut between LVT and T"
  155. assert_equal 1, @handler.g_length(string_from_classes(%w(t t))), "Don't cut between T and T"
  156. assert_equal 1, @handler.g_length(string_from_classes(%w(n extend))), "Don't cut before Extend"
  157. assert_equal 2, @handler.g_length(string_from_classes(%w(n n))), "Cut between normal characters"
  158. assert_equal 3, @handler.g_length(string_from_classes(%w(n cr lf n))), "Don't cut between CR and LF"
  159. assert_equal 2, @handler.g_length(string_from_classes(%w(n l v t))), "Don't cut between L, V and T"
  160. assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.g_length(@bytestring) }
  161. end
  162. def test_index
  163. s = "Καλημέρα κόσμε!"
  164. assert_equal 0, @handler.index('', ''), "The empty string is always found at the beginning of the string"
  165. assert_equal 0, @handler.index('haystack', ''), "The empty string is always found at the beginning of the string"
  166. assert_equal 0, @handler.index(s, 'Κ'), "Greek K is at 0"
  167. assert_equal 1, @handler.index(s, 'α'), "Greek Alpha is at 1"
  168. assert_equal nil, @handler.index(@bytestring, 'a')
  169. assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.index(@bytestring, "\010") }
  170. end
  171. def test_strip
  172. # A unicode aware version of strip should strip all 26 types of whitespace. This includes the NO BREAK SPACE
  173. # aka BOM (byte order mark). The byte order mark has no place in UTF-8 because it's used to detect LE and BE.
  174. b = "\n" + [
  175. 32, # SPACE
  176. 8195, # EM SPACE
  177. 8199, # FIGURE SPACE,
  178. 8201, # THIN SPACE
  179. 8202, # HAIR SPACE
  180. 65279, # NO BREAK SPACE (ZW)
  181. ].pack('U*')
  182. m = "word блин\n\n\n word"
  183. e = [
  184. 65279, # NO BREAK SPACE (ZW)
  185. 8201, # THIN SPACE
  186. 8199, # FIGURE SPACE,
  187. 32, # SPACE
  188. ].pack('U*')
  189. string = b+m+e
  190. assert_equal '', @handler.strip(''), "Empty string should stay empty"
  191. assert_equal m+e, @handler.lstrip(string), "Whitespace should be gone on the left"
  192. assert_equal b+m, @handler.rstrip(string), "Whitespace should be gone on the right"
  193. assert_equal m, @handler.strip(string), "Whitespace should be stripped on both sides"
  194. bs = "\n #{@bytestring} \n\n"
  195. assert_equal @bytestring, @handler.strip(bs), "Invalid unicode strings should still strip"
  196. end
  197. def test_tidy_bytes
  198. result = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack('U*')
  199. assert_equal result, @handler.tidy_bytes(@bytestring)
  200. assert_equal "a#{result}a", @handler.tidy_bytes('a' + @bytestring + 'a'),
  201. 'tidy_bytes should leave surrounding characters intact'
  202. assert_equal "é#{result}é", @handler.tidy_bytes('é' + @bytestring + 'é'),
  203. 'tidy_bytes should leave surrounding characters intact'
  204. assert_nothing_raised { @handler.tidy_bytes(@bytestring).unpack('U*') }
  205. assert_equal "\xC3\xA7", @handler.tidy_bytes("\xE7") # iso_8859_1: small c cedilla
  206. assert_equal "\xC2\xA9", @handler.tidy_bytes("\xA9") # iso_8859_1: copyright symbol
  207. assert_equal "\xE2\x80\x9C", @handler.tidy_bytes("\x93") # win_1252: left smart quote
  208. assert_equal "\xE2\x82\xAC", @handler.tidy_bytes("\x80") # win_1252: euro
  209. assert_equal "\x00", @handler.tidy_bytes("\x00") # null char
  210. assert_equal [0xfffd].pack('U'), @handler.tidy_bytes("\xef\xbf\xbd") # invalid char
  211. end
  212. protected
  213. def string_from_classes(classes)
  214. classes.collect do |k|
  215. @character_from_class[k.intern]
  216. end.pack('U*')
  217. end
  218. end
  219. begin
  220. require_library_or_gem('utf8proc_native')
  221. require 'active_record/multibyte/handlers/utf8_handler_proc'
  222. class UTF8HandlingTestProc < Test::Unit::TestCase
  223. include UTF8HandlingTest
  224. def setup
  225. common_setup
  226. @handler = ::ActiveSupport::Multibyte::Handlers::UTF8HandlerProc
  227. end
  228. end
  229. rescue LoadError
  230. end
  231. class UTF8HandlingTestPure < Test::Unit::TestCase
  232. include UTF8HandlingTest
  233. def setup
  234. common_setup
  235. @handler = ::ActiveSupport::Multibyte::Handlers::UTF8Handler
  236. end
  237. end