PageRenderTime 44ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/test/csv/test_encodings.rb

http://github.com/ruby/ruby
Ruby | 350 lines | 285 code | 41 blank | 24 comment | 16 complexity | 671e43eacc48150160bc1c4ed82da82f MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause, AGPL-3.0
  1. # -*- coding: utf-8 -*-
  2. # frozen_string_literal: false
  3. require_relative "helper"
  4. class TestCSVEncodings < Test::Unit::TestCase
  5. extend DifferentOFS
  # Reserves a temporary ".csv" path for tests that need a real file.
  # The handle is closed right away; tests reopen the path themselves.
  def setup
    super
    require 'tempfile'
    @temp_csv_file = Tempfile.new(["test_csv.", ".csv"])
    @temp_csv_path = @temp_csv_file.path
    @temp_csv_file.close
  end
  # Closes and unlinks the temporary file created in #setup, then lets the
  # parent class finish its own teardown.
  def teardown
    @temp_csv_file.close!
    super
  end
  17. ########################################
  18. ### Hand Test Some Popular Encodings ###
  19. ########################################
  # Parses a small three-row table encoded as UTF-8.
  def test_parses_utf8_encoding
    rows = [
      %w[one two],
      %w[1 3],
      %w[5 6]
    ]
    assert_parses(rows, "UTF-8")
  end
  # Parses a table containing accented characters encoded as ISO-8859-1.
  def test_parses_latin1_encoding
    rows = [
      %w[one two Résumé],
      %w[1 Résumé 3],
      %w[Résumé 5 6]
    ]
    assert_parses(rows, "ISO-8859-1")
  end
  # Parses a small three-row table encoded as UTF-16BE.
  def test_parses_utf16be_encoding
    rows = [
      %w[one two],
      %w[1 3],
      %w[5 6]
    ]
    assert_parses(rows, "UTF-16BE")
  end
  # Parses a table of Japanese numerals encoded as Shift_JIS.
  #
  # The rows must contain at least one field: ary_to_data derives the target
  # encoding from the first field, so an all-empty fixture raises before any
  # parsing happens and exercises no Shift_JIS data at all.
  def test_parses_shift_jis_encoding
    assert_parses( [ %w[ 一 二 三 ],
                     %w[ 四 五 六 ],
                     %w[ 七 八 九 ] ], "Shift_JIS" )
  end
  40. ###########################################################
  41. ### Try Simple Reading for All Non-dummy Ruby Encodings ###
  42. ###########################################################
  # Runs the basic parse checks once for every non-dummy encoding Ruby knows.
  # A missing converter is reported with the name of the offending encoding.
  def test_reading_with_most_encodings
    rows = [%w[abc def], %w[ghi jkl]]
    each_encoding do |enc|
      begin
        assert_parses(rows, enc)
      rescue Encoding::ConverterNotFoundError
        fail("Failed to support #{enc.name}.")
      end
    end
  end
  # Same sweep over every non-dummy encoding, but with "|" as the column
  # separator.
  def test_regular_expression_escaping
    rows = [%w[abc def], %w[ghi jkl]]
    each_encoding do |enc|
      begin
        assert_parses(rows, enc, col_sep: "|")
      rescue Encoding::ConverterNotFoundError
        fail("Failed to properly escape #{enc.name}.")
      end
    end
  end
  # CSV.read without an explicit encoding should return fields tagged with
  # Encoding.default_external. The global default is swapped per encoding and
  # always restored, even if the read raises.
  def test_read_with_default_encoding
    content = "abc"
    original_external = Encoding.default_external
    each_encoding do |enc|
      File.open(@temp_csv_path, "wb", encoding: enc) { |io| io << content }
      begin
        no_warnings { Encoding.default_external = enc }
        first_field = CSV.read(@temp_csv_path)[0][0]
      ensure
        no_warnings { Encoding.default_external = original_external }
      end
      assert_equal(enc, first_field.encoding)
    end
  end
  81. #######################################################################
  82. ### Stress Test ASCII Compatible and Non-ASCII Compatible Encodings ###
  83. #######################################################################
  # Row separator auto-detection must find "\r\n" in the data's own encoding.
  def test_auto_line_ending_detection
    # arrange data to place a \r at the end of CSV's read ahead point
    long_row = [["a" * 509]]
    encode_for_tests(long_row, row_sep: "\r\n") do |data|
      expected_sep = "\r\n".encode(data.encoding)
      assert_equal(expected_sep, CSV.new(data).row_sep)
    end
  end
  # Each special character option (col_sep, row_sep, quote_char) should be
  # reported back in the encoding of the data being parsed.
  def test_csv_chars_are_transcoded
    encode_for_tests([%w[abc def]]) do |data|
      pipe = "|".encode(data.encoding)
      %w[col_sep row_sep quote_char].each do |csv_char|
        csv = CSV.new(data, csv_char.to_sym => "|")
        assert_equal(pipe, csv.send(csv_char))
      end
    end
  end
  # With headers: true, both the headers and the row fields should keep the
  # encoding of the input data.
  def test_parser_works_with_encoded_headers
    rows = [%w[one two three], %w[1 2 3]]
    encode_for_tests(rows) do |data|
      table = CSV.parse(data, headers: true)
      assert_all?(table.headers, "Wrong data encoding.") { |header| header.encoding == data.encoding }
      table.each do |record|
        assert_all?(record.fields, "Wrong data encoding.") { |field| field.encoding == data.encoding }
      end
    end
  end
  # The :integer converter should convert numeric fields while leaving
  # non-matching fields in the data's original encoding.
  def test_built_in_converters_transcode_to_utf_8_then_convert
    rows = [%w[one two three], %w[1 2 3]]
    encode_for_tests(rows) do |data|
      words, numbers = CSV.parse(data, converters: :integer)
      assert_all?(words, "Wrong data encoding.") { |field| field.encoding == data.encoding }
      assert_equal([1, 2, 3], numbers)
    end
  end
  # With the :downcase header converter, headers are expected to come back as
  # UTF-8 while the row fields keep the data's encoding.
  def test_built_in_header_converters_transcode_to_utf_8_then_convert
    rows = [%w[one two three], %w[1 2 3]]
    encode_for_tests(rows) do |data|
      table = CSV.parse(data, headers: true, header_converters: :downcase)
      assert_all?(table.headers, "Wrong data encoding.") { |header| header.encoding.name == "UTF-8" }
      first_record = table[0]
      assert_all?(first_record.fields, "Wrong data encoding.") { |field| field.encoding == data.encoding }
    end
  end
  # CSV.open should honour encodings given in the mode string, both for a
  # plain external encoding and for external:internal transcoding.
  def test_open_allows_you_to_set_encodings
    encode_for_tests([%w[abc def]]) do |data|
      enc_name = data.encoding.name

      # read and write in encoding
      File.open(@temp_csv_path, "wb:#{enc_name}") { |io| io << data }
      CSV.open(@temp_csv_path, "rb:#{enc_name}") do |csv|
        csv.each do |row|
          assert_all?(row, "Wrong data encoding.") { |field| field.encoding == data.encoding }
        end
      end

      # read and write with transcoding
      File.open(@temp_csv_path, "wb:UTF-32BE:#{enc_name}") { |io| io << data }
      CSV.open(@temp_csv_path, "rb:UTF-32BE:#{enc_name}") do |csv|
        csv.each do |row|
          assert_all?(row, "Wrong data encoding.") { |field| field.encoding == data.encoding }
        end
      end
    end
  end
  # CSV.foreach should honour the :encoding option, both for a plain external
  # encoding and for an "external:internal" transcoding pair.
  def test_foreach_allows_you_to_set_encodings
    encode_for_tests([%w[abc def]]) do |data|
      # read and write in encoding
      File.open(@temp_csv_path, "wb", encoding: data.encoding) { |f| f << data }
      CSV.foreach(@temp_csv_path, encoding: data.encoding) do |row|
        # Same assertion form as the transcoding check below, so a failure is
        # reported consistently and the expected encoding is never mislabeled
        # as the actual one.
        assert_all?(row, "Wrong data encoding.") {|f| f.encoding == data.encoding}
      end
      # read and write with transcoding
      File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f|
        f << data
      end
      CSV.foreach( @temp_csv_path,
                   encoding: "UTF-32BE:#{data.encoding.name}" ) do |row|
        assert_all?(row, "Wrong data encoding.") {|f| f.encoding == data.encoding}
      end
    end
  end
  # CSV.read should honour the :encoding option, both for a plain external
  # encoding and for an "external:internal" transcoding pair.
  def test_read_allows_you_to_set_encodings
    encode_for_tests([%w[abc def]]) do |data|
      enc_name = data.encoding.name

      # read and write in encoding
      File.open(@temp_csv_path, "wb:#{enc_name}") { |io| io << data }
      plain_rows = CSV.read(@temp_csv_path, encoding: enc_name)
      assert_all?(plain_rows.flatten, "Wrong data encoding.") { |field| field.encoding == data.encoding }

      # read and write with transcoding
      File.open(@temp_csv_path, "wb:UTF-32BE:#{enc_name}") { |io| io << data }
      transcoded_rows = CSV.read(@temp_csv_path, encoding: "UTF-32BE:#{enc_name}")
      assert_all?(transcoded_rows.flatten, "Wrong data encoding.") { |field| field.encoding == data.encoding }
    end
  end
  174. #################################
  175. ### Write CSV in any Encoding ###
  176. #################################
  # For every non-dummy encoding: generate a line with an explicit encoding
  # hint, generate a line relying on the fields' own encoding, then write rows
  # to a file and read them back. Encodings without a converter are skipped.
  def test_can_write_csv_in_any_encoding
    each_encoding do |enc|
      # test generate_line with encoding hint
      begin
        hinted_line = %w[abc d|ef].map { |field| field.encode(enc) }.
                      to_csv(col_sep: "|", encoding: enc.name)
      rescue Encoding::ConverterNotFoundError
        next
      end
      assert_equal(enc, hinted_line.encoding)

      # test generate_line with encoding guessing from fields
      guessed_line = %w[abc d|ef].map { |field| field.encode(enc) }.to_csv(col_sep: "|")
      assert_equal(enc, guessed_line.encoding)

      # writing to files
      rows = encode_ary([%w[abc d,ef], %w[123 456 ]], enc)
      CSV.open(@temp_csv_path, "wb:#{enc.name}") do |csv|
        rows.each { |row| csv << row }
      end
      assert_equal(rows, CSV.read(@temp_csv_path, encoding: enc.name))
    end
  end
  # A US-ASCII field next to a UTF-8 field should produce UTF-8 output, the
  # same result plain string joining gives.
  def test_encoding_is_upgraded_during_writing_as_needed
    ascii_field = "foo".force_encoding("US-ASCII")
    utf8_field = "\u3042"
    data = [ascii_field, utf8_field]
    assert_equal("US-ASCII", data.first.encoding.name)
    assert_equal("UTF-8", data.last.encoding.name)
    # The explicit '' separator keeps the join independent of the global $,
    assert_equal("UTF-8", data.join('').encoding.name)
    assert_equal("UTF-8", data.to_csv.encoding.name)
  end
  # An ISO-8859-1 tagged field holding only ASCII bytes, next to a UTF-8
  # field, should still produce UTF-8 output.
  def test_encoding_is_upgraded_for_ascii_content_during_writing_as_needed
    latin1_field = "foo".force_encoding("ISO-8859-1")
    utf8_field = "\u3042"
    data = [latin1_field, utf8_field]
    assert_equal("ISO-8859-1", data.first.encoding.name)
    assert_equal("UTF-8", data.last.encoding.name)
    # The explicit '' separator keeps the join independent of the global $,
    assert_equal("UTF-8", data.join('').encoding.name)
    assert_equal("UTF-8", data.to_csv.encoding.name)
  end
  # CSV.generate with an explicit :encoding should emit output in that
  # encoding even when the fields arrive in other encodings.
  def test_explicit_encoding
    bug9766 = '[ruby-core:62113] [Bug #9766]'
    generated = CSV.generate(encoding: "Windows-31J") do |csv|
      csv << ["foo".force_encoding("ISO-8859-1"), "\u3042"]
    end
    expected_text = "foo,\u3042\n".encode(Encoding::Windows_31J)
    assert_equal([expected_text, Encoding::Windows_31J],
                 [generated, generated.encoding],
                 bug9766)
  end
  # Row separator detection should still report "\r\n" when the data contains
  # a byte that is invalid in UTF-8.
  def test_row_separator_detection_with_invalid_encoding
    broken_input = "invalid,\xF8\r\nvalid,x\r\n".force_encoding("UTF-8")
    csv = CSV.new(broken_input, encoding: "UTF-8")
    assert_equal("\r\n", csv.row_sep)
  end
  # Reading past a valid first row into a row with an invalid UTF-8 byte
  # should raise MalformedCSVError naming line 2.
  def test_invalid_encoding_row_error
    broken_input = "valid,x\rinvalid,\xF8\r".force_encoding("UTF-8")
    csv = CSV.new(broken_input, encoding: "UTF-8", row_sep: "\r")
    error = assert_raise(CSV::MalformedCSVError) do
      2.times { csv.shift }
    end
    assert_equal("Invalid byte sequence in UTF-8 in line 2.", error.message)
  end
  234. private
  # Encodes +fields+ into +encoding+, serialises them to CSV text and checks
  # that every read path returns the expected rows.
  #
  # fields   - Array of rows (Arrays of Strings) in the source encoding.
  # encoding - an Encoding, or a name resolvable by Encoding.find.
  # options  - keyword options forwarded to ary_to_data, CSV.parse and
  #            CSV.open (for example col_sep:).
  #
  # Paths checked, in order:
  # 1. CSV.parse on the in-memory data (fields must keep +encoding+).
  # 2. CSV.open with an "rb:ENC" mode string.
  # 3. CSV.open with "rb:ENC:SOURCE", expecting the untouched source rows.
  # 4. CSV.open with the :encoding option.
  # 5. CSV.open with :external_encoding / :internal_encoding options.
  #
  # Steps 3 and 5 are skipped when +encoding+ already is the source encoding,
  # and a missing converter there is tolerated.
  def assert_parses(fields, encoding, **options)
    encoding = Encoding.find(encoding) unless encoding.is_a? Encoding
    source_rows = fields
    encoded_rows = encode_ary(source_rows, encoding)
    data = ary_to_data(encoded_rows, **options)
    transcodes = encoding != __ENCODING__

    # 1. in-memory parse
    parsed = CSV.parse(data, **options)
    assert_equal(encoded_rows, parsed)
    parsed.flatten.each_with_index do |field, i|
      assert_equal(encoding, field.encoding, "Field[#{i + 1}] was transcoded.")
    end

    File.open(@temp_csv_path, "wb") { |io| io.print(data) }

    # 2. file read, encoding taken from the mode string
    CSV.open(@temp_csv_path, "rb:#{encoding}", **options) do |csv|
      csv.each_with_index do |row, index|
        assert_equal(encoded_rows[index], row)
      end
    end

    # 3. file read, transcoding back to the source encoding
    if transcodes
      begin
        CSV.open(@temp_csv_path,
                 "rb:#{encoding}:#{__ENCODING__}",
                 **options) do |csv|
          csv.each_with_index do |row, index|
            assert_equal(source_rows[index], row)
          end
        end
      rescue Encoding::ConverterNotFoundError
        # no converter between the two encodings; nothing to verify
      end
    end

    # 4. file read, encoding passed as an option
    options[:encoding] = encoding.name
    CSV.open(@temp_csv_path, **options) do |csv|
      csv.each_with_index do |row, index|
        assert_equal(encoded_rows[index], row)
      end
    end

    # 5. file read, external/internal encodings passed as options
    options.delete(:encoding)
    options[:external_encoding] = encoding.name
    options[:internal_encoding] = __ENCODING__.name
    if transcodes
      begin
        CSV.open(@temp_csv_path, **options) do |csv|
          csv.each_with_index do |row, index|
            assert_equal(source_rows[index], row)
          end
        end
      rescue Encoding::ConverterNotFoundError
        # no converter between the two encodings; nothing to verify
      end
    end
  end
  # Returns a copy of +ary+ (an Array of rows) in which every field has been
  # transcoded to +encoding+. The input is left untouched.
  def encode_ary(ary, encoding)
    ary.map do |row|
      row.map do |field|
        field.encode(encoding)
      end
    end
  end
  # Serialises +ary+ (an Array of rows) into CSV text with every field quoted.
  #
  # The output encoding is taken from the first field. The :quote_char,
  # :col_sep and :row_sep options default to '"', "," and "\n" and are
  # transcoded to that encoding before use.
  def ary_to_data(ary, **options)
    target = ary.flatten.first.encoding
    quote = (options[:quote_char] || '"').encode(target)
    separator = (options[:col_sep] || ",").encode(target)
    terminator = (options[:row_sep] || "\n").encode(target)

    lines = ary.map do |row|
      quoted = row.map { |field| quote + field.encode(target) + quote }
      quoted.join(separator) + terminator
    end
    # An explicit '' separator keeps the join independent of the global $,
    lines.join('').encode(target)
  end
  # Yields +data+ serialised as CSV text twice: first encoded as UTF-8, then
  # as UTF-16BE. +options+ are forwarded to ary_to_data.
  def encode_for_tests(data, **options)
    utf8_text = ary_to_data(encode_ary(data, "UTF-8"), **options)
    yield utf8_text
    utf16_text = ary_to_data(encode_ary(data, "UTF-16BE"), **options)
    yield utf16_text
  end
  # Yields every encoding in Encoding.list except the "dummy" ones.
  def each_encoding
    Encoding.list.each do |candidate|
      yield candidate unless candidate.dummy?
    end
  end
  # Runs the block with $VERBOSE set to nil and restores the previous value
  # afterwards, even if the block raises. Returns the block's value.
  def no_warnings
    saved_verbose = $VERBOSE
    $VERBOSE = nil
    yield
  ensure
    $VERBOSE = saved_verbose
  end
  309. end