PageRenderTime 88ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/src/mochiutf8.erl

http://github.com/basho/mochiweb
Erlang | 335 lines | 247 code | 28 blank | 60 comment | 5 complexity | e7e667b42ec3a02817dad6918b497a79 MD5 | raw file
Possible License(s): MIT
  1. %% @copyright 2010 Mochi Media, Inc.
  2. %% @author Bob Ippolito <bob@mochimedia.com>
  3. %%
  4. %% Permission is hereby granted, free of charge, to any person obtaining a
  5. %% copy of this software and associated documentation files (the "Software"),
  6. %% to deal in the Software without restriction, including without limitation
  7. %% the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. %% and/or sell copies of the Software, and to permit persons to whom the
  9. %% Software is furnished to do so, subject to the following conditions:
  10. %%
  11. %% The above copyright notice and this permission notice shall be included in
  12. %% all copies or substantial portions of the Software.
  13. %%
  14. %% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. %% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. %% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  17. %% THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. %% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. %% FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  20. %% DEALINGS IN THE SOFTWARE.
  21. %% @doc Algorithm to convert any binary to a valid UTF-8 sequence by ignoring
  22. %% invalid bytes.
  23. -module(mochiutf8).
  24. -export([valid_utf8_bytes/1, codepoint_to_bytes/1, codepoints_to_bytes/1]).
  25. -export([bytes_to_codepoints/1, bytes_foldl/3, codepoint_foldl/3]).
  26. -export([read_codepoint/1, len/1]).
  27. %% External API
  28. -type unichar_low() :: 0..16#d7ff.
  29. -type unichar_high() :: 16#e000..16#10ffff.
  30. -type unichar() :: unichar_low() | unichar_high().
  31. -spec codepoint_to_bytes(unichar()) -> binary().
  32. %% @doc Convert a unicode codepoint to UTF-8 bytes.
  33. codepoint_to_bytes(C) when (C >= 16#00 andalso C =< 16#7f) ->
  34. %% U+0000 - U+007F - 7 bits
  35. <<C>>;
  36. codepoint_to_bytes(C) when (C >= 16#080 andalso C =< 16#07FF) ->
  37. %% U+0080 - U+07FF - 11 bits
  38. <<0:5, B1:5, B0:6>> = <<C:16>>,
  39. <<2#110:3, B1:5,
  40. 2#10:2, B0:6>>;
  41. codepoint_to_bytes(C) when (C >= 16#0800 andalso C =< 16#FFFF) andalso
  42. (C < 16#D800 orelse C > 16#DFFF) ->
  43. %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points)
  44. <<B2:4, B1:6, B0:6>> = <<C:16>>,
  45. <<2#1110:4, B2:4,
  46. 2#10:2, B1:6,
  47. 2#10:2, B0:6>>;
  48. codepoint_to_bytes(C) when (C >= 16#010000 andalso C =< 16#10FFFF) ->
  49. %% U+10000 - U+10FFFF - 21 bits
  50. <<0:3, B3:3, B2:6, B1:6, B0:6>> = <<C:24>>,
  51. <<2#11110:5, B3:3,
  52. 2#10:2, B2:6,
  53. 2#10:2, B1:6,
  54. 2#10:2, B0:6>>.
  55. -spec codepoints_to_bytes([unichar()]) -> binary().
  56. %% @doc Convert a list of codepoints to a UTF-8 binary.
  57. codepoints_to_bytes(L) ->
  58. <<<<(codepoint_to_bytes(C))/binary>> || C <- L>>.
  59. -spec read_codepoint(binary()) -> {unichar(), binary(), binary()}.
  60. read_codepoint(Bin = <<2#0:1, C:7, Rest/binary>>) ->
  61. %% U+0000 - U+007F - 7 bits
  62. <<B:1/binary, _/binary>> = Bin,
  63. {C, B, Rest};
  64. read_codepoint(Bin = <<2#110:3, B1:5,
  65. 2#10:2, B0:6,
  66. Rest/binary>>) ->
  67. %% U+0080 - U+07FF - 11 bits
  68. case <<B1:5, B0:6>> of
  69. <<C:11>> when C >= 16#80 ->
  70. <<B:2/binary, _/binary>> = Bin,
  71. {C, B, Rest}
  72. end;
  73. read_codepoint(Bin = <<2#1110:4, B2:4,
  74. 2#10:2, B1:6,
  75. 2#10:2, B0:6,
  76. Rest/binary>>) ->
  77. %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points)
  78. case <<B2:4, B1:6, B0:6>> of
  79. <<C:16>> when (C >= 16#0800 andalso C =< 16#FFFF) andalso
  80. (C < 16#D800 orelse C > 16#DFFF) ->
  81. <<B:3/binary, _/binary>> = Bin,
  82. {C, B, Rest}
  83. end;
  84. read_codepoint(Bin = <<2#11110:5, B3:3,
  85. 2#10:2, B2:6,
  86. 2#10:2, B1:6,
  87. 2#10:2, B0:6,
  88. Rest/binary>>) ->
  89. %% U+10000 - U+10FFFF - 21 bits
  90. case <<B3:3, B2:6, B1:6, B0:6>> of
  91. <<C:21>> when (C >= 16#010000 andalso C =< 16#10FFFF) ->
  92. <<B:4/binary, _/binary>> = Bin,
  93. {C, B, Rest}
  94. end.
  95. -spec codepoint_foldl(fun((unichar(), _) -> _), _, binary()) -> _.
  96. codepoint_foldl(F, Acc, <<>>) when is_function(F, 2) ->
  97. Acc;
  98. codepoint_foldl(F, Acc, Bin) ->
  99. {C, _, Rest} = read_codepoint(Bin),
  100. codepoint_foldl(F, F(C, Acc), Rest).
  101. -spec bytes_foldl(fun((binary(), _) -> _), _, binary()) -> _.
  102. bytes_foldl(F, Acc, <<>>) when is_function(F, 2) ->
  103. Acc;
  104. bytes_foldl(F, Acc, Bin) ->
  105. {_, B, Rest} = read_codepoint(Bin),
  106. bytes_foldl(F, F(B, Acc), Rest).
  107. -spec bytes_to_codepoints(binary()) -> [unichar()].
  108. bytes_to_codepoints(B) ->
  109. lists:reverse(codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], B)).
  110. -spec len(binary()) -> non_neg_integer().
  111. len(<<>>) ->
  112. 0;
  113. len(B) ->
  114. {_, _, Rest} = read_codepoint(B),
  115. 1 + len(Rest).
  116. -spec valid_utf8_bytes(B::binary()) -> binary().
  117. %% @doc Return only the bytes in B that represent valid UTF-8. Uses
  118. %% the following recursive algorithm: skip one byte if B does not
  119. %% follow UTF-8 syntax (a 1-4 byte encoding of some number),
  120. %% skip sequence of 2-4 bytes if it represents an overlong encoding
  121. %% or bad code point (surrogate U+D800 - U+DFFF or > U+10FFFF).
  122. valid_utf8_bytes(B) when is_binary(B) ->
  123. binary_skip_bytes(B, invalid_utf8_indexes(B)).
  124. %% Internal API
  125. -spec binary_skip_bytes(binary(), [non_neg_integer()]) -> binary().
  126. %% @doc Return B, but skipping the 0-based indexes in L.
  127. binary_skip_bytes(B, []) ->
  128. B;
  129. binary_skip_bytes(B, L) ->
  130. binary_skip_bytes(B, L, 0, []).
  131. %% @private
  132. -spec binary_skip_bytes(binary(), [non_neg_integer()], non_neg_integer(), iolist()) -> binary().
  133. binary_skip_bytes(B, [], _N, Acc) ->
  134. iolist_to_binary(lists:reverse([B | Acc]));
  135. binary_skip_bytes(<<_, RestB/binary>>, [N | RestL], N, Acc) ->
  136. binary_skip_bytes(RestB, RestL, 1 + N, Acc);
  137. binary_skip_bytes(<<C, RestB/binary>>, L, N, Acc) ->
  138. binary_skip_bytes(RestB, L, 1 + N, [C | Acc]).
  139. -spec invalid_utf8_indexes(binary()) -> [non_neg_integer()].
  140. %% @doc Return the 0-based indexes in B that are not valid UTF-8.
  141. invalid_utf8_indexes(B) ->
  142. invalid_utf8_indexes(B, 0, []).
  143. %% @private.
  144. -spec invalid_utf8_indexes(binary(), non_neg_integer(), [non_neg_integer()]) -> [non_neg_integer()].
  145. invalid_utf8_indexes(<<C, Rest/binary>>, N, Acc) when C < 16#80 ->
  146. %% U+0000 - U+007F - 7 bits
  147. invalid_utf8_indexes(Rest, 1 + N, Acc);
  148. invalid_utf8_indexes(<<C1, C2, Rest/binary>>, N, Acc)
  149. when C1 band 16#E0 =:= 16#C0,
  150. C2 band 16#C0 =:= 16#80 ->
  151. %% U+0080 - U+07FF - 11 bits
  152. case ((C1 band 16#1F) bsl 6) bor (C2 band 16#3F) of
  153. C when C < 16#80 ->
  154. %% Overlong encoding.
  155. invalid_utf8_indexes(Rest, 2 + N, [1 + N, N | Acc]);
  156. _ ->
  157. %% Upper bound U+07FF does not need to be checked
  158. invalid_utf8_indexes(Rest, 2 + N, Acc)
  159. end;
  160. invalid_utf8_indexes(<<C1, C2, C3, Rest/binary>>, N, Acc)
  161. when C1 band 16#F0 =:= 16#E0,
  162. C2 band 16#C0 =:= 16#80,
  163. C3 band 16#C0 =:= 16#80 ->
  164. %% U+0800 - U+FFFF - 16 bits
  165. case ((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
  166. (C3 band 16#3F) of
  167. C when (C < 16#800) orelse (C >= 16#D800 andalso C =< 16#DFFF) ->
  168. %% Overlong encoding or surrogate.
  169. invalid_utf8_indexes(Rest, 3 + N, [2 + N, 1 + N, N | Acc]);
  170. _ ->
  171. %% Upper bound U+FFFF does not need to be checked
  172. invalid_utf8_indexes(Rest, 3 + N, Acc)
  173. end;
  174. invalid_utf8_indexes(<<C1, C2, C3, C4, Rest/binary>>, N, Acc)
  175. when C1 band 16#F8 =:= 16#F0,
  176. C2 band 16#C0 =:= 16#80,
  177. C3 band 16#C0 =:= 16#80,
  178. C4 band 16#C0 =:= 16#80 ->
  179. %% U+10000 - U+10FFFF - 21 bits
  180. case ((((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
  181. (C3 band 16#3F)) bsl 6) bor (C4 band 16#3F) of
  182. C when (C < 16#10000) orelse (C > 16#10FFFF) ->
  183. %% Overlong encoding or invalid code point.
  184. invalid_utf8_indexes(Rest, 4 + N, [3 + N, 2 + N, 1 + N, N | Acc]);
  185. _ ->
  186. invalid_utf8_indexes(Rest, 4 + N, Acc)
  187. end;
  188. invalid_utf8_indexes(<<_, Rest/binary>>, N, Acc) ->
  189. %% Invalid char
  190. invalid_utf8_indexes(Rest, 1 + N, [N | Acc]);
  191. invalid_utf8_indexes(<<>>, _N, Acc) ->
  192. lists:reverse(Acc).
  193. %%
  194. %% Tests
  195. %%
  196. -ifdef(TEST).
  197. -include_lib("eunit/include/eunit.hrl").
  198. binary_skip_bytes_test() ->
  199. ?assertEqual(<<"foo">>,
  200. binary_skip_bytes(<<"foo">>, [])),
  201. ?assertEqual(<<"foobar">>,
  202. binary_skip_bytes(<<"foo bar">>, [3])),
  203. ?assertEqual(<<"foo">>,
  204. binary_skip_bytes(<<"foo bar">>, [3, 4, 5, 6])),
  205. ?assertEqual(<<"oo bar">>,
  206. binary_skip_bytes(<<"foo bar">>, [0])),
  207. ok.
  208. invalid_utf8_indexes_test() ->
  209. ?assertEqual(
  210. [],
  211. invalid_utf8_indexes(<<"unicode snowman for you: ", 226, 152, 131>>)),
  212. ?assertEqual(
  213. [0],
  214. invalid_utf8_indexes(<<128>>)),
  215. ?assertEqual(
  216. [57,59,60,64,66,67],
  217. invalid_utf8_indexes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (",
  218. 167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)),
  219. ok.
  220. codepoint_to_bytes_test() ->
  221. %% U+0000 - U+007F - 7 bits
  222. %% U+0080 - U+07FF - 11 bits
  223. %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points)
  224. %% U+10000 - U+10FFFF - 21 bits
  225. ?assertEqual(
  226. <<"a">>,
  227. codepoint_to_bytes($a)),
  228. ?assertEqual(
  229. <<16#c2, 16#80>>,
  230. codepoint_to_bytes(16#80)),
  231. ?assertEqual(
  232. <<16#df, 16#bf>>,
  233. codepoint_to_bytes(16#07ff)),
  234. ?assertEqual(
  235. <<16#ef, 16#bf, 16#bf>>,
  236. codepoint_to_bytes(16#ffff)),
  237. ?assertEqual(
  238. <<16#f4, 16#8f, 16#bf, 16#bf>>,
  239. codepoint_to_bytes(16#10ffff)),
  240. ok.
  241. bytes_foldl_test() ->
  242. ?assertEqual(
  243. <<"abc">>,
  244. bytes_foldl(fun (B, Acc) -> <<Acc/binary, B/binary>> end, <<>>, <<"abc">>)),
  245. ?assertEqual(
  246. <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>,
  247. bytes_foldl(fun (B, Acc) -> <<Acc/binary, B/binary>> end, <<>>,
  248. <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)),
  249. ok.
  250. bytes_to_codepoints_test() ->
  251. ?assertEqual(
  252. "abc" ++ [16#2603, 16#4e2d, 16#85, 16#10ffff],
  253. bytes_to_codepoints(<<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)),
  254. ok.
  255. codepoint_foldl_test() ->
  256. ?assertEqual(
  257. "cba",
  258. codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], <<"abc">>)),
  259. ?assertEqual(
  260. [16#10ffff, 16#85, 16#4e2d, 16#2603 | "cba"],
  261. codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [],
  262. <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)),
  263. ok.
  264. len_test() ->
  265. ?assertEqual(
  266. 29,
  267. len(<<"unicode snowman for you: ", 226, 152, 131, 228, 184, 173, 194, 133, 244, 143, 191, 191>>)),
  268. ok.
  269. codepoints_to_bytes_test() ->
  270. ?assertEqual(
  271. iolist_to_binary(lists:map(fun codepoint_to_bytes/1, lists:seq(1, 1000))),
  272. codepoints_to_bytes(lists:seq(1, 1000))),
  273. ok.
  274. valid_utf8_bytes_test() ->
  275. ?assertEqual(
  276. <<"invalid U+11ffff: ">>,
  277. valid_utf8_bytes(<<"invalid U+11ffff: ", 244, 159, 191, 191>>)),
  278. ?assertEqual(
  279. <<"U+10ffff: ", 244, 143, 191, 191>>,
  280. valid_utf8_bytes(<<"U+10ffff: ", 244, 143, 191, 191>>)),
  281. ?assertEqual(
  282. <<"overlong 2-byte encoding (a): ">>,
  283. valid_utf8_bytes(<<"overlong 2-byte encoding (a): ", 2#11000001, 2#10100001>>)),
  284. ?assertEqual(
  285. <<"overlong 2-byte encoding (!): ">>,
  286. valid_utf8_bytes(<<"overlong 2-byte encoding (!): ", 2#11000000, 2#10100001>>)),
  287. ?assertEqual(
  288. <<"mu: ", 194, 181>>,
  289. valid_utf8_bytes(<<"mu: ", 194, 181>>)),
  290. ?assertEqual(
  291. <<"bad coding bytes: ">>,
  292. valid_utf8_bytes(<<"bad coding bytes: ", 2#10011111, 2#10111111, 2#11111111>>)),
  293. ?assertEqual(
  294. <<"low surrogate (unpaired): ">>,
  295. valid_utf8_bytes(<<"low surrogate (unpaired): ", 237, 176, 128>>)),
  296. ?assertEqual(
  297. <<"high surrogate (unpaired): ">>,
  298. valid_utf8_bytes(<<"high surrogate (unpaired): ", 237, 191, 191>>)),
  299. ?assertEqual(
  300. <<"unicode snowman for you: ", 226, 152, 131>>,
  301. valid_utf8_bytes(<<"unicode snowman for you: ", 226, 152, 131>>)),
  302. ?assertEqual(
  303. <<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (AISPW))">>,
  304. valid_utf8_bytes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (",
  305. 167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)),
  306. ok.
  307. -endif.