PageRenderTime 1215ms CodeModel.GetById 242ms app.highlight 751ms RepoModel.GetById 217ms app.codeStats 0ms

/src/mochiutf8.erl

http://github.com/basho/mochiweb
Erlang | 335 lines | 247 code | 28 blank | 60 comment | 5 complexity | e7e667b42ec3a02817dad6918b497a79 MD5 | raw file
  1%% @copyright 2010 Mochi Media, Inc.
  2%% @author Bob Ippolito <bob@mochimedia.com>
  3%%
  4%% Permission is hereby granted, free of charge, to any person obtaining a
  5%% copy of this software and associated documentation files (the "Software"),
  6%% to deal in the Software without restriction, including without limitation
  7%% the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8%% and/or sell copies of the Software, and to permit persons to whom the
  9%% Software is furnished to do so, subject to the following conditions:
 10%%
 11%% The above copyright notice and this permission notice shall be included in
 12%% all copies or substantial portions of the Software.
 13%%
 14%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 17%% THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 19%% FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 20%% DEALINGS IN THE SOFTWARE.
 21
 22%% @doc Algorithm to convert any binary to a valid UTF-8 sequence by ignoring
 23%%      invalid bytes.
 24
 25-module(mochiutf8).
 26-export([valid_utf8_bytes/1, codepoint_to_bytes/1, codepoints_to_bytes/1]).
 27-export([bytes_to_codepoints/1, bytes_foldl/3, codepoint_foldl/3]).
 28-export([read_codepoint/1, len/1]).
 29
 30%% External API
 31
 32-type unichar_low() :: 0..16#d7ff.
 33-type unichar_high() :: 16#e000..16#10ffff.
 34-type unichar() :: unichar_low() | unichar_high().
 35
 36-spec codepoint_to_bytes(unichar()) -> binary().
 37%% @doc Convert a unicode codepoint to UTF-8 bytes.
 38codepoint_to_bytes(C) when (C >= 16#00 andalso C =< 16#7f) ->
 39    %% U+0000 - U+007F - 7 bits
 40    <<C>>;
 41codepoint_to_bytes(C) when (C >= 16#080 andalso C =< 16#07FF) ->
 42    %% U+0080 - U+07FF - 11 bits
 43    <<0:5, B1:5, B0:6>> = <<C:16>>,
 44    <<2#110:3, B1:5,
 45      2#10:2, B0:6>>;
 46codepoint_to_bytes(C) when (C >= 16#0800 andalso C =< 16#FFFF) andalso
 47                           (C < 16#D800 orelse C > 16#DFFF) ->
 48    %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points)
 49    <<B2:4, B1:6, B0:6>> = <<C:16>>,
 50    <<2#1110:4, B2:4,
 51      2#10:2, B1:6,
 52      2#10:2, B0:6>>;
 53codepoint_to_bytes(C) when (C >= 16#010000 andalso C =< 16#10FFFF) ->
 54    %% U+10000 - U+10FFFF - 21 bits
 55    <<0:3, B3:3, B2:6, B1:6, B0:6>> = <<C:24>>,
 56    <<2#11110:5, B3:3,
 57      2#10:2, B2:6,
 58      2#10:2, B1:6,
 59      2#10:2, B0:6>>.
 60
 61-spec codepoints_to_bytes([unichar()]) -> binary().
 62%% @doc Convert a list of codepoints to a UTF-8 binary.
 63codepoints_to_bytes(L) ->
 64    <<<<(codepoint_to_bytes(C))/binary>> || C <- L>>.
 65
 66-spec read_codepoint(binary()) -> {unichar(), binary(), binary()}.
 67read_codepoint(Bin = <<2#0:1, C:7, Rest/binary>>) ->
 68    %% U+0000 - U+007F - 7 bits
 69    <<B:1/binary, _/binary>> = Bin,
 70    {C, B, Rest};
 71read_codepoint(Bin = <<2#110:3, B1:5,
 72                       2#10:2, B0:6,
 73                       Rest/binary>>) ->
 74    %% U+0080 - U+07FF - 11 bits
 75    case <<B1:5, B0:6>> of
 76        <<C:11>> when C >= 16#80 ->
 77            <<B:2/binary, _/binary>> = Bin,
 78            {C, B, Rest}
 79    end;
 80read_codepoint(Bin = <<2#1110:4, B2:4,
 81                       2#10:2, B1:6,
 82                       2#10:2, B0:6,
 83                       Rest/binary>>) ->
 84    %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points)
 85    case <<B2:4, B1:6, B0:6>> of
 86        <<C:16>> when (C >= 16#0800 andalso C =< 16#FFFF) andalso
 87                      (C < 16#D800 orelse C > 16#DFFF) ->
 88            <<B:3/binary, _/binary>> = Bin,
 89            {C, B, Rest}
 90    end;
 91read_codepoint(Bin = <<2#11110:5, B3:3,
 92                       2#10:2, B2:6,
 93                       2#10:2, B1:6,
 94                       2#10:2, B0:6,
 95                       Rest/binary>>) ->
 96    %% U+10000 - U+10FFFF - 21 bits
 97    case <<B3:3, B2:6, B1:6, B0:6>> of
 98        <<C:21>> when (C >= 16#010000 andalso C =< 16#10FFFF) ->
 99            <<B:4/binary, _/binary>> = Bin,
100            {C, B, Rest}
101    end.
102
103-spec codepoint_foldl(fun((unichar(), _) -> _), _, binary()) -> _.
104codepoint_foldl(F, Acc, <<>>) when is_function(F, 2) ->
105    Acc;
106codepoint_foldl(F, Acc, Bin) ->
107    {C, _, Rest} = read_codepoint(Bin),
108    codepoint_foldl(F, F(C, Acc), Rest).
109
110-spec bytes_foldl(fun((binary(), _) -> _), _, binary()) -> _.
111bytes_foldl(F, Acc, <<>>) when is_function(F, 2) ->
112    Acc;
113bytes_foldl(F, Acc, Bin) ->
114    {_, B, Rest} = read_codepoint(Bin),
115    bytes_foldl(F, F(B, Acc), Rest).
116
117-spec bytes_to_codepoints(binary()) -> [unichar()].
118bytes_to_codepoints(B) ->
119    lists:reverse(codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], B)).
120
121-spec len(binary()) -> non_neg_integer().
122len(<<>>) ->
123    0;
124len(B) ->
125    {_, _, Rest} = read_codepoint(B),
126    1 + len(Rest).
127
128-spec valid_utf8_bytes(B::binary()) -> binary().
129%% @doc Return only the bytes in B that represent valid UTF-8. Uses
130%%      the following recursive algorithm: skip one byte if B does not
131%%      follow UTF-8 syntax (a 1-4 byte encoding of some number),
132%%      skip sequence of 2-4 bytes if it represents an overlong encoding
133%%      or bad code point (surrogate U+D800 - U+DFFF or > U+10FFFF).
134valid_utf8_bytes(B) when is_binary(B) ->
135    binary_skip_bytes(B, invalid_utf8_indexes(B)).
136
137%% Internal API
138
139-spec binary_skip_bytes(binary(), [non_neg_integer()]) -> binary().
140%% @doc Return B, but skipping the 0-based indexes in L.
141binary_skip_bytes(B, []) ->
142    B;
143binary_skip_bytes(B, L) ->
144    binary_skip_bytes(B, L, 0, []).
145
146%% @private
147-spec binary_skip_bytes(binary(), [non_neg_integer()], non_neg_integer(), iolist()) -> binary().
148binary_skip_bytes(B, [], _N, Acc) ->
149    iolist_to_binary(lists:reverse([B | Acc]));
150binary_skip_bytes(<<_, RestB/binary>>, [N | RestL], N, Acc) ->
151    binary_skip_bytes(RestB, RestL, 1 + N, Acc);
152binary_skip_bytes(<<C, RestB/binary>>, L, N, Acc) ->
153    binary_skip_bytes(RestB, L, 1 + N, [C | Acc]).
154
155-spec invalid_utf8_indexes(binary()) -> [non_neg_integer()].
156%% @doc Return the 0-based indexes in B that are not valid UTF-8.
157invalid_utf8_indexes(B) ->
158    invalid_utf8_indexes(B, 0, []).
159
160%% @private.
161-spec invalid_utf8_indexes(binary(), non_neg_integer(), [non_neg_integer()]) -> [non_neg_integer()].
162invalid_utf8_indexes(<<C, Rest/binary>>, N, Acc) when C < 16#80 ->
163    %% U+0000 - U+007F - 7 bits
164    invalid_utf8_indexes(Rest, 1 + N, Acc);
165invalid_utf8_indexes(<<C1, C2, Rest/binary>>, N, Acc)
166  when C1 band 16#E0 =:= 16#C0,
167       C2 band 16#C0 =:= 16#80 ->
168    %% U+0080 - U+07FF - 11 bits
169    case ((C1 band 16#1F) bsl 6) bor (C2 band 16#3F) of
170	C when C < 16#80 ->
171            %% Overlong encoding.
172            invalid_utf8_indexes(Rest, 2 + N, [1 + N, N | Acc]);
173        _ ->
174            %% Upper bound U+07FF does not need to be checked
175            invalid_utf8_indexes(Rest, 2 + N, Acc)
176    end;
177invalid_utf8_indexes(<<C1, C2, C3, Rest/binary>>, N, Acc)
178  when C1 band 16#F0 =:= 16#E0,
179       C2 band 16#C0 =:= 16#80,
180       C3 band 16#C0 =:= 16#80 ->
181    %% U+0800 - U+FFFF - 16 bits
182    case ((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
183	(C3 band 16#3F) of
184	C when (C < 16#800) orelse (C >= 16#D800 andalso C =< 16#DFFF) ->
185	    %% Overlong encoding or surrogate.
186            invalid_utf8_indexes(Rest, 3 + N, [2 + N, 1 + N, N | Acc]);
187	_ ->
188            %% Upper bound U+FFFF does not need to be checked
189	    invalid_utf8_indexes(Rest, 3 + N, Acc)
190    end;
191invalid_utf8_indexes(<<C1, C2, C3, C4, Rest/binary>>, N, Acc)
192  when C1 band 16#F8 =:= 16#F0,
193       C2 band 16#C0 =:= 16#80,
194       C3 band 16#C0 =:= 16#80,
195       C4 band 16#C0 =:= 16#80 ->
196    %% U+10000 - U+10FFFF - 21 bits
197    case ((((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
198           (C3 band 16#3F)) bsl 6) bor (C4 band 16#3F) of
199	C when (C < 16#10000) orelse (C > 16#10FFFF) ->
200	    %% Overlong encoding or invalid code point.
201	    invalid_utf8_indexes(Rest, 4 + N, [3 + N, 2 + N, 1 + N, N | Acc]);
202	_ ->
203	    invalid_utf8_indexes(Rest, 4 + N, Acc)
204    end;
205invalid_utf8_indexes(<<_, Rest/binary>>, N, Acc) ->
206    %% Invalid char
207    invalid_utf8_indexes(Rest, 1 + N, [N | Acc]);
208invalid_utf8_indexes(<<>>, _N, Acc) ->
209    lists:reverse(Acc).
210
211%%
212%% Tests
213%%
214-ifdef(TEST).
215-include_lib("eunit/include/eunit.hrl").
216
217binary_skip_bytes_test() ->
218    ?assertEqual(<<"foo">>,
219                 binary_skip_bytes(<<"foo">>, [])),
220    ?assertEqual(<<"foobar">>,
221                 binary_skip_bytes(<<"foo bar">>, [3])),
222    ?assertEqual(<<"foo">>,
223                 binary_skip_bytes(<<"foo bar">>, [3, 4, 5, 6])),
224    ?assertEqual(<<"oo bar">>,
225                 binary_skip_bytes(<<"foo bar">>, [0])),
226    ok.
227
228invalid_utf8_indexes_test() ->
229    ?assertEqual(
230       [],
231       invalid_utf8_indexes(<<"unicode snowman for you: ", 226, 152, 131>>)),
232    ?assertEqual(
233       [0],
234       invalid_utf8_indexes(<<128>>)),
235    ?assertEqual(
236       [57,59,60,64,66,67],
237       invalid_utf8_indexes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (",
238                              167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)),
239    ok.
240
241codepoint_to_bytes_test() ->
242    %% U+0000 - U+007F - 7 bits
243    %% U+0080 - U+07FF - 11 bits
244    %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points)
245    %% U+10000 - U+10FFFF - 21 bits
246    ?assertEqual(
247       <<"a">>,
248       codepoint_to_bytes($a)),
249    ?assertEqual(
250       <<16#c2, 16#80>>,
251       codepoint_to_bytes(16#80)),
252    ?assertEqual(
253       <<16#df, 16#bf>>,
254       codepoint_to_bytes(16#07ff)),
255    ?assertEqual(
256       <<16#ef, 16#bf, 16#bf>>,
257       codepoint_to_bytes(16#ffff)),
258    ?assertEqual(
259       <<16#f4, 16#8f, 16#bf, 16#bf>>,
260       codepoint_to_bytes(16#10ffff)),
261    ok.
262
263bytes_foldl_test() ->
264    ?assertEqual(
265       <<"abc">>,
266       bytes_foldl(fun (B, Acc) -> <<Acc/binary, B/binary>> end, <<>>, <<"abc">>)),
267    ?assertEqual(
268       <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>,
269       bytes_foldl(fun (B, Acc) -> <<Acc/binary, B/binary>> end, <<>>,
270                   <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)),
271    ok.
272
273bytes_to_codepoints_test() ->
274    ?assertEqual(
275       "abc" ++ [16#2603, 16#4e2d, 16#85, 16#10ffff],
276       bytes_to_codepoints(<<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)),
277    ok.
278
279codepoint_foldl_test() ->
280    ?assertEqual(
281       "cba",
282       codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], <<"abc">>)),
283    ?assertEqual(
284       [16#10ffff, 16#85, 16#4e2d, 16#2603 | "cba"],
285       codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [],
286                       <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)),
287    ok.
288
289len_test() ->
290    ?assertEqual(
291       29,
292       len(<<"unicode snowman for you: ", 226, 152, 131, 228, 184, 173, 194, 133, 244, 143, 191, 191>>)),
293    ok.
294
295codepoints_to_bytes_test() ->
296    ?assertEqual(
297       iolist_to_binary(lists:map(fun codepoint_to_bytes/1, lists:seq(1, 1000))),
298       codepoints_to_bytes(lists:seq(1, 1000))),
299    ok.
300
301valid_utf8_bytes_test() ->
302    ?assertEqual(
303       <<"invalid U+11ffff: ">>,
304       valid_utf8_bytes(<<"invalid U+11ffff: ", 244, 159, 191, 191>>)),
305    ?assertEqual(
306       <<"U+10ffff: ", 244, 143, 191, 191>>,
307       valid_utf8_bytes(<<"U+10ffff: ", 244, 143, 191, 191>>)),
308    ?assertEqual(
309       <<"overlong 2-byte encoding (a): ">>,
310       valid_utf8_bytes(<<"overlong 2-byte encoding (a): ", 2#11000001, 2#10100001>>)),
311    ?assertEqual(
312       <<"overlong 2-byte encoding (!): ">>,
313       valid_utf8_bytes(<<"overlong 2-byte encoding (!): ", 2#11000000, 2#10100001>>)),
314    ?assertEqual(
315       <<"mu: ", 194, 181>>,
316       valid_utf8_bytes(<<"mu: ", 194, 181>>)),
317    ?assertEqual(
318       <<"bad coding bytes: ">>,
319       valid_utf8_bytes(<<"bad coding bytes: ", 2#10011111, 2#10111111, 2#11111111>>)),
320    ?assertEqual(
321       <<"low surrogate (unpaired): ">>,
322       valid_utf8_bytes(<<"low surrogate (unpaired): ", 237, 176, 128>>)),
323    ?assertEqual(
324       <<"high surrogate (unpaired): ">>,
325       valid_utf8_bytes(<<"high surrogate (unpaired): ", 237, 191, 191>>)),
326    ?assertEqual(
327       <<"unicode snowman for you: ", 226, 152, 131>>,
328       valid_utf8_bytes(<<"unicode snowman for you: ", 226, 152, 131>>)),
329    ?assertEqual(
330       <<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (AISPW))">>,
331       valid_utf8_bytes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (",
332                          167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)),
333    ok.
334
335-endif.