/src/support/z_string.erl
https://code.google.com/p/zotonic/ · Erlang · 856 lines · 675 code · 88 blank · 93 comment · 21 complexity · a31d95872c65a2562832bd04ff89e3c8 MD5 · raw file
- %% @author Marc Worrell <marc@worrell.nl>
- %% @copyright 2009-2010 Marc Worrell
- %% Date: 2009-04-26
- %% @doc String related functions
- %% @todo Make this UTF-8 safe
- %% @todo Check valid chars for filenames, allow chinese, japanese, etc?
- %% CJK Unified Ideographs Extension A: Range: 3400-4DBF
- %% CJK Unified Ideographs: Range: 4E00-9FAF
- %% Kangxi Radicals: Range 2F00-2FDF
- %% See also: http://www.utf8-chartable.de/
- %% Copyright 2009-2010 Marc Worrell
- %%
- %% Licensed under the Apache License, Version 2.0 (the "License");
- %% you may not use this file except in compliance with the License.
- %% You may obtain a copy of the License at
- %%
- %% http://www.apache.org/licenses/LICENSE-2.0
- %%
- %% Unless required by applicable law or agreed to in writing, software
- %% distributed under the License is distributed on an "AS IS" BASIS,
- %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- %% See the License for the specific language governing permissions and
- %% limitations under the License.
- -module(z_string).
- -author("Marc Worrell <marc@worrell.nl").
- %% interface functions
- -export([
- trim/1,
- trim_left/1,
- trim_right/1,
- trim/2,
- trim_left/2,
- trim_right/2,
- trim_left_func/2,
- is_string/1,
- first_char/1,
- last_char/1,
- unquote/1,
- unquote/2,
- nospaces/1,
- line/1,
- to_rootname/1,
- to_name/1,
- to_slug/1,
- to_lower/1,
- to_upper/1,
- replace/3,
- truncate/2,
- truncate/3,
- truncatewords/2,
- truncatewords/3,
- split_lines/1,
- escape_ical/1,
- starts_with/2,
- ends_with/2,
- contains/2,
- split/2,
- test/0
- ]).
- -include_lib("include/zotonic.hrl").
%% @doc Strip whitespace from both ends of a string. A binary is returned as a
%% binary; a list is handled as an iolist and returned as a flat list of bytes.
trim(Input) when is_binary(Input) ->
    LeftTrimmed = trim_left(Input),
    trim_right(LeftTrimmed);
trim(Input) when is_list(Input) ->
    Flat = iolist_to_binary(Input),
    binary_to_list(trim(Flat)).
%% @doc Strip every occurrence of Char from both ends of a string. A binary is
%% returned as a binary; a list is handled as an iolist and returned as a flat
%% list of bytes.
trim(Input, Char) when is_binary(Input) ->
    LeftTrimmed = trim_left(Input, Char),
    trim_right(LeftTrimmed, Char);
trim(Input, Char) when is_list(Input) ->
    Flat = iolist_to_binary(Input),
    binary_to_list(trim(Flat, Char)).
%% @doc Remove whitespace (every character with a value of 32 or lower) at the
%% start of the string.
trim_left(S) ->
    trim_left_func(S, fun(C) -> C =< 32 end).

%% @doc Remove all occurrences of a char at the start of a string.
trim_left(S, Char) ->
    trim_left_func(S, fun(C) -> C == Char end).

%% @doc Remove all leading characters for which the predicate F returns true.
%% Accepts a binary, a flat string or a nested list of strings/binaries; the
%% result has the same shape as the input. Anything else is returned as-is.
trim_left_func(<<Char, Rest/binary>> = Bin, F) ->
    case F(Char) of
        true -> trim_left_func(Rest, F);
        false -> Bin
    end;
trim_left_func([Char|Rest] = L, F) when is_integer(Char) ->
    case F(Char) of
        %% Keep recursing with the predicate itself. Going through
        %% trim_left/2 here would wrap F in a "C == F" comparison that is
        %% never true, so only a single character would be stripped.
        true -> trim_left_func(Rest, F);
        false -> L
    end;
trim_left_func([L|Rest], F) when is_list(L); is_binary(L) ->
    %% Nested element: trim inside it first; continue with the remaining
    %% elements only when the nested element was trimmed away completely.
    case trim_left_func(L, F) of
        [] -> trim_left_func(Rest, F);
        <<>> -> trim_left_func(Rest, F);
        Other -> [Other|Rest]
    end;
trim_left_func(Other, _F) ->
    Other.
-
-
%% @doc Strip trailing whitespace (every byte with a value of 32 or lower).
%% A binary is returned as a binary; any other input is handled as an iolist
%% and returned as a flat list of bytes.
trim_right(Input) when is_binary(Input) ->
    trim_right(Input, <<>>, <<>>);
trim_right(Input) ->
    Trimmed = trim_right(iolist_to_binary(Input)),
    binary_to_list(Trimmed).

%% Pending collects the whitespace run seen since the last kept byte. It is
%% only copied to Kept when another non-whitespace byte follows, so a
%% whitespace run at the very end is dropped.
trim_right(<<>>, _Pending, Kept) ->
    Kept;
trim_right(<<C, Rest/binary>>, Pending, Kept) when C =< 32 ->
    trim_right(Rest, <<Pending/binary, C>>, Kept);
trim_right(<<C, Rest/binary>>, Pending, Kept) ->
    trim_right(Rest, <<>>, <<Kept/binary, Pending/binary, C>>).
%% @doc Strip every occurrence of Char at the end of the string. A binary is
%% returned as a binary; any other input is handled as an iolist and returned
%% as a flat list of bytes.
trim_right(Input, Char) when is_binary(Input) ->
    trim_right(Input, Char, <<>>, <<>>);
trim_right(Input, Char) ->
    Trimmed = trim_right(iolist_to_binary(Input), Char),
    binary_to_list(Trimmed).

%% Pending collects the run of Char bytes seen since the last kept byte. It
%% is only copied to Kept when a different byte follows, so a run at the very
%% end is dropped.
trim_right(<<>>, _Char, _Pending, Kept) ->
    Kept;
trim_right(<<C, Rest/binary>>, Char, Pending, Kept) when C =:= Char ->
    trim_right(Rest, Char, <<Pending/binary, C>>, Kept);
trim_right(<<C, Rest/binary>>, Char, Pending, Kept) ->
    trim_right(Rest, Char, <<>>, <<Kept/binary, Pending/binary, C>>).
%% @doc Check if the argument is a flat list of printable byte values, which
%% makes it probably a string. Accepted are the integers 32..255 plus tab,
%% line feed, form feed and carriage return.
is_string([]) ->
    true;
is_string([C|Rest]) ->
    case is_string_char(C) of
        true -> is_string(Rest);
        false -> false
    end;
is_string(_) ->
    false.

%% True for a single integer that is acceptable inside a string.
is_string_char(C) when is_integer(C), C >= 32, C =< 255 -> true;
is_string_char(C) when C =:= 9; C =:= 10; C =:= 12; C =:= 13 -> true;
is_string_char(_) -> false.
%% @doc Return the first character of a string. For a list this is whatever
%% truncate/3 keeps when asked for a single character; for a binary it is the
%% first byte as an integer. Empty input gives undefined.
%% @todo Make this UTF-8 safe
first_char(<<>>) ->
    undefined;
first_char(<<Byte, _/binary>>) ->
    Byte;
first_char([]) ->
    undefined;
first_char([Char|_] = String) when is_integer(Char) ->
    truncate(String, 1, "").
%% @doc Return the last element of a list or the last byte of a binary.
%% Empty input gives undefined.
last_char(<<>>) ->
    undefined;
last_char(Bin) when is_binary(Bin) ->
    binary:last(Bin);
last_char([]) ->
    undefined;
last_char([_|_] = List) ->
    lists:last(List).
%% @doc Remove the first and last char if they are both double quotes.
unquote(S) ->
    unquote(S, $").

%% @doc Remove the first and last char if they are both equal to Q. The input
%% is returned unchanged when it is not enclosed in a pair of Q characters.
unquote(S, Q) when is_binary(S) ->
    case S of
        <<Q, Tail/binary>> -> unquote1(Tail, <<>>, Q, S);
        _ -> S
    end;
unquote(S, Q) ->
    case S of
        [Q|Tail] -> unquote1(Tail, [], Q, S);
        _ -> S
    end.

%% Walk the text after the opening quote. Seen holds what was passed so far;
%% Whole is the untouched input, returned when no closing quote ends the text.
unquote1(<<>>, _Seen, _Q, Whole) ->
    Whole;
unquote1(<<Q>>, Seen, Q, _Whole) ->
    Seen;
unquote1(<<Byte, Tail/binary>>, Seen, Q, Whole) ->
    unquote1(Tail, <<Seen/binary, Byte>>, Q, Whole);
unquote1([], _Seen, _Q, Whole) ->
    Whole;
unquote1([Q], Seen, Q, _Whole) ->
    lists:reverse(Seen);
unquote1([Char|Tail], Seen, Q, Whole) ->
    unquote1(Tail, [Char|Seen], Q, Whole).
%% @doc Remove all spaces and control characters (everything with a value of
%% 32 or lower) from a string. The result is always a list.
nospaces(B) when is_binary(B) ->
    nospaces(binary_to_list(B));
nospaces(L) ->
    [ C || C <- L, C > 32 ].
%% @doc Make sure that the string is on one line only: every control
%% character (value below 32) is replaced with a space. The result is always
%% a list.
line(B) when is_binary(B) ->
    line(binary_to_list(B));
line(L) ->
    lists:map(fun(C) when C < 32 -> 32;
                 (C) -> C
              end,
              L).
%% @doc Return a lowercase string for the input. The input is handled as a
%% list of UTF-8 encoded bytes; besides ASCII a fixed set of accented Latin,
%% Cyrillic, Ukrainian, Polish and Turkish capitals is mapped. All patterns
%% are written as explicit byte values so that the result does not depend on
%% the encoding the compiler assumes for this source file.
%% @spec to_lower(Value) -> String
to_lower(B) when is_binary(B) ->
    to_lower(binary_to_list(B));
to_lower(A) when is_atom(A) ->
    to_lower(atom_to_list(A));
to_lower(L) when is_list(L) ->
    to_lower(lists:flatten(L), []).

%% Acc is built in reverse, so a two byte character [B1,B2] is pushed as
%% [B2,B1|Acc].
to_lower([], Acc) -> lists:reverse(Acc);
to_lower([H|T], Acc) when H >= $A andalso H =< $Z -> to_lower(T, [H+32|Acc]);
% Accented Latin capitals U+00C0..U+00DC (lead byte 195); the lowercase form
% is 32 positions further. Supported: U+00C0 U+00C1 U+00C4..U+00CF
% U+00D2..U+00D4 U+00D6 U+00D8..U+00DC.
to_lower([195,C|T], Acc)
    when C >= 128, C =< 129;
         C >= 132, C =< 143;
         C >= 146, C =< 148;
         C =:= 150;
         C >= 152, C =< 156 ->
    to_lower(T, [C+32,195|Acc]);
% U+0152 (OE ligature) -> U+0153
to_lower([197,146|T], Acc) -> to_lower(T, [147,197|Acc]);
% Cyrillic support
% U+0410..U+041F -> U+0430..U+043F
to_lower([208,C|T], Acc) when C >= 144, C =< 159 -> to_lower(T, [C+32,208|Acc]);
% U+0420..U+042F -> U+0440..U+044F (the lead byte changes to 209)
to_lower([208,C|T], Acc) when C >= 160, C =< 175 -> to_lower(T, [C-32,209|Acc]);
% U+0401 (IO) -> U+0451
to_lower([208,129|T], Acc) -> to_lower(T, [145,209|Acc]);
% Extra Ukrainian characters: U+0490, U+0407, U+0406, U+0404
to_lower([210,144|T], Acc) -> to_lower(T, [145,210|Acc]);
to_lower([208,135|T], Acc) -> to_lower(T, [151,209|Acc]);
to_lower([208,134|T], Acc) -> to_lower(T, [150,209|Acc]);
to_lower([208,132|T], Acc) -> to_lower(T, [148,209|Acc]);
% Polish support: U+0104, U+0118, U+0106, U+0141, U+0143, U+015A, U+0179, U+017B
to_lower([196,132|T], Acc) -> to_lower(T, [133,196|Acc]);
to_lower([196,152|T], Acc) -> to_lower(T, [153,196|Acc]);
to_lower([196,134|T], Acc) -> to_lower(T, [135,196|Acc]);
to_lower([197,129|T], Acc) -> to_lower(T, [130,197|Acc]);
to_lower([197,131|T], Acc) -> to_lower(T, [132,197|Acc]);
to_lower([197,154|T], Acc) -> to_lower(T, [155,197|Acc]);
to_lower([197,185|T], Acc) -> to_lower(T, [186,197|Acc]);
to_lower([197,187|T], Acc) -> to_lower(T, [188,197|Acc]);
% Turkish support: U+015E, U+011E, U+0130 (mapped to the dotless i, U+0131)
to_lower([197,158|T], Acc) -> to_lower(T, [159,197|Acc]);
to_lower([196,158|T], Acc) -> to_lower(T, [159,196|Acc]);
to_lower([196,176|T], Acc) -> to_lower(T, [177,196|Acc]);
% Other characters are taken as-is
to_lower([H|T], Acc) -> to_lower(T, [H|Acc]).
%% @doc Return an uppercase string for the input. The input is handled as a
%% list of UTF-8 encoded bytes; besides ASCII a fixed set of accented Latin,
%% Cyrillic, Ukrainian, Polish and Turkish letters is mapped. All patterns are
%% written as explicit byte values so that the result does not depend on the
%% encoding the compiler assumes for this source file.
%% @spec to_upper(Value) -> String
to_upper(B) when is_binary(B) ->
    to_upper(binary_to_list(B));
to_upper(A) when is_atom(A) ->
    to_upper(atom_to_list(A));
to_upper(L) when is_list(L) ->
    to_upper(lists:flatten(L), []).

%% Acc is built in reverse, so a two byte character [B1,B2] is pushed as
%% [B2,B1|Acc].
to_upper([], Acc) -> lists:reverse(Acc);
to_upper([H|T], Acc) when H >= $a andalso H =< $z -> to_upper(T, [H-32|Acc]);
% Accented Latin letters U+00E0..U+00FC (lead byte 195); the uppercase form
% is 32 positions back. Supported: U+00E0 U+00E1 U+00E4..U+00EF
% U+00F2..U+00F4 U+00F6 U+00F8..U+00FC. This includes U+00EE, which the
% previous clause list missed because it matched the capital instead.
to_upper([195,C|T], Acc)
    when C >= 160, C =< 161;
         C >= 164, C =< 175;
         C >= 178, C =< 180;
         C =:= 182;
         C >= 184, C =< 188 ->
    to_upper(T, [C-32,195|Acc]);
% U+0153 (oe ligature) -> U+0152
to_upper([197,147|T], Acc) -> to_upper(T, [146,197|Acc]);
% Cyrillic support
% U+0430..U+043F -> U+0410..U+041F
to_upper([208,C|T], Acc) when C >= 176, C =< 191 -> to_upper(T, [C-32,208|Acc]);
% U+0440..U+044F -> U+0420..U+042F (the lead byte changes to 208)
to_upper([209,C|T], Acc) when C >= 128, C =< 143 -> to_upper(T, [C+32,208|Acc]);
% U+0451 (io) -> U+0401
to_upper([209,145|T], Acc) -> to_upper(T, [129,208|Acc]);
% Extra Ukrainian characters: U+0491, U+0457, U+0456, U+0454.
% U+0456 maps to U+0406, the exact inverse of to_lower/1; it used to produce
% U+040F, which is a different letter.
to_upper([210,145|T], Acc) -> to_upper(T, [144,210|Acc]);
to_upper([209,151|T], Acc) -> to_upper(T, [135,208|Acc]);
to_upper([209,150|T], Acc) -> to_upper(T, [134,208|Acc]);
to_upper([209,148|T], Acc) -> to_upper(T, [132,208|Acc]);
% Polish support: U+0105, U+0119, U+0107, U+0142, U+0144, U+015B, U+017A, U+017C
to_upper([196,133|T], Acc) -> to_upper(T, [132,196|Acc]);
to_upper([196,153|T], Acc) -> to_upper(T, [152,196|Acc]);
to_upper([196,135|T], Acc) -> to_upper(T, [134,196|Acc]);
to_upper([197,130|T], Acc) -> to_upper(T, [129,197|Acc]);
to_upper([197,132|T], Acc) -> to_upper(T, [131,197|Acc]);
to_upper([197,155|T], Acc) -> to_upper(T, [154,197|Acc]);
to_upper([197,186|T], Acc) -> to_upper(T, [185,197|Acc]);
to_upper([197,188|T], Acc) -> to_upper(T, [187,197|Acc]);
% Turkish support: U+015F, U+011F, U+0131 (mapped to the dotted I, U+0130)
to_upper([197,159|T], Acc) -> to_upper(T, [158,197|Acc]);
to_upper([196,159|T], Acc) -> to_upper(T, [158,196|Acc]);
to_upper([196,177|T], Acc) -> to_upper(T, [176,196|Acc]);
% Other chars are taken as-is
to_upper([H|T], Acc) -> to_upper(T, [H|Acc]).
%% @doc Filter a filename so that we obtain a basename that is safe to use:
%% the directory part and the extension are dropped and the remainder is
%% turned into a slug.
%% @spec to_rootname(string()) -> string()
to_rootname(Filename) ->
    Basename = filename:basename(Filename),
    Rootname = filename:rootname(Basename),
    to_slug(Rootname).
%% @doc Map a string to a slug that can be used in the uri of a page. This is
%% the result of to_name/1 with every underscore replaced by a dash.
%% @spec to_slug(String) -> String
to_slug(Title) ->
    lists:map(fun($_) -> $-;
                 (C) -> C
              end,
              to_name(Title)).
%% @doc Map a string to a value that can be used as a name or slug. ASCII
%% letters are lowercased, digits and underscores are kept, a fixed set of
%% accented Latin, Cyrillic, Ukrainian, Polish and Turkish letters is
%% transliterated to ASCII and every other run of characters becomes a single
%% underscore. Underscores at the start and end are stripped; an empty result
%% is returned as "_". Processing stops after about 80 characters.
%% A {trans, List} value uses the 'en' text, or else the first text in List.
%% The input is handled as a list of UTF-8 encoded bytes.
%% @spec to_name(String) -> String
to_name({trans, Tr}) ->
    case proplists:get_value(en, Tr) of
        undefined ->
            case Tr of
                [{_,V}|_] -> to_name(V);
                _ -> to_name([])
            end;
        V -> to_name(V)
    end;
to_name(Name) when is_binary(Name) ->
    to_name(binary_to_list(Name));
to_name(Name) when is_atom(Name) ->
    to_name(atom_to_list(Name));
to_name(Name) ->
    to_name(Name, [], 0).

%% to_name(Input, Acc, I): Acc is the reversed result so far, I counts the
%% characters handled (input characters, or output characters when a
%% transliteration is longer than one character).
to_name([], Acc, _I) ->
    case string:strip(lists:reverse(Acc), both, $_) of
        [] -> "_";
        Name -> Name
    end;
to_name(_, Acc, N) when N >= 80 ->
    to_name([], Acc, 80);
to_name([C|T], Acc, I) when C >= $A andalso C =< $Z ->
    to_name(T, [C+32|Acc], I+1);
to_name([C|T], Acc, I) when (C >= $a andalso C =< $z) orelse (C >= $0 andalso C =< $9) orelse C =:= $_ ->
    to_name(T, [C|Acc], I+1);
to_name([_|T] = S, Acc, I) ->
    case to_name_translit(S) of
        {Repl, Rest} ->
            %% Push the replacement so that it reads left to right once Acc
            %% is reversed.
            to_name(Rest, lists:reverse(Repl, Acc), I + length(Repl));
        none ->
            %% Other sequences of characters are mapped to a single $_
            case Acc of
                [$_|_] -> to_name(T, Acc, I+1);
                _ -> to_name(T, [$_|Acc], I+1)
            end
    end.

%% Return {Replacement, Rest} when the input starts with a character (given as
%% UTF-8 bytes) or an HTML entity that has a fixed ASCII replacement, and
%% 'none' otherwise. Byte values are used instead of string literals so that
%% the table does not depend on the encoding of this source file.
% Accented Latin letters (lead byte 195)
to_name_translit([195,C|T]) when C =:= 164; C =:= 132; C =:= 165; C =:= 133 ->
    {"a", T};   % U+00E4 U+00C4 U+00E5 U+00C5
to_name_translit([195,C|T]) when C =:= 171; C =:= 139; C =:= 169; C =:= 168; C =:= 137; C =:= 136 ->
    {"e", T};   % U+00EB U+00CB U+00E9 U+00E8 U+00C9 U+00C8
to_name_translit([195,C|T]) when C =:= 175; C =:= 143; C =:= 173; C =:= 172; C =:= 141; C =:= 140 ->
    {"i", T};   % U+00EF U+00CF U+00ED U+00EC U+00CD U+00CC
to_name_translit([195,C|T]) when C =:= 188; C =:= 156; C =:= 186; C =:= 185; C =:= 154; C =:= 153 ->
    {"u", T};   % U+00FC U+00DC U+00FA U+00F9 U+00DA U+00D9
to_name_translit([195,C|T]) when C =:= 182; C =:= 150; C =:= 179; C =:= 178; C =:= 147; C =:= 146; C =:= 184; C =:= 152 ->
    {"o", T};   % U+00F6 U+00D6 U+00F3 U+00F2 U+00D3 U+00D2 U+00F8 U+00D8
to_name_translit([195,C|T]) when C =:= 167; C =:= 135 ->
    {"c", T};   % U+00E7 U+00C7
to_name_translit([195,159|T]) -> {"ss", T};     % U+00DF sharp s
to_name_translit([195,191|T]) -> {"ij", T};     % U+00FF y with diaeresis
to_name_translit([226,130,172|T]) -> {"e", T};  % U+20AC euro sign
to_name_translit([$@|T]) -> {"_at_", T};
% Cyrillic support (from http://en.wikipedia.org/wiki/Romanization_of_Russian)
to_name_translit([208,C|T]) when C >= 144, C =< 175 ->
    {to_name_cyrillic(C - 144), T};  % U+0410..U+042F, capitals
to_name_translit([208,C|T]) when C >= 176, C =< 191 ->
    {to_name_cyrillic(C - 176), T};  % U+0430..U+043F
to_name_translit([209,C|T]) when C >= 128, C =< 143 ->
    {to_name_cyrillic(C - 112), T};  % U+0440..U+044F
to_name_translit([208,129|T]) -> {"yo", T};     % U+0401
to_name_translit([209,145|T]) -> {"yo", T};     % U+0451
% Ukrainian support
to_name_translit([210,C|T]) when C =:= 144; C =:= 145 ->
    {"g", T};   % U+0490 U+0491
to_name_translit([208,C|T]) when C =:= 134; C =:= 135 ->
    {"i", T};   % U+0406 U+0407
to_name_translit([209,C|T]) when C =:= 150; C =:= 151 ->
    {"i", T};   % U+0456 U+0457
to_name_translit([208,132|T]) -> {"ye", T};     % U+0404
to_name_translit([209,148|T]) -> {"ye", T};     % U+0454
% Polish support
to_name_translit([196,C|T]) when C =:= 132; C =:= 133 ->
    {"a", T};   % U+0104 U+0105
to_name_translit([196,C|T]) when C =:= 152; C =:= 153 ->
    {"e", T};   % U+0118 U+0119
to_name_translit([196,C|T]) when C =:= 134; C =:= 135 ->
    {"c", T};   % U+0106 U+0107
to_name_translit([197,C|T]) when C =:= 129; C =:= 130 ->
    {"l", T};   % U+0141 U+0142
to_name_translit([197,C|T]) when C =:= 131; C =:= 132 ->
    {"n", T};   % U+0143 U+0144
to_name_translit([197,C|T]) when C =:= 154; C =:= 155 ->
    {"s", T};   % U+015A U+015B
to_name_translit([197,C|T]) when C >= 185, C =< 188 ->
    {"z", T};   % U+0179..U+017C
% Turkish support
to_name_translit([197,C|T]) when C =:= 158; C =:= 159 ->
    {"s", T};   % U+015E U+015F
to_name_translit([196,C|T]) when C =:= 158; C =:= 159 ->
    {"g", T};   % U+011E U+011F
to_name_translit([196,C|T]) when C =:= 176; C =:= 177 ->
    {"i", T};   % U+0130 U+0131
% Some entities - we might want to add generic code here, depends
% on where to_name/1 is used (can we assume that the input is always html?)
to_name_translit("&amp;"++T) -> {"_", T};
to_name_translit("&lt;"++T) -> {"_", T};
to_name_translit("&gt;"++T) -> {"_", T};
to_name_translit("&#39;"++T) -> {"_", T};
to_name_translit(_) -> none.

%% Romanization of the 32 letters U+0430..U+044F, indexed from 0. The hard
%% and soft sign (26 and 28) become an underscore.
to_name_cyrillic(Index) ->
    element(Index + 1,
            {"a", "b", "v", "g", "d", "e", "zh", "z",
             "i", "j", "k", "l", "m", "n", "o", "p",
             "r", "s", "t", "u", "f", "h", "c", "ch",
             "sh", "shh", "_", "y", "_", "eh", "yu", "ya"}).
%% @doc Replace every occurrence of the string S1 inside String with S2.
%% Matching runs left to right and does not look inside inserted text. An
%% empty S1 never matches, so String is then returned unchanged.
%% Copyright 2008 Rusty Klophaus (Nitrogen, MIT License)
replace([], _, _) -> [];
replace(String, [], S2) when is_list(String), is_list(S2) ->
    %% Guard against the empty search string: it would match at every
    %% position without consuming input and never terminate.
    String;
replace(String, S1, S2) when is_list(String), is_list(S1), is_list(S2) ->
    replace_loop(String, S1, S2, []).

%% Acc holds the reversed output.
replace_loop([], _S1, _S2, Acc) ->
    lists:reverse(Acc);
replace_loop([C|Rest] = String, S1, S2, Acc) ->
    case replace_prefix(S1, String) of
        {ok, Tail} -> replace_loop(Tail, S1, S2, lists:reverse(S2, Acc));
        nomatch -> replace_loop(Rest, S1, S2, [C|Acc])
    end.

%% Return {ok, Tail} when the string starts with Prefix, Tail being the text
%% after the prefix; 'nomatch' otherwise.
replace_prefix([], Tail) -> {ok, Tail};
replace_prefix([C|Prefix], [C|Tail]) -> replace_prefix(Prefix, Tail);
replace_prefix(_, _) -> nomatch.
%% @doc Truncate a string to about N characters, preferably breaking at a
%% word or sentence boundary. The ellipsis character is appended at the place
%% of the break, except when the text is cut directly after a sentence.
%% @spec truncate(String, int()) -> String
truncate(L, N) ->
    truncate(L, N, "…").

%% @doc As truncate/2, with the text to append at the break given by the
%% caller. A binary is converted to a list first; the result is always a
%% list. A non-positive N gives the empty list. The input is handled as UTF-8
%% bytes: a multi-byte sequence and a character entity count as one character.
truncate(B, N, Append) when is_binary(B) ->
    truncate(z_convert:to_list(B), N, Append);
truncate(_L, N, _Append) when N =< 0 ->
    [];
truncate(L, N, Append) ->
    truncate(L, N, Append, in_word, [], in_word, []).

%% truncate(Input, N, Append, LastState, Last, AccState, Acc)
%%   N         number of characters that may still be consumed
%%   Acc       reversed text consumed so far
%%   AccState  what the most recent character was: in_word | word | sentence
%%   Last      reversed text up to the most recent break point
%%   LastState kind of that break point; after 'sentence' nothing is appended
truncate([], _, _Append, _LastState, _Last, _AccState, Acc) ->
    %% The whole input fits, return it unchanged.
    lists:reverse(Acc);
truncate(_, 0, _Append, sentence, Last, _AccState, _Acc) ->
    lists:reverse(Last);
truncate(_, 0, Append, _, [], _AccState, Acc) ->
    %% No break point was seen: cut in the middle of the word.
    lists:reverse(insert_acc(Append, Acc));
truncate(_, 0, Append, _LastState, Last, _AccState, _Acc) ->
    lists:reverse(insert_acc(Append, Last));
truncate([C|Rest], N, Append, LastState, Last, AccState, Acc)
    when C == $.; C == $!; C == $? ->
    %% End of sentence: the break point includes the punctuation.
    case AccState of
        in_word -> truncate(Rest, N-1, Append, sentence, [C|Acc], sentence, [C|Acc]);
        word    -> truncate(Rest, N-1, Append, sentence, [C|Acc], sentence, [C|Acc]);
        _       -> truncate(Rest, N-1, Append, LastState, Last, sentence, [C|Acc])
    end;
truncate([C|Rest], N, Append, LastState, Last, AccState, Acc)
    when C == $;; C == $-; C == $, ->
    %% NOTE(review): this break is recorded as 'sentence', so nothing is
    %% appended when the text is cut here - confirm that this is intended.
    case AccState of
        in_word -> truncate(Rest, N-1, Append, sentence, Acc, word, [C|Acc]);
        _       -> truncate(Rest, N-1, Append, LastState, Last, word, [C|Acc])
    end;
truncate([C|Rest], N, Append, LastState, Last, AccState, Acc)
    when C == 32; C == 9; C == 10; C == 13; C == $/; C == $|; C == $(; C == $); C == $" ->
    %% Word separator: the break point is the text before the separator.
    case AccState of
        in_word -> truncate(Rest, N-1, Append, word, Acc, word, [C|Acc]);
        _       -> truncate(Rest, N-1, Append, LastState, Last, word, [C|Acc])
    end;
truncate([$&|_]=Input, N, Append, LastState, Last, AccState, Acc) ->
    %% A character entity counts as a single character.
    {Rest1,Acc1} = get_entity(Input,Acc),
    case AccState of
        in_word -> truncate(Rest1, N-1, Append, word, Acc1, word, Acc1);
        _       -> truncate(Rest1, N-1, Append, LastState, Last, word, Acc1)
    end;
%% Overlong encoding: start of a 2-byte sequence, but code point <= 127
truncate([X,A|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 192, X =< 193 ->
    truncate(Rest, N-1, Append, LastState, Last, in_word, [A,X|Acc]);
%% Start of 2-byte sequence
truncate([X,A|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 194, X =< 223 ->
    truncate(Rest, N-1, Append, LastState, Last, in_word, [A,X|Acc]);
%% Start of 3-byte sequence
truncate([X,A,B|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 224, X =< 239 ->
    truncate(Rest, N-1, Append, LastState, Last, in_word, [B,A,X|Acc]);
%% Start of 4-byte sequence
truncate([X,A,B,C|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 240, X =< 244 ->
    truncate(Rest, N-1, Append, LastState, Last, in_word, [C,B,A,X|Acc]);
%% Restricted by RFC 3629: start of 4-byte sequence for codepoint above 10FFFF
truncate([X,A,B,C|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 245, X =< 247 ->
    truncate(Rest, N-1, Append, LastState, Last, in_word, [C,B,A,X|Acc]);
%% Restricted by RFC 3629: start of 5-byte sequence
truncate([X,A,B,C,D|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 248, X =< 251 ->
    truncate(Rest, N-1, Append, LastState, Last, in_word, [D,C,B,A,X|Acc]);
%% Restricted by RFC 3629: start of 6-byte sequence
truncate([X,A,B,C,D,E|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 252, X =< 253 ->
    truncate(Rest, N-1, Append, LastState, Last, in_word, [E,D,C,B,A,X|Acc]);
%% Any other character
truncate([C|Rest], N, Append, LastState, Last, _AccState, Acc) ->
    truncate(Rest, N-1, Append, LastState, Last, in_word, [C|Acc]).

%% Push the characters of a string onto a reversed accumulator.
insert_acc([], Acc) ->
    Acc;
insert_acc([H|T], Acc) ->
    insert_acc(T, [H|Acc]).

%% Move a character entity from the input onto the reversed accumulator and
%% return {RestOfInput, Acc}. Only "&", followed by at most 32 letters, digits
%% or "#" and a closing ";" is taken as an entity. A bare "&" is moved on its
%% own; it used to swallow all text up to the next ";" or the end of the
%% input, which disabled the truncation.
get_entity([], Acc) ->
    {[],Acc};
get_entity([$&|Rest], Acc) ->
    case get_entity_name(Rest, [$&|Acc], 32) of
        {ok, Rest1, Acc1} -> {Rest1, Acc1};
        error -> {Rest, [$&|Acc]}
    end;
get_entity([C|Rest], Acc) ->
    {Rest, [C|Acc]}.

%% Scan the name of an entity up to and including the closing ";".
get_entity_name([$;|Rest], Acc, _Left) ->
    {ok, Rest, [$;|Acc]};
get_entity_name([C|Rest], Acc, Left)
    when Left > 0,
         (C =:= $# orelse
          (C >= $0 andalso C =< $9) orelse
          (C >= $a andalso C =< $z) orelse
          (C >= $A andalso C =< $Z)) ->
    get_entity_name(Rest, [C|Acc], Left - 1);
get_entity_name(_, _, _) ->
    error.
%% @doc Truncate a string after the given number of words and append the
%% ellipsis character when words were cut off.
truncatewords(S, Words) ->
    truncatewords(S, Words, "…").

%% @doc As truncatewords/2, with the text to append given by the caller. A
%% binary is converted to a list first; the result is always a list.
truncatewords(S, Words, Append) when is_binary(S) ->
    truncatewords(z_convert:to_list(S), in_space, Words, Append, []);
truncatewords(S, Words, Append) when is_list(S) ->
    truncatewords(S, in_space, Words, Append, []).

%% truncatewords(Input, State, WordsLeft, Append, Acc): Acc is the reversed
%% output, State tells whether the previous character was part of a word.
truncatewords(_S, _State, 0, Append, Acc) ->
    %% Word budget used up: drop the separator that ended the last word.
    Kept = trim_left_func(Acc, fun iswordsep/1),
    lists:reverse(Kept, Append);
truncatewords([], _State, _Words, _Append, Acc) ->
    lists:reverse(Acc);
truncatewords([C|Rest], State, Words, Append, Acc) ->
    case {iswordsep(C), State} of
        {true, in_word} ->
            %% A separator directly after a word completes that word.
            truncatewords(Rest, in_space, Words-1, Append, [C|Acc]);
        {true, in_space} ->
            truncatewords(Rest, in_space, Words, Append, [C|Acc]);
        {false, _} ->
            truncatewords(Rest, in_word, Words, Append, [C|Acc])
    end.

%% True for the characters that separate words.
iswordsep(C) ->
    lists:member(C, [$\s, $\n, $\r, $\t, $,, $:, $;]).
%% @doc Split the binary into lines. Line separators can be \r, \n or \r\n.
%% The separators are not part of the returned lines; a separator at the end
%% of the input gives a final empty line.
split_lines(B) when is_binary(B) ->
    %% When several separators match at one position the longest one wins,
    %% so "\r\n" is taken as a single separator.
    binary:split(B, [<<13,10>>, <<13>>, <<10>>], [global]).
%% @doc Escape special characters for ical RFC2445 elements. Accepts an
%% iolist, a binary or an atom and returns a binary. The output is folded
%% with CR LF and a space once 70 or more bytes were written to a line.
escape_ical(A) when is_atom(A) ->
    escape_ical(atom_to_list(A));
escape_ical(L) when is_list(L) ->
    escape_ical(iolist_to_binary(L));
escape_ical(B) when is_binary(B) ->
    escape_ical(B, <<>>, 0).

%% escape_ical(Input, Output, Col): Col is the number of bytes written since
%% the last fold.
escape_ical(<<>>, Out, _Col) ->
    Out;
escape_ical(In, Out, Col) when Col >= 70 ->
    escape_ical(In, <<Out/binary, "\r\n ">>, 0);
escape_ical(<<"\r\n", In/binary>>, Out, Col) ->
    escape_ical(In, <<Out/binary, "\\n">>, Col + 2);
escape_ical(<<C, In/binary>>, Out, Col) ->
    Escaped = escape_ical_char(C),
    escape_ical(In, <<Out/binary, Escaped/binary>>, Col + byte_size(Escaped)).

%% The escaped form of a single byte.
escape_ical_char($\n) -> <<"\\n">>;
escape_ical_char($\t) -> <<" ">>;
escape_ical_char($")  -> <<"\\\"">>;
escape_ical_char($,)  -> <<"\\,">>;
escape_ical_char($:)  -> <<"\":\"">>;
escape_ical_char($;)  -> <<"\\;">>;
escape_ical_char($\\) -> <<"\\\\">>;
escape_ical_char(C)   -> <<C>>.
%% @doc Return true if Start is a prefix of the string given as second
%% argument. Both arguments may be binaries or iolists.
%% @spec starts_with(String, String) -> bool()
starts_with(Start, B) when is_binary(Start), is_binary(B) ->
    PrefixLen = byte_size(Start),
    byte_size(B) >= PrefixLen andalso binary:part(B, 0, PrefixLen) =:= Start;
starts_with(Start, String) ->
    starts_with(iolist_to_binary(Start), iolist_to_binary(String)).
%% @doc Return true iff the string given as second argument ends with End.
%% Both arguments may be binaries or iolists.
%% @spec ends_with(String, String) -> bool()
ends_with(End, B) when is_binary(End), is_binary(B) ->
    SuffixLen = byte_size(End),
    %% A negative length makes binary:part/3 count back from the position.
    byte_size(B) >= SuffixLen
        andalso binary:part(B, byte_size(B), -SuffixLen) =:= End;
ends_with(End, String) ->
    ends_with(iolist_to_binary(End), iolist_to_binary(String)).
%% @doc Return true iff What is found in the string given as second
%% argument. Both arguments may be binaries or iolists. The empty string is
%% found in every string.
%% @spec contains(String, String) -> bool()
contains(What, B) when is_binary(What), is_binary(B) ->
    case What of
        %% binary:match/2 does not accept an empty pattern.
        <<>> -> true;
        _ -> binary:match(B, What) =/= nomatch
    end;
contains(What, String) ->
    contains(iolist_to_binary(What), iolist_to_binary(String)).
%% @doc Split a string, see http://www.erlang.org/pipermail/erlang-questions/2008-October/038896.html
%% The separator is a string. An empty separator splits between all
%% characters; otherwise the parts between the occurrences of the separator
%% are returned, including empty parts.
%% @spec split(String, String) -> list()
split(String, []) ->
    %% The empty separator is deemed to occur _between_ adjacent characters,
    %% but not at the beginning or the end.
    [ [C] || C <- String ];
split(String, [Sep]) when is_integer(Sep) ->
    split_char(String, Sep, [], []);
split(String, [C1,C2|_] = Sep) when is_integer(C1), is_integer(C2) ->
    split_seq(String, Sep, [], []).

%% Split at a single character separator. Cur is the reversed part being
%% collected, Parts the reversed list of finished parts.
split_char([], _Sep, Cur, Parts) ->
    lists:reverse([lists:reverse(Cur)|Parts]);
split_char([Sep|Tail], Sep, Cur, Parts) ->
    split_char(Tail, Sep, [], [lists:reverse(Cur)|Parts]);
split_char([Chr|Tail], Sep, Cur, Parts) ->
    split_char(Tail, Sep, [Chr|Cur], Parts).

%% Split at a separator of two or more characters.
split_seq([], _Sep, Cur, Parts) ->
    lists:reverse([lists:reverse(Cur)|Parts]);
split_seq([Chr|Tail] = String, Sep, Cur, Parts) ->
    case split_prefix(Sep, String) of
        no -> split_seq(Tail, Sep, [Chr|Cur], Parts);
        Rest -> split_seq(Rest, Sep, [], [lists:reverse(Cur)|Parts])
    end.

%% Return the text after the prefix when the string starts with the prefix,
%% 'no' otherwise.
split_prefix([C|L], [C|S]) -> split_prefix(L, S);
split_prefix([], S) -> S;
split_prefix(_, _) -> no.
%% @doc Simple self test for the case mapping, to_name/1 and first_char/1.
%% Returns ok; a failing check raises a badmatch. The multi-byte sample
%% strings are written as UTF-8 byte values so that the checks do not depend
%% on the encoding of this source file.
test() ->
    %% u-diaeresis, c-cedilla, followed by "gen"
    A = [195,188,195,167|"gen"],
    A = to_lower(to_upper(A)),
    "ucgen" = to_name(A),
    "a" = first_char("aap"),
    %% A string that starts with a two byte character (U+0430)
    [208,176] = first_char([208,176,208,177,208,178,$x,$x]),
    %% A two byte character (U+0105) followed by ASCII
    [196,133] = first_char([196,133|"aap"]),
    ok.