z_string.erl | searchcode

/src/support/z_string.erl

https://code.google.com/p/zotonic/ · Erlang · 856 lines · 675 code · 88 blank · 93 comment · 21 complexity · a31d95872c65a2562832bd04ff89e3c8 MD5 · raw file

%% @author Marc Worrell <marc@worrell.nl>
%% @copyright 2009-2010 Marc Worrell
%% Date: 2009-04-26
%% @doc String related functions
%% @todo Make this UTF-8 safe

%% @todo Check valid chars for filenames, allow chinese, japanese, etc?
%% CJK Unified Ideographs Extension A: Range: 3400-4DBF
%% CJK Unified Ideographs: Range: 4E00-9FAF
%% Kangxi Radicals: Range 2F00-2FDF
%% See also: http://www.utf8-chartable.de/

%% Copyright 2009-2010 Marc Worrell
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%% 
%%     http://www.apache.org/licenses/LICENSE-2.0
%% 
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.

-module(z_string).
-author("Marc Worrell <marc@worrell.nl").

%% interface functions
-export([
    trim/1,
    trim_left/1,
    trim_right/1,
    trim/2,
    trim_left/2,
    trim_right/2,
    trim_left_func/2,
    is_string/1,
    first_char/1,
    last_char/1,
    unquote/1,
    unquote/2,
    nospaces/1,
    line/1,
    to_rootname/1,
    to_name/1,
    to_slug/1,
    to_lower/1,
    to_upper/1,
    replace/3,
    truncate/2,
    truncate/3,
    truncatewords/2,
    truncatewords/3,
    split_lines/1,
    escape_ical/1,
    starts_with/2,
    ends_with/2,
    contains/2,
    split/2,
    test/0
]).

-include_lib("include/zotonic.hrl").


%% @doc Strip whitespace from both ends of the string. A binary stays a binary;
%% a list is treated as an iolist, flattened to a binary for trimming and
%% handed back as a list of bytes.
trim(Text) when is_list(Text) ->
    Flat = iolist_to_binary(Text),
    binary_to_list(trim(Flat));
trim(Text) when is_binary(Text) ->
    LeftTrimmed = trim_left(Text),
    trim_right(LeftTrimmed).

%% @doc Strip every occurrence of Char from both ends of the string. Input
%% handling is the same as for trim/1.
trim(Text, Char) when is_list(Text) ->
    Flat = iolist_to_binary(Text),
    binary_to_list(trim(Flat, Char));
trim(Text, Char) when is_binary(Text) ->
    LeftTrimmed = trim_left(Text, Char),
    trim_right(LeftTrimmed, Char).


%% @doc Remove whitespace at the start of the string. Every character with a
%% value of 32 or lower is considered whitespace.
trim_left(S) ->
    IsBlank = fun(Ch) -> Ch =< 32 end,
    trim_left_func(S, IsBlank).

%% @doc Remove all occurrences of Char at the start of the string.
trim_left(S, Char) ->
    IsChar = fun(Ch) -> Ch == Char end,
    trim_left_func(S, IsChar).


%% @doc Remove the leading characters for which the predicate F returns true.
%% The input may be a binary, a flat string, or a list whose head is itself a
%% list or binary (iolist style). In the nested case the head is trimmed first
%% and dropped entirely when nothing remains of it. Any other term, including
%% the empty list and empty binary, is returned unchanged.
trim_left_func(<<Char, Rest/binary>> = Bin, F) ->
    case F(Char) of
        true -> trim_left_func(Rest, F);
        false -> Bin
    end;
trim_left_func([Char|Rest] = L, F) when is_integer(Char) ->
    case F(Char) of
        %% Keep trimming with the same predicate. Going through trim_left/2
        %% here would use F as the character to compare against, so trimming
        %% would stop after a single character.
        true -> trim_left_func(Rest, F);
        false -> L
    end;
trim_left_func([L|Rest], F) when is_list(L); is_binary(L) ->
    case trim_left_func(L, F) of
        [] -> trim_left_func(Rest, F);
        <<>> -> trim_left_func(Rest, F);
        Other -> [Other|Rest]
    end;
trim_left_func(Other, _F) ->
    Other.

    
	
%% @doc Remove trailing whitespace. Every byte with a value of 32 or lower
%% counts as whitespace. Binary input gives a binary; any other input is
%% treated as an iolist and the result is returned as a list of bytes.
trim_right(B) when is_binary(B) ->
    Keep = trim_right_size(B, byte_size(B)),
    <<Kept:Keep/binary, _/binary>> = B,
    Kept;
trim_right(L) ->
    binary_to_list(trim_right(iolist_to_binary(L))).

%% Walk backwards from the end of B and return the number of leading bytes
%% that remain once the trailing run of whitespace bytes is dropped.
trim_right_size(_B, 0) ->
    0;
trim_right_size(B, Size) ->
    Before = Size - 1,
    <<_:Before/binary, Last, _/binary>> = B,
    if
        Last =< 32 -> trim_right_size(B, Before);
        true -> Size
    end.

%% @doc Remove all occurrences of the byte Char at the end of the string.
%% Binary input gives a binary; any other input is treated as an iolist and
%% the result is returned as a list of bytes.
trim_right(B, Char) when is_binary(B) ->
    Keep = trim_right_size(B, Char, byte_size(B)),
    <<Kept:Keep/binary, _/binary>> = B,
    Kept;
trim_right(L, Char) ->
    binary_to_list(trim_right(iolist_to_binary(L), Char)).

%% Walk backwards from the end of B and return the number of leading bytes
%% that remain once the trailing run of Char bytes is dropped.
trim_right_size(_B, _Char, 0) ->
    0;
trim_right_size(B, Char, Size) ->
    Before = Size - 1,
    <<_:Before/binary, Last, _/binary>> = B,
    if
        Last =:= Char -> trim_right_size(B, Char, Before);
        true -> Size
    end.

%% @doc Tell whether the term is a flat list of byte-sized character codes,
%% i.e. probably a string. Accepted codes are 32..255 plus tab (9), newline
%% (10), form feed (12) and carriage return (13). Anything else, including
%% nested lists and non-lists, gives false.
is_string([]) ->
    true;
is_string([C|Rest]) when is_integer(C), C >= 32, C =< 255 ->
    is_string(Rest);
is_string([C|Rest]) when C =:= 9; C =:= 10; C =:= 12; C =:= 13 ->
    is_string(Rest);
is_string(_) ->
    false.


%% @doc Return the first character of a string, or undefined when it is empty.
%% For a list the result is whatever truncate/3 yields for a length of one
%% with an empty append string, so it is a (possibly multi-byte) string.
%% For a binary the result is the first byte as an integer.
%% @todo Make this UTF-8 safe
first_char(<<>>) ->
    undefined;
first_char(<<Byte, _/binary>>) ->
    Byte;
first_char([]) ->
    undefined;
first_char([H|_] = String) when is_integer(H) ->
    truncate(String, 1, "").


%% @doc Return the last element of a list or the last byte of a binary.
%% Empty input gives undefined.
last_char([]) ->
    undefined;
last_char([C|Rest]) ->
    case Rest of
        [] -> C;
        _ -> last_char(Rest)
    end;
last_char(<<>>) ->
    undefined;
last_char(Bin) when is_binary(Bin) ->
    Skip = byte_size(Bin) - 1,
    <<_:Skip/binary, Last>> = Bin,
    Last.


%% @doc Remove the first and last char if they are both double quotes.
unquote(S) ->
    unquote(S, $").

%% @doc Remove the first and last char if they are both equal to Q. Works on
%% lists and binaries. When the string does not both start and end with Q
%% it is returned untouched, as is any other term.
unquote(<<First, Inner/binary>> = S, Q) when First =:= Q ->
    unquote1(Inner, <<>>, Q, S);
unquote([First|Inner] = S, Q) when First =:= Q ->
    unquote1(Inner, [], Q, S);
unquote(S, _Q) ->
    S.

%% Collect everything up to a closing quote that is the very last character.
%% Running out of input without seeing it gives back the original string.
unquote1(<<>>, _Seen, _Q, Original) ->
    Original;
unquote1(<<C>>, Seen, Q, _Original) when C =:= Q ->
    Seen;
unquote1(<<C, Rest/binary>>, Seen, Q, Original) ->
    unquote1(Rest, <<Seen/binary, C>>, Q, Original);
unquote1([], _Seen, _Q, Original) ->
    Original;
unquote1([C], Seen, Q, _Original) when C =:= Q ->
    lists:reverse(Seen);
unquote1([C|Rest], Seen, Q, Original) ->
    unquote1(Rest, [C|Seen], Q, Original).


%% @doc Drop every space and control character (anything with a value of 32
%% or lower) from the string. A binary is converted to a list first; the
%% result is always a list.
nospaces(B) when is_binary(B) ->
    nospaces(binary_to_list(B));
nospaces(L) ->
    [C || C <- L, C > 32].



%% @doc Force the string onto a single line: every control character (value
%% below 32) becomes a space. A binary is converted to a list first; the
%% result is always a list of the same length as the input.
line(B) when is_binary(B) ->
    line(binary_to_list(B));
line(L) ->
    lists:map(fun(C) when C < 32 -> 32;
                 (C) -> C
              end, L).


%% @doc Return a lowercase string for the input. Accepts a (possibly nested)
%% list, a binary or an atom; the result is always a flat list.
%% @spec to_lower(Value) -> String
to_lower(Value) when is_list(Value) ->
    to_lower(lists:flatten(Value), []);
to_lower(Value) when is_binary(Value) ->
    to_lower(binary_to_list(Value), []);
to_lower(Value) when is_atom(Value) ->
    to_lower(atom_to_list(Value), []).

	%% Worker for to_lower/1. Walks a flat list and builds the result in
	%% reverse in Acc; the accumulator is reversed once the input is exhausted.
	%% ASCII A-Z is shifted by +32. The table clauses below push two integers
	%% in reversed order (e.g. [165,195|Acc] ends up as 195,165 in the result),
	%% presumably the UTF-8 bytes of the lower-case letter -- confirm.
	%% Anything without a clause is copied unchanged.
	to_lower([], Acc) -> lists:reverse(Acc);
	to_lower([H|T], Acc) when H >= $A andalso H =< $Z -> to_lower(T, [H+32|Acc]); 
	to_lower("Е"++T, Acc) -> to_lower(T, [165,195|Acc]);
	to_lower("Д"++T, Acc) -> to_lower(T, [164,195|Acc]);
	to_lower("Б"++T, Acc) -> to_lower(T, [161,195|Acc]);
	to_lower("А"++T, Acc) -> to_lower(T, [160,195|Acc]);
	to_lower("Л"++T, Acc) -> to_lower(T, [171,195|Acc]);
	to_lower("К"++T, Acc) -> to_lower(T, [170,195|Acc]);
	to_lower("Й"++T, Acc) -> to_lower(T, [169,195|Acc]);
	to_lower("И"++T, Acc) -> to_lower(T, [168,195|Acc]);
	to_lower("П"++T, Acc) -> to_lower(T, [175,195|Acc]);
	to_lower("О"++T, Acc) -> to_lower(T, [174,195|Acc]);
	to_lower("Н"++T, Acc) -> to_lower(T, [173,195|Acc]);
	to_lower("М"++T, Acc) -> to_lower(T, [172,195|Acc]);
	to_lower("Ь"++T, Acc) -> to_lower(T, [188,195|Acc]);
	to_lower("Ы"++T, Acc) -> to_lower(T, [187,195|Acc]);
	to_lower("Ъ"++T, Acc) -> to_lower(T, [186,195|Acc]);
	to_lower("Щ"++T, Acc) -> to_lower(T, [185,195|Acc]);
	to_lower("Ц"++T, Acc) -> to_lower(T, [182,195|Acc]);
	to_lower("Ф"++T, Acc) -> to_lower(T, [180,195|Acc]);
	to_lower("У"++T, Acc) -> to_lower(T, [179,195|Acc]);
	to_lower("Т"++T, Acc) -> to_lower(T, [178,195|Acc]);
	to_lower("Ш"++T, Acc) -> to_lower(T, [184,195|Acc]);
	to_lower("З"++T, Acc) -> to_lower(T, [167,195|Acc]);
	to_lower("Ж"++T, Acc) -> to_lower(T, [166,195|Acc]);
	to_lower("Њ"++T, Acc) -> to_lower(T, [147,197|Acc]);
	% Cyrillic support
	% NOTE(review): every clause from here down to the catch-all has the same
	% "?" pattern, so only the first one can ever match (it rewrites a literal
	% question mark) and the others are unreachable. This looks like non-ASCII
	% literals lost in an encoding conversion -- restore from the original file.
	to_lower("?"++T, Acc) -> to_lower(T, [176,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [177,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [178,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [179,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [180,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [181,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [145,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [182,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [183,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [184,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [185,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [186,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [187,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [188,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [189,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [190,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [191,208|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [128,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [129,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [130,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [131,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [132,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [133,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [134,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [135,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [136,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [137,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [138,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [139,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [140,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [141,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [142,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [143,209|Acc]);
	% Extra Ukrainian characters
	to_lower("?"++T, Acc) -> to_lower(T, [145,210|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [151,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [150,209|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [148,209|Acc]);
	% Polish support
	to_lower("?"++T, Acc) -> to_lower(T, [133,196|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [153,196|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [135,196|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [130,197|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [132,197|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [155,197|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [186,197|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [188,197|Acc]);
    % Turkish support
	to_lower("?"++T, Acc) -> to_lower(T, [159,197|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [159,196|Acc]);
	to_lower("?"++T, Acc) -> to_lower(T, [177,196|Acc]);
	% Other characters are taken as-is
	to_lower([H|T], Acc) -> to_lower(T, [H|Acc]).


%% @doc Return an uppercase string for the input. Accepts a (possibly nested)
%% list, a binary or an atom; the result is always a flat list.
%% @spec to_upper(Value) -> String
to_upper(Value) when is_list(Value) ->
    to_upper(lists:flatten(Value), []);
to_upper(Value) when is_binary(Value) ->
    to_upper(binary_to_list(Value), []);
to_upper(Value) when is_atom(Value) ->
    to_upper(atom_to_list(Value), []).

	%% Worker for to_upper/1. Walks a flat list and builds the result in
	%% reverse in Acc; the accumulator is reversed once the input is exhausted.
	%% ASCII a-z is shifted by -32. The table clauses below push two integers
	%% in reversed order (e.g. [133,195|Acc] ends up as 195,133 in the result),
	%% presumably the UTF-8 bytes of the upper-case letter -- confirm.
	%% Anything without a clause is copied unchanged.
	to_upper([], Acc) -> lists:reverse(Acc);
	to_upper([H|T], Acc) when H >= $a andalso H =< $z -> to_upper(T, [H-32|Acc]); 
	to_upper("е"++T, Acc) -> to_upper(T, [133,195|Acc]);
	to_upper("д"++T, Acc) -> to_upper(T, [132,195|Acc]);
	to_upper("б"++T, Acc) -> to_upper(T, [129,195|Acc]);
	to_upper("а"++T, Acc) -> to_upper(T, [128,195|Acc]);
	to_upper("л"++T, Acc) -> to_upper(T, [139,195|Acc]);
	to_upper("к"++T, Acc) -> to_upper(T, [138,195|Acc]);
	to_upper("й"++T, Acc) -> to_upper(T, [137,195|Acc]);
	to_upper("и"++T, Acc) -> to_upper(T, [136,195|Acc]);
	to_upper("п"++T, Acc) -> to_upper(T, [143,195|Acc]);
	% NOTE(review): the literal in the next clause appears to be the same one
	% that to_lower/2 maps to [174,195], unlike its lower-case neighbours here.
	% It looks like the upper-case form was used where the lower-case one was
	% intended -- confirm.
	to_upper("О"++T, Acc) -> to_upper(T, [142,195|Acc]);
	to_upper("н"++T, Acc) -> to_upper(T, [141,195|Acc]);
	to_upper("м"++T, Acc) -> to_upper(T, [140,195|Acc]);
	to_upper("ь"++T, Acc) -> to_upper(T, [156,195|Acc]);
	to_upper("ы"++T, Acc) -> to_upper(T, [155,195|Acc]);
	to_upper("ъ"++T, Acc) -> to_upper(T, [154,195|Acc]);
	to_upper("щ"++T, Acc) -> to_upper(T, [153,195|Acc]);
	to_upper("ц"++T, Acc) -> to_upper(T, [150,195|Acc]);
	to_upper("ф"++T, Acc) -> to_upper(T, [148,195|Acc]);
	to_upper("у"++T, Acc) -> to_upper(T, [147,195|Acc]);
	to_upper("т"++T, Acc) -> to_upper(T, [146,195|Acc]);
	to_upper("ш"++T, Acc) -> to_upper(T, [152,195|Acc]);
	to_upper("з"++T, Acc) -> to_upper(T, [135,195|Acc]);
	to_upper("ж"++T, Acc) -> to_upper(T, [134,195|Acc]);
	to_upper("њ"++T, Acc) -> to_upper(T, [146,197|Acc]);
	% Cyrillic support
	% NOTE(review): every clause from here down to the catch-all has the same
	% "?" pattern, so only the first one can ever match (it rewrites a literal
	% question mark) and the others are unreachable. This looks like non-ASCII
	% literals lost in an encoding conversion -- restore from the original file.
	to_upper("?"++T, Acc) -> to_upper(T, [144,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [145,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [146,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [147,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [148,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [149,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [129,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [150,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [151,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [152,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [153,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [154,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [155,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [156,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [157,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [158,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [159,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [160,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [161,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [162,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [163,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [164,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [165,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [166,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [167,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [168,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [169,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [170,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [171,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [172,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [173,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [174,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [175,208|Acc]);
	% Extra Ukrainian characters
	to_upper("?"++T, Acc) -> to_upper(T, [144,210|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [135,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [143,208|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [132,208|Acc]);
	% Polish support
	to_upper("?"++T, Acc) -> to_upper(T, [132,196|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [152,196|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [134,196|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [129,197|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [131,197|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [154,197|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [185,197|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [187,197|Acc]);
	% Turkish support
	to_upper("?"++T, Acc) -> to_upper(T, [158,197|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [158,196|Acc]);
	to_upper("?"++T, Acc) -> to_upper(T, [176,196|Acc]);

	% Other chars are taken as-is
	to_upper([H|T], Acc) -> to_upper(T, [H|Acc]).

%% @doc Turn a filename into a safe basename: the directory part and the
%% extension are removed and what is left is passed through to_slug/1.
%% @spec to_rootname(string()) -> string()
to_rootname(Filename) ->
    Basename = filename:basename(Filename),
    Rootname = filename:rootname(Basename),
    to_slug(Rootname).


%% @doc Map a string to a slug that can be used in the uri of a page. This is
%% the result of to_name/1 with every underscore replaced by a dash.
%% @spec to_slug(String) -> String
to_slug(Title) ->
    Dashify = fun($_) -> $-;
                 (C) -> C
              end,
    lists:map(Dashify, to_name(Title)).


%% @doc Map a string to a value that can be used as a name or slug. Maps all
%% characters to lowercase and removes non alphanumeric characters.
%% Accepts a list, a binary, an atom or a {trans, Translations} tuple. For a
%% translation the `en' entry is used when present, otherwise the value of
%% the first {Language, Value} pair, otherwise the empty string.
%% @spec to_name(String) -> String
to_name({trans, Tr}) ->
    to_name(to_name_translation(Tr));
to_name(Name) when is_atom(Name) ->
    to_name(atom_to_list(Name));
to_name(Name) when is_binary(Name) ->
    to_name(binary_to_list(Name));
to_name(Name) ->
    to_name(Name, [], 0).

%% Pick the text of a translation list that to_name/1 should work on.
to_name_translation(Tr) ->
    case proplists:get_value(en, Tr) of
        undefined ->
            case Tr of
                [{_, First}|_] -> First;
                _ -> []
            end;
        English ->
            English
    end.

%% Worker for to_name/1. Arguments are the remaining input, the output built
%% so far in reverse, and a running counter. Once the counter reaches 80 the
%% rest of the input is ignored. At the end the output is reversed and
%% stripped of leading and trailing underscores; an empty result becomes "_".
%% A-Z is lowercased; a-z, 0-9 and underscore are copied; the table clauses map
%% specific letters to ASCII replacements (the counter grows by the number of
%% characters emitted); every other character becomes an underscore, with
%% consecutive underscores collapsed into one.
to_name([], Acc, _I) ->
    case string:strip(lists:reverse(Acc), both, $_) of
        [] -> "_";
        Name -> Name
    end;
to_name(_, Acc, N) when N >= 80 ->
    to_name([], Acc, 80);
to_name([C|T], Acc, I) when C >= $A andalso C =< $Z ->
    to_name(T, [C+32|Acc], I+1);
to_name([C|T], Acc, I) when (C >= $a andalso C =< $z) orelse (C >= $0 andalso C =< $9) orelse C =:= $_ ->
    to_name(T, [C|Acc], I+1);
to_name("д"++T, Acc, I) -> to_name(T, [$a|Acc], I+1);
to_name("л"++T, Acc, I) -> to_name(T, [$e|Acc], I+1);
to_name("п"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
to_name("ь"++T, Acc, I) -> to_name(T, [$u|Acc], I+1);
to_name("ц"++T, Acc, I) -> to_name(T, [$o|Acc], I+1);
to_name("Д"++T, Acc, I) -> to_name(T, [$a|Acc], I+1);
to_name("Л"++T, Acc, I) -> to_name(T, [$e|Acc], I+1);
to_name("П"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
to_name("Ь"++T, Acc, I) -> to_name(T, [$u|Acc], I+1);
to_name("Ц"++T, Acc, I) -> to_name(T, [$o|Acc], I+1);
to_name("й"++T, Acc, I) -> to_name(T, [$e|Acc], I+1);
to_name("и"++T, Acc, I) -> to_name(T, [$e|Acc], I+1);
to_name("Й"++T, Acc, I) -> to_name(T, [$e|Acc], I+1);
to_name("И"++T, Acc, I) -> to_name(T, [$e|Acc], I+1);
to_name("н"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
to_name("м"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
to_name("Н"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
to_name("М"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
to_name("ъ"++T, Acc, I) -> to_name(T, [$u|Acc], I+1);
to_name("щ"++T, Acc, I) -> to_name(T, [$u|Acc], I+1);
to_name("Ъ"++T, Acc, I) -> to_name(T, [$u|Acc], I+1);
to_name("Щ"++T, Acc, I) -> to_name(T, [$u|Acc], I+1);
to_name("у"++T, Acc, I) -> to_name(T, [$o|Acc], I+1);
to_name("т"++T, Acc, I) -> to_name(T, [$o|Acc], I+1);
to_name("У"++T, Acc, I) -> to_name(T, [$o|Acc], I+1);
to_name("Т"++T, Acc, I) -> to_name(T, [$o|Acc], I+1);
to_name("Я"++T, Acc, I) -> to_name(T, [$s,$s|Acc], I+2);
to_name("з"++T, Acc, I) -> to_name(T, [$c|Acc], I+1);
to_name("З"++T, Acc, I) -> to_name(T, [$c|Acc], I+1);
to_name("ш"++T, Acc, I) -> to_name(T, [$o|Acc], I+1);
to_name("Ш"++T, Acc, I) -> to_name(T, [$o|Acc], I+1);
to_name("е"++T, Acc, I) -> to_name(T, [$a|Acc], I+1);
to_name("Е"++T, Acc, I) -> to_name(T, [$a|Acc], I+1);
to_name("Ђ"++T, Acc, I) -> to_name(T, [$e|Acc], I+1);
to_name("я"++T, Acc, I) -> to_name(T, [$i,$j|Acc], I+2);
to_name("@"++T, Acc, I) -> to_name(T, [$_,$t,$a,$_|Acc], I+4);
% Cyrillic support (from http://en.wikipedia.org/wiki/Romanization_of_Russian)
% NOTE(review): every clause from here down to the entity clauses has the same
% "?" pattern, so only the first one can ever match (a literal question mark
% becomes "a") and the others are unreachable. This looks like non-ASCII
% literals lost in an encoding conversion -- restore from the original file.
to_name("?"++T, Acc, I) -> to_name(T, [$a|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$a|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$b|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$b|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$v|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$v|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$g|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$g|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$d|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$d|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$e|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$e|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$o,$y|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$o,$y|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$h,$z|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$h,$z|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$z|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$z|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$j|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$j|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$k|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$k|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$l|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$l|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$m|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$m|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$n|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$n|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$o|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$o|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$p|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$p|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$r|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$r|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$s|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$s|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$t|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$t|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$u|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$u|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$f|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$f|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$h|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$h|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$c|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$c|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$h,$c|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$h,$c|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$h,$s|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$h,$s|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$h,$h,$s|Acc], I+3);
to_name("?"++T, Acc, I) -> to_name(T, [$h,$h,$s|Acc], I+3);
to_name("?"++T, Acc, I) -> to_name(T, [$_|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$_|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$y|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$y|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$_|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$_|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$h,$e|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$h,$e|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$u,$y|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$u,$y|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$a,$y|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$a,$y|Acc], I+2);
% Ukrainian support
to_name("?"++T, Acc, I) -> to_name(T, [$g|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$g|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$e,$y|Acc], I+2);
to_name("?"++T, Acc, I) -> to_name(T, [$e,$y|Acc], I+2);
% Polish support
to_name("?"++T, Acc, I) -> to_name(T, [$a|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$a|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$e|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$e|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$c|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$c|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$l|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$l|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$n|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$n|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$s|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$s|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$z|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$z|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$z|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$z|Acc], I+1);
% Turkish support
to_name("?"++T, Acc, I) -> to_name(T, [$s|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$s|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$g|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$g|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
to_name("?"++T, Acc, I) -> to_name(T, [$i|Acc], I+1);
% Some entities - we might want to add generic code here, depends
% on where to_name/1 is used (can we assume that the input is always html?)
to_name("&amp;"++T, Acc, I) -> to_name(T, [$_|Acc], I+1);
to_name("&lt;"++T, Acc, I) -> to_name(T, [$_|Acc], I+1);
to_name("&gt;"++T, Acc, I) -> to_name(T, [$_|Acc], I+1);
to_name("&#39;"++T, Acc, I) -> to_name(T, [$_|Acc], I+1);
% Other sequences of characters are mapped to $_
% When the output already ends in an underscore the character is dropped
% instead, so a run of unmapped characters yields a single underscore.
to_name([_C|T], [$_|_] = Acc, I) ->
    to_name(T, Acc, I+1);
to_name([_C|T], Acc, I) ->
    to_name(T, [$_|Acc], I+1).


%% @doc Replace every occurrence of the string S1 inside String with S2.
%% Occurrences are found left to right and do not overlap; the inserted
%% replacement text is not scanned again. All three arguments must be lists.
%% An empty search string leaves String unchanged (it used to recurse without
%% ever consuming input).
%% Copyright 2008 Rusty Klophaus  (Nitrogen, MIT License)
replace([], _, _) -> [];
replace(String, [], S2) when is_list(String), is_list(S2) ->
    String;
replace(String, S1, S2) when is_list(String), is_list(S1), is_list(S2) ->
    replace_scan(String, S1, S2).

%% Copy String to the output, substituting S2 wherever S1 starts.
replace_scan([], _S1, _S2) ->
    [];
replace_scan([H|T] = String, S1, S2) ->
    case replace_strip_prefix(S1, String) of
        {ok, Rest} -> S2 ++ replace_scan(Rest, S1, S2);
        nomatch -> [H|replace_scan(T, S1, S2)]
    end.

%% Return {ok, Rest} when the first argument is a prefix of the second, where
%% Rest is what follows the prefix; nomatch otherwise.
replace_strip_prefix([], Rest) ->
    {ok, Rest};
replace_strip_prefix([C|Prefix], [C|String]) ->
    replace_strip_prefix(Prefix, String);
replace_strip_prefix(_, _) ->
    nomatch.


%% @doc Truncate a string to at most Max characters, using the ellipsis
%% character as the marker that is appended at the place of break off.
%% @spec truncate(String, int()) -> String
truncate(Text, Max) ->
	truncate(Text, Max, "…").

%% @doc Truncate a string to at most Max characters with a caller supplied
%% append string. A binary is converted with z_convert:to_list/1 first. A
%% length of zero or less gives the empty list.
truncate(Text, Max, Append) when is_binary(Text) ->
	truncate(z_convert:to_list(Text), Max, Append);
truncate(Text, Max, Append) when Max > 0 ->
	truncate(Text, Max, Append, in_word, [], in_word, []);
truncate(_Text, _Max, _Append) ->
	[].
	

	%% Worker for truncate/3. Arguments:
	%%   1. remaining input (flat list)
	%%   2. N, the number of characters that may still be consumed
	%%   3. Append, the text added when the input is cut off
	%%   4. LastState and 5. Last: the kind of the most recently remembered
	%%      break point and the (reversed) output as it was at that point
	%%   6. AccState and 7. Acc: the kind of the last consumed character
	%%      (in_word, word or sentence) and the (reversed) output so far
	%% Every clause that consumes input lowers N by one, whether it consumed a
	%% single character, a whole "&...;" entity or a multi-byte sequence.

	%% Input exhausted before the limit: everything is returned, no Append.
	truncate([], _, _Append, _LastState, _Last, _AccState, Acc) ->
		lists:reverse(Acc);
	%% Limit reached and the remembered break point is a sentence end: cut
	%% there and do not add Append.
	truncate(_, 0, _Append, sentence, Last, _AccState, _Acc) ->
		lists:reverse(Last);
	%% Limit reached without any remembered break point: cut right here.
	truncate(_, 0, Append, _, [], _AccState, Acc) ->
		lists:reverse(insert_acc(Append, Acc));
	%% Limit reached: cut at the remembered break point and add Append.
	truncate(_, 0, Append, _LastState, Last, _AccState, _Acc) ->
		lists:reverse(insert_acc(Append, Last));
	%% Sentence terminators. Directly after a word or a word separator the
	%% break point is set to just after the terminator.
	truncate([C|Rest], N, Append, LastState, Last, AccState, Acc) 
		when C == $.; C == $!; C == $? ->
			case AccState of
				in_word -> truncate(Rest, N-1, Append, sentence, [C|Acc], sentence, [C|Acc]);
				word    -> truncate(Rest, N-1, Append, sentence, [C|Acc], sentence, [C|Acc]);
				_ 		-> truncate(Rest, N-1, Append, LastState, Last,   sentence, [C|Acc])
			end;
	%% Semicolon, dash and comma. Directly after a word the break point is set
	%% to just before the punctuation and is marked as a sentence break.
	truncate([C|Rest], N, Append, LastState, Last, AccState, Acc) 
		when C == $;; C == $-; C == $, ->
			case AccState of
				in_word -> truncate(Rest, N-1, Append, sentence,  Acc,  word, [C|Acc]);
				_ 		-> truncate(Rest, N-1, Append, LastState, Last, word, [C|Acc])
			end;
	%% Whitespace and other word separators. Directly after a word the break
	%% point is set to just before the separator.
	truncate([C|Rest], N, Append, LastState, Last, AccState, Acc) 
		when C == 32; C == 9; C == 10; C == 13; C == $/; C == $|; C == $(; C == $); C == $" ->
			case AccState of
				in_word -> truncate(Rest, N-1, Append, word, Acc, word, [C|Acc]);
				_       -> truncate(Rest, N-1, Append, LastState, Last, word, [C|Acc])
			end;
	%% An ampersand starts an entity: get_entity/2 copies everything up to and
	%% including the next semicolon (or the end of input) to the output.
	truncate([$&|_]=Input, N, Append, LastState, Last, AccState, Acc) ->
		{Rest1,Acc1} = get_entity(Input,Acc),
		case AccState of
			in_word -> truncate(Rest1, N-1, Append, word, Acc1, word, Acc1);
			_ 		-> truncate(Rest1, N-1, Append, LastState, Last, word, Acc1)
		end;

	%% The clauses below keep the bytes of one multi-byte sequence together,
	%% selected on the value of the lead byte. They only match when enough
	%% bytes follow; otherwise the catch-all clause handles the lead byte.
	%% Overlong encoding: start of a 2-byte sequence, but code point <= 127
	truncate([X,A|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 192, X =< 193 ->
		truncate(Rest, N-1, Append, LastState, Last, in_word, [A,X|Acc]);
	%% Start of 2-byte sequence
	truncate([X,A|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 194, X =< 223 ->
		truncate(Rest, N-1, Append, LastState, Last, in_word, [A,X|Acc]);
	%% Start of 3-byte sequence
	truncate([X,A,B|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 224, X =< 239 ->
		truncate(Rest, N-1, Append, LastState, Last, in_word, [B,A,X|Acc]);
	%% Start of 4-byte sequence
	truncate([X,A,B,C|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 240, X =< 244 ->
		truncate(Rest, N-1, Append, LastState, Last, in_word, [C,B,A,X|Acc]);
	%% Restricted by RFC 3629: start of 4-byte sequence for codepoint above 10FFFF
	truncate([X,A,B,C|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 245, X =< 247 ->
		truncate(Rest, N-1, Append, LastState, Last, in_word, [C,B,A,X|Acc]);
	%% Restricted by RFC 3629: start of 5-byte sequence
	truncate([X,A,B,C,D|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 248, X =< 251 ->
		truncate(Rest, N-1, Append, LastState, Last, in_word, [D,C,B,A,X|Acc]);
	%% Restricted by RFC 3629: start of 6-byte sequence
	truncate([X,A,B,C,D,E|Rest], N, Append, LastState, Last, _AccState, Acc) when X >= 252, X =< 253 ->
		truncate(Rest, N-1, Append, LastState, Last, in_word, [E,D,C,B,A,X|Acc]);
	
	%% Any other character
	truncate([C|Rest], N, Append, LastState, Last, _AccState, Acc) ->
		truncate(Rest, N-1, Append, LastState, Last, in_word, [C|Acc]).

	%% Push the characters of Chars one by one onto the reversed accumulator,
	%% so that they come out in their original order once Acc is reversed.
	insert_acc(Chars, Acc) ->
		lists:foldl(fun(Ch, Out) -> [Ch|Out] end, Acc, Chars).
	
    %% Move everything up to and including the first semicolon from the input
    %% onto the reversed accumulator. Returns {RemainingInput, NewAcc}. When
    %% there is no semicolon the whole input is moved.
    get_entity(Input, Acc) ->
        NotSemicolon = fun(Ch) -> Ch =/= $; end,
        {Entity, Tail} = lists:splitwith(NotSemicolon, Input),
        Pushed = lists:reverse(Entity, Acc),
        case Tail of
            [] -> {[], Pushed};
            [$;|Rest] -> {Rest, [$;|Pushed]}
        end.


%% @doc Truncate a string after the given number of words, using the ellipsis
%% character as the text that is appended when the limit is reached.
truncatewords(S, Words) ->
    truncatewords(S, Words, "…").

%% @doc Truncate a string after the given number of words. Append is added
%% when the word limit is reached. A binary is converted with
%% z_convert:to_list/1 first; the result is a list.
truncatewords(S, Words, Append) when is_binary(S) ->
    truncatewords(z_convert:to_list(S), in_space, Words, Append, []);
truncatewords(S, Words, Append) when is_list(S) ->
    truncatewords(S, in_space, Words, Append, []).

    %% Scanner behind truncatewords/3. State is in_space or in_word, Words is
    %% the number of words still allowed and Acc the reversed output. A word is
    %% counted when the separator that ends it is seen. When the count drops to
    %% zero the collected text is passed through trim_left_func/2 with
    %% iswordsep/1, reversed, and followed by Append.
    truncatewords(_S, _State, 0, Append, Acc) ->
        Trimmed = trim_left_func(Acc, fun iswordsep/1),
        lists:reverse(Trimmed, Append);
    truncatewords([], _State, _Words, _Append, Acc) ->
        lists:reverse(Acc);
    truncatewords([C|Rest], State, Words, Append, Acc)
      when State =:= in_space; State =:= in_word ->
        case iswordsep(C) of
            false ->
                truncatewords(Rest, in_word, Words, Append, [C|Acc]);
            true when State =:= in_word ->
                truncatewords(Rest, in_space, Words-1, Append, [C|Acc]);
            true ->
                truncatewords(Rest, in_space, Words, Append, [C|Acc])
        end.

    %% True for the characters that separate words: space, newline, carriage
    %% return, tab, comma, colon and semicolon.
    iswordsep(C) ->
        lists:member(C, [$\s, $\n, $\r, $\t, $,, $:, $;]).


%% @doc Split the binary into a list of line binaries. A line ends at \r\n,
%% a lone \r or a lone \n; the separators themselves are not part of the
%% result. The result always has at least one element, so the empty binary
%% gives a single empty line and a trailing separator gives a trailing
%% empty line.
split_lines(B) when is_binary(B) ->
	%% At a given position the longest separator wins, so \r\n counts once.
	binary:split(B, [<<13,10>>, <<13>>, <<10>>], [global]).


%% @doc Escape special characters for ical RFC2445 elements. Accepts an
%% iolist, a binary or an atom and returns a binary. Line breaks become a
%% backslash followed by n, a tab becomes a space, and double quote, comma,
%% semicolon and backslash get a backslash in front. A colon is wrapped in
%% double quotes. A CRLF followed by a space is inserted whenever 70 or more
%% bytes have been written since the previous fold.
escape_ical(L) when is_list(L) ->
	escape_ical(iolist_to_binary(L));
escape_ical(B) when is_binary(B) ->
	escape_ical(B, <<>>, 0);
escape_ical(A) when is_atom(A) ->
	escape_ical(atom_to_list(A)).

	%% In is the remaining input, Out the escaped output so far and Col the
	%% number of bytes written since the last fold.
	escape_ical(<<>>, Out, _Col) ->
		Out;
	escape_ical(In, Out, Col) when Col >= 70 ->
		escape_ical(In, <<Out/binary, 13, 10, 32>>, 0);
	escape_ical(<<13, 10, Rest/binary>>, Out, Col) ->
		escape_ical(Rest, <<Out/binary, "\\n">>, Col + 2);
	escape_ical(<<Ch, Rest/binary>>, Out, Col) ->
		Escaped = escape_ical_char(Ch),
		escape_ical(Rest, <<Out/binary, Escaped/binary>>, Col + byte_size(Escaped)).

	%% The escaped form of a single byte.
	escape_ical_char(10) -> <<"\\n">>;
	escape_ical_char(9) -> <<" ">>;
	escape_ical_char($") -> <<"\\\"">>;
	escape_ical_char($,) -> <<"\\,">>;
	escape_ical_char($:) -> <<"\":\"">>;
	escape_ical_char($;) -> <<"\\;">>;
	escape_ical_char($\\) -> <<"\\\\">>;
	escape_ical_char(Ch) -> <<Ch>>.

%% @doc Return true if Start is a prefix of String, false otherwise. Both
%% arguments may be binaries or iolists. The empty string is a prefix of
%% every string.
%% @spec starts_with(String, String) -> bool()
starts_with(Start, String) ->
    Prefix = iolist_to_binary(Start),
    Subject = iolist_to_binary(String),
    PrefixSize = byte_size(Prefix),
    byte_size(Subject) >= PrefixSize
        andalso binary_part(Subject, 0, PrefixSize) =:= Prefix.


%% @doc Return true iff String ends with End. Both arguments may be binaries
%% or iolists. The empty string is a suffix of every string.
%% @spec ends_with(String, String) -> bool()
ends_with(End, String) ->
    Suffix = iolist_to_binary(End),
    Subject = iolist_to_binary(String),
    SuffixSize = byte_size(Suffix),
    Skip = byte_size(Subject) - SuffixSize,
    Skip >= 0
        andalso binary_part(Subject, Skip, SuffixSize) =:= Suffix.


%% @doc Return true iff What is found somewhere in the string. Both arguments
%% may be binaries or iolists; iolists are flattened to a binary before the
%% search. The empty string is found in every string, the empty one included.
%% @spec contains(String, String) -> bool()
contains(What, B) when is_binary(What), is_binary(B) ->
    %% binary:match/2 rejects an empty pattern, so that case is answered here.
    What =:= <<>> orelse binary:match(B, What) =/= nomatch;
contains(What, String) ->
    contains(iolist_to_binary(What), iolist_to_binary(String)).
%% @doc Split a string on a separator string and return the list of parts,
%% see http://www.erlang.org/pipermail/erlang-questions/2008-October/038896.html
%% The work is handed to a specialised splitter depending on the length of
%% the separator: empty, a single character, or two or more characters.
%% @spec split(String, String) -> list()

split(String, [C1,C2|More]) when is_integer(C1), is_integer(C2) ->
     split2(String, C1, C2, More);
split(String, [Sep]) when is_integer(Sep) ->
     split1(String, Sep);
split(String, []) ->
     split0(String).

%% Split a string at the empty separator. The empty string is taken to sit
%% between each pair of adjacent characters, but not at the beginning or the
%% end, so the result is one single-character string per input character.

split0(String) ->
     [[C] || C <- String].

%% Split a string at a single character separator. The result always has one
%% more element than there are separators in the string, so leading, trailing
%% and adjacent separators give empty strings.

split1(String, Sep) ->
     NotSep = fun(Chr) -> Chr =/= Sep end,
     case lists:splitwith(NotSep, String) of
         {Field, []} -> [Field];
         {Field, [_Sep|Rest]} -> [Field | split1(Rest, Sep)]
     end.

%% Split a string at a multi-character separator. The separator arrives as
%% its first two characters C1 and C2 plus the list L of remaining characters.
%% Separators are found left to right and do not overlap.

split2(String, C1, C2, L) ->
     split2_loop(String, [C1,C2|L], "").

%% Rev holds the current field in reverse. At every position the complete
%% separator is tried as a prefix of the remaining input.
split2_loop([], _Sep, Rev) ->
     [lists:reverse(Rev)];
split2_loop([Chr|Tail] = String, Sep, Rev) ->
     case split_prefix(Sep, String) of
         no -> split2_loop(Tail, Sep, [Chr|Rev]);
         Rest -> [lists:reverse(Rev) | split2_loop(Rest, Sep, "")]
     end.

%% When the first list is a prefix of the second, return what follows the
%% prefix; otherwise return the atom no.
split_prefix([], S) -> S;
split_prefix([C|L], [C|S]) -> split_prefix(L, S);
split_prefix(_, _) -> no.

%% Minimal self check. Each line is an assertive match, so a failing
%% expectation crashes with badmatch; ok is returned when all of them hold.
%% Checks that to_lower/1 undoes to_upper/1 for the sample string, the
%% to_name/1 result for it, and first_char/1 on three strings.
%% NOTE(review): the non-ASCII literals and the "?" strings below look like
%% the result of an encoding conversion, so these checks presumably no longer
%% exercise multi-byte characters as intended -- confirm against the original.
test() ->
    A = "ьзgen",
    A = to_lower(to_upper(A)),
    "ucgen" = to_name(A),

    "a" = first_char("aap"),
    "?" = first_char("???xx"),
    "?" = first_char("?aap"),
    ok.