PageRenderTime 180ms CodeModel.GetById 12ms app.highlight 149ms RepoModel.GetById 1ms app.codeStats 0ms

/src/markdown/markdown.erl

https://code.google.com/p/zotonic/
Erlang | 1285 lines | 905 code | 140 blank | 240 comment | 11 complexity | 0e866a9d70f6b396392968721c4397fc MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1%%%-------------------------------------------------------------------
   2%%% @author    Gordon Guthrie
   3%%% @copyright (C) 2009, Gordon Guthrie
   4%%% @doc,
   5%%%
   6%%% @end
   7%%% Created : 10 Sep 2009 by gordonguthrie@backawinner.gg
   8%%%-------------------------------------------------------------------
   9
  10-module(markdown).
  11
  12-export([conv/1,
  13         conv_utf8/1,
  14         conv_file/2]).
  15
  16-import(lists, [flatten/1, reverse/1]).
  17
  18-include_lib("eunit/include/eunit.hrl").
  19
  20-define(SPACE, 32).
  21-define(TAB,    9).
  22-define(LF,    10).
  23-define(CR,    13).
  24-define(NBSP, 160).
  25-define(AMP, $&, $a, $m, $p, $;).
  26-define(COPY, $&, $c, $o, $p, $y, $;).
  27
  28%%% the lexer first lexes the input
  29%%% make_lines does 2 passes:
  30%%% * it chops the lexed strings into lines which it represents as a
  31%%%   list of lists
  32%%% * it then types the lines into the following:
  33%%% * normal lines
  34%%% * reference style links
  35%%% * reference style images
  36%%% * special line types
  37%%%   - blank
  38%%%   - SETEXT header lines
  39%%%   - ATX header lines
  40%%%   - blockquote
  41%%%   - unordered lists
  42%%%   - ordered lists
  43%%%   - code blocks
  44%%%   - horizontal rules
  45%%% the parser then does its magic interpolating the references as appropriate
  46conv(String) -> Lex = lex(String),
  47                % io:format("Lex is ~p~n", [Lex]),
  48                UntypedLines = make_lines(Lex),
  49                % io:format("UntypedLines are ~p~n", [UntypedLines]),
  50                {TypedLines, Refs} = type_lines(UntypedLines),
  51                % io:format("TypedLines are ~p~nRefs is ~p~n",
  52                %          [TypedLines, Refs]),
  53                parse(TypedLines, Refs).
  54
  55-spec conv_utf8(list()) -> list().
  56conv_utf8(Utf8) ->
  57    Str = xmerl_ucs:from_utf8(Utf8),
  58    Res = conv(Str),
  59    xmerl_ucs:to_utf8(Res).    
  60                
  61conv_file(FileIn, FileOut) ->
  62    case file:open(FileIn, [read]) of
  63        {ok, Device} -> Input = get_all_lines(Device,[]),
  64                        Output = conv(Input),
  65                        write(FileOut, Output);
  66        _            -> error
  67    end.
  68
  69get_all_lines(Device, Accum) ->
  70    case io:get_line(Device,"") of
  71        eof  -> file:close(Device),
  72                Accum;
  73        Line ->
  74            get_all_lines(Device,Accum ++ Line)
  75    end.
  76
  77write(File, Text) ->
  78    _Return=filelib:ensure_dir(File),
  79    case file:open(File, [write]) of
  80        {ok, Id} ->
  81            io:fwrite(Id, "~s~n", [Text]),
  82            file:close(Id);
  83        _ ->
  84            error
  85    end.
  86
  87
  88%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  89%%%
  90%%% Parse the lines interpolating the references as appropriate
  91%%%
  92%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  93
  94parse(TypedLines, Refs) ->
  95    string:strip(p1(TypedLines, Refs, 0, []), both, $\n).
  96
  97%% goes through the lines
  98%% Variable 'R' contains the References and 'I' is the indent level
  99
 100%% Terminal clause
%% p1(Lines, R, I, Acc) renders the list of typed lines.
%%   Lines - the typed lines still to be rendered
%%   R     - the references, handed on to the string-making helpers
%%   I     - the indent level, turned into leading padding by pad/1
%%   Acc   - the output fragments produced so far, newest first
%% The clause order matters: a clause that looks at two lines must come
%% before the clause that handles the first of those lines on its own.

%% Terminal clause: put the fragments in order and flatten them
p1([], _R, _I, Acc)    -> flatten(reverse(Acc));

%% Tags have the highest precedence...
%% (Acc is reversed at the end, so the fragments pushed in the first
%% branch come out as "<p>", the tag, "</p>")
p1([{tag, Tag} | T], R, I, Acc) ->
    case T of
        []                -> p1([], R, I,
                                ["</p>", make_tag_str(Tag, R), "<p>" | Acc]);
        [{blank, _} | T2] -> p1(T2, R, I,
                                [make_tag_str(Tag, R) | Acc]);
        _Other            -> p1(T, R, I,
                                [pad(I) ++ make_tag_str(Tag, R) | Acc])
    end;

%% an opening block tag grabs the following lines up to its closing tag
%% (see grab_for_blockhtml/3) and emits them as one string
p1([{blocktag, [{{{tag, open}, Type}, Tg}] = _Tag} | T], R, I, Acc) ->
    {Block, Rest} = grab_for_blockhtml(T, Type, []),
    Str = lists:flatten([Tg, "\n" | Block]),
    p1(Rest, R, I, [Str | Acc]);

%% blank lines/linefeeds are gobbled down
%% (a whole run of them produces a single padded "\n")
p1([{Type, _} | T], R, I, Acc)
  when Type == blank orelse Type == linefeed ->
    Rest = grab_empties(T),
    p1(Rest, R, I, [pad(I) ++ "\n" | Acc]);

%% two consecutive normal lines should be concatenated...
%% remembering to pad the second line with the indent...
p1([{normal, P1}, {normal, P2} | T], R, I, Acc) ->
    p1([{normal, merge(P1, pad(I), P2)} | T], R, I, Acc);
%% as should a normal and linefeed

%% setext h1 is a look behind and it overrides blockquote and code...
p1([{normal, P}, {setext_h1, _} | T], R, I, Acc) ->
    p1(T, R, I,  [pad(I) ++ "<h1>" ++ make_str(snip(P), R)
                        ++ "</h1>\n\n" | Acc]);
p1([{blockquote, P}, {setext_h1, _} | T], R, I, Acc) ->
    p1(T, R, I,  [pad(I) ++ "<h1>" ++ make_str(snip(P), R)
                        ++ "</h1>\n\n" | Acc]);
p1([{{codeblock, P}, _}, {setext_h1, _} | T], R, I, Acc) ->
    p1(T, R, I,  [pad(I) ++ "<h1>" ++ make_str(snip(P), R)
                        ++ "</h1>\n\n" | Acc]);
%% ...and a dashed underline does the same for blockquote and code,
%% producing an h2
p1([{blockquote, P}, {h2_or_hr, _} | T], R, I, Acc) ->
    p1(T, R, I,  [pad(I) ++ "<h2>" ++ make_str(snip(P), R)
                        ++ "</h2>\n\n" | Acc]);
p1([{{codeblock, P}, _}, {h2_or_hr, _} | T], R, I, Acc) ->
    p1(T, R, I,  [pad(I) ++ "<h2>" ++ make_str(snip(P), R)
                        ++ "</h2>\n\n" | Acc]);

%% but a setext with no lookbehind is just rendered as a normal line,
%% so change its type and rethrow it
p1([{setext_h1, P} | T], R, I, Acc) ->
    p1([{normal, P} | T], R, I, Acc);

%% setext h2 might be a look behind
%% (unlike the clauses above, the heading text has its surrounding
%% spaces stripped here)
p1([{normal, P}, {h2_or_hr, _} | T], R, I, Acc) ->
    P2 = string:strip(make_str(snip(P), R), both, ?SPACE),
    p1(T, R, I, [pad(I) ++ "<h2>" ++ P2 ++ "</h2>\n\n" | Acc]);

%% blockquotes swallow each other
%% replace the first blockquote mark with a space...
p1([{blockquote, P1}, {blockquote, [_ | P2]} | T], R, I, Acc) ->
    p1([{blockquote, merge(P1, pad(I), [{{ws, sp}, " "} | P2])} | T], R, I, Acc);
%% blockquotes swallow normal
p1([{blockquote, P1}, {normal, P2} | T], R, I, Acc) ->
    p1([{blockquote, merge(P1, pad(I + 1), P2)} | T], R, I, Acc);
%% blockquote
%% (the leading '>' token is asserted and dropped before rendering)
p1([{blockquote, P} | T], R, I, Acc) ->
    [{{md, gt}, _} | T1] = P,
    T2 = string:strip(make_str(T1, R)),
    p1(T, R, I,
       ["\n<blockquote>\n" ++ pad(I + 1) ++ "<p>" ++ T2 ++ "</p>\n</blockquote>" | Acc]);

%% one normal is just normal...
p1([{normal, P} | T], R, I, Acc) ->
    P2 = string:strip(make_str(snip(P), R), both, ?SPACE),
    p1(T, R, I, [pad(I) ++ "<p>" ++ P2 ++ "</p>\n" | Acc]);

%% atx headings
%% (the heading text only has its trailing blanks stripped)
p1([{{h1, P}, _} | T], R, I, Acc) ->
    NewP = string:strip(make_str(snip(P), R), right),
    p1(T, R, I,  [pad(I) ++ "<h1>" ++ NewP ++ "</h1>\n\n" | Acc]);
p1([{{h2, P}, _} | T], R, I, Acc) ->
    NewP = string:strip(make_str(snip(P), R), right),
    p1(T, R, I,  [pad(I) ++ "<h2>" ++ NewP ++ "</h2>\n\n" | Acc]);
p1([{{h3, P}, _} | T], R, I, Acc) ->
    NewP = string:strip(make_str(snip(P), R), right),
    p1(T, R, I,  [pad(I) ++ "<h3>" ++ NewP ++ "</h3>\n\n" | Acc]);
p1([{{h4, P}, _} | T], R, I, Acc) ->
    NewP = string:strip(make_str(snip(P), R), right),
    p1(T, R, I,  [pad(I) ++ "<h4>" ++ NewP ++ "</h4>\n\n" | Acc]);
p1([{{h5, P}, _} | T], R, I, Acc) ->
    NewP = string:strip(make_str(snip(P), R), right),
    p1(T, R, I,  [pad(I) ++ "<h5>" ++ NewP ++ "</h5>\n\n" | Acc]);
p1([{{h6, P}, _} | T], R, I, Acc) ->
    NewP = string:strip(make_str(snip(P), R), right),
    p1(T, R, I,  [pad(I) ++ "<h6>" ++ NewP ++ "</h6>\n\n" | Acc]);

%% unordered lists swallow normal and codeblock lines
%% NOTE(review): every other clause of this function matches normal lines
%% as {normal, P}; confirm that a {{normal, P}, S} line is ever produced,
%% otherwise the first clause below cannot match.
p1([{{ul, P1}, S1}, {{normal, P2}, S2} | T], R, I , Acc) ->
    p1([{{ul, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
p1([{{ul, P1}, S1}, {{codeblock, P2}, S2} | T], R, I , Acc) ->
    p1([{{ul, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
p1([{{ul, _P}, _} | _T] = List, R, I, Acc) ->
    {Rest, NewAcc} = parse_list(ul, List, R, I, [], false),
    p1(Rest, R, I,  [pad(I) ++ "<ul>\n" ++ NewAcc
                           ++ pad(I) ++ "</ul>\n" | Acc]);

%% ordered lists swallow normal and codeblock lines
%% NOTE(review): the same question about {{normal, P}, S} applies here.
p1([{{ol, P1}, S1}, {{normal, P2}, S2} | T], R, I , Acc) ->
    p1([{{ol, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
p1([{{ol, P1}, S1}, {{codeblock, P2}, S2} | T], R, I , Acc) ->
    p1([{{ol, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
p1([{{ol, _P}, _} | _T] = List, R, I, Acc) ->
    {Rest, NewAcc} = parse_list(ol, List, R, I, [], false),
    p1(Rest, R, I,  [pad(I) ++ "<ol>\n" ++ NewAcc
                           ++ pad(I) ++ "</ol>\n" | Acc]);

%% codeblock consumes any following empty lines
%% and other codeblocks
p1([{{codeblock, P1}, S1}, {{codeblock, P2}, S2} | T], R, I, Acc) ->
    p1([{{codeblock, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
p1([{{codeblock, P}, _} | T], R, I, Acc) ->
    Rest = grab_empties(T),
    p1(Rest, R, I,  ["<pre><code>" ++ make_str(snip(P), R)
                     ++ "\n</code></pre>\n\n" | Acc]);

%% horizontal rules
p1([{hr, _} | T], R, I, Acc) ->
    p1(T, R, I,  ["<hr />" | Acc]);
%% h2_or_hr is greedy for normal lines
%% (its tokens are glued onto the front of the following normal line)
p1([{h2_or_hr, P1}, {normal, P2} | T], R, I, Acc) ->
    p1([{normal, flatten([P1 | P2])} | T], R, I, Acc);
%% the clause with a normal before an 'h2_or_hr' has already been
%% handled further up the tree, so this is a bona fide 'hr'...
p1([{h2_or_hr, _} | T], R, I, Acc) ->
    p1(T, R, I,  ["<hr />" | Acc]);

%% Now start pulling out inline refs etc, etc
%% (reference definition lines produce no output of their own)
p1([{inlineref, _P} | T], R, I, Acc) ->
    p1(T, R, I, Acc).
 240
 241grab_for_blockhtml([], Type, Acc) ->
 242    {lists:reverse(["</" ++ Type ++ ">" | Acc]), []};
 243grab_for_blockhtml([{blocktag, [{{{tag, close}, Type}, Tg}]}
 244                    | T], Type,  Acc) ->
 245    {lists:reverse([Tg | Acc]), T};
 246grab_for_blockhtml([{blocktag, [{{{tag, _}, GrabType}, Tg}]}
 247                    | T], Type,  Acc) when GrabType =/= Type ->
 248    % blocktags grabbed in a blocktag need a line ending pushed
 249    grab_for_blockhtml(T, Type, ["\n", Tg | Acc]);
 250grab_for_blockhtml([{tag, {{{tag, self_closing}, _Ty}, Tg}}
 251                    | T], Type, Acc) ->
 252    grab_for_blockhtml(T, Type, [Tg | Acc]);
 253grab_for_blockhtml([H | T], Type, Acc) ->
 254    {_Type, Content} = H,
 255    Str = make_plain_str(Content),
 256    grab_for_blockhtml(T, Type, [Str | Acc]).
 257
 258grab_empties([{linefeed, _} | T]) -> grab_empties(T);
 259grab_empties([{blank, _} | T])    -> grab_empties(T);
 260grab_empties(List)                -> List.
 261
 262merge(P1, Pad, P2) ->
 263    NewP1 = make_br(P1),
 264    flatten([NewP1, {string, Pad} | P2]).
 265
 266make_br(List) -> make_br1(reverse(List)).
 267
 268make_br1([{{lf, _}, _},
 269          {{ws, comp}, _} | T]) -> reverse([{tags, " <br />\n"} | T]);
 270make_br1([{{lf, _}, _},
 271          {{ws, tab}, _} | T])  -> reverse([{tags, " <br />\n"} | T]);
 272make_br1(List)                  -> reverse(List).
 273
 274pad(N) -> pad1(N, []).
 275
 276pad1(0, Acc)            -> Acc;
 277pad1(N, Acc) when N > 0 -> pad1(N - 1, ["  " | Acc]). 
 278
%% this is a bit messy because of the way that hard lines are treated...
%% If your li's have a blank line between them the item gets wrapped in a para,
%% if not, they don't
%% BUT if one item is <p> wrapped then the next is too
%%
%% parse_list(Type, Lines, R, I, A, Wrap) renders the run of list items of
%% the given Type (ul or ol) at the head of Lines and returns
%% {RemainingLines, ItemsHtml}.
%%   R    - the references
%%   I    - the indent level used to pad each item
%%   A    - the rendered items so far, newest first
%%   Wrap - whether the next item is to be wrapped in <p></p>
parse_list(_Type, [], _R, _I, A, _) ->
    {[], reverse(A)};
parse_list(Type, [{{Type, P}, _} | T], R, I, A, Wrap) ->
    % grab/4 collects the lines that belong to this item
    {Rest, NewP, NewWrap} = grab(T, R, [], Wrap),
    Li = case NewWrap of
             false -> Ret = parse([{normal, P}], R),
                      % need to strip off the extra <p></p>'s
                      % (the last 4 characters, then the first 3)
                      Ret2 = string:left(Ret, length(Ret) - 4),
                      Ret3 = string:right(Ret2, length(Ret2) -3),
                      Ret3 ++ "\n" ++ NewP ++ pad(I);
             true  -> string:strip(parse([{normal, P}], R), right, ?LF)
                          ++ NewP ++ pad(I)
         end,
    % the next item is wrapped when this item is directly followed by a
    % linefeed line
    NewWrap2 = case T of
                   []         -> false; % doesn't matter
                   [H2 | _T2] -> case H2 of
                                     {linefeed, _} -> true;
                                     _             -> false
                                 end
               end,
    parse_list(Type, Rest, R, I, [pad(I) ++ "<li>"
                                  ++ string:strip(Li, right, ?LF)
                                  ++ "</li>\n" | A], NewWrap2);
%% anything that is not an item of this Type ends the list
parse_list(_Type, List, _R, _I, A, _) ->
    {List, reverse(A)}.
 308
%% grab grabs normals, double codeblocks, linefeeds and blanks
%% BUT stops grabbing if a normal is preceded by a linefeed or blank
%% UNLESS the normal starts with white space :(
%% the third return parameter is 'true' if the 'li' should be
%% wrapped in '<p></p>' and false if it shouldn't
%%
%% grab(Lines, R, Acc, W) returns {RemainingLines, Grabbed, Wrap} where
%% Grabbed is the list of collected fragments in order.
grab([{{codeblock, _}, S} | T] = List, R, Acc, W) ->
    case is_blockquote(S, T) of
        % an indented blockquote: the fragments are pushed in reverse
        % because Acc is reversed when it is returned
        {{true, R1}, T2}       -> grab(T2, R,
                                       ["</blockquote>",
                                        make_esc_str(R1, R),
                                        "<blockquote>" | Acc], W);
        % an escaped '>' stops the grabbing
        {{esc_false, R1}, _T2} -> {R1, reverse(Acc), false};
        {false, T2}            ->
            case is_double_indent(S) of
                false      ->
                    % not part of the item: hand the whole list back
                    {List, reverse(Acc), false};
                {true, R2} ->
                    % if it is a double indent - delete 4 spaces
                    % no, it makes no sense to me either :(
                    grab(T2, R, ["    " ++ make_esc_str(R2, R) | Acc], W)
            end
    end;
%% linefeeds and blanks are handed to grab2/6 together with the current
%% list and accumulator, so that it can hand them back untouched; a "\n"
%% is only kept when the item is already being wrapped
grab([{linefeed, _} | T], R, Acc, false) ->
    grab2(T, R, Acc, T, Acc, true);
grab([{linefeed, _} | T], R, Acc, true) ->
    grab2(T, R, ["\n" | Acc], T, Acc, true);
grab([{blank, _} | T], R, Acc, false) ->
    grab2(T, R, Acc, T, Acc, true);
grab([{blank, _} | T], R, Acc, true) ->
    grab2(T, R, ["\n" | Acc], T, Acc, true);
%% a normal line is always grabbed, wrapped in <p></p> when W is true
grab([{normal, P} | T], R, Acc, W) ->
     Li = case W of
              false -> make_esc_str(P, R);
              true  -> "<p>"++ string:strip(make_esc_str(P, R), right, ?LF)
                           ++ "</p>"
          end,
     grab(T, R, [Li | Acc], W);
%% anything else ends the grabbing
grab(List, _R, Acc, W) ->
    {List, reverse(Acc), W}.
 348
 349%% the problem is knowing when to grab, if the list is followed by a long
 350%% string of blank lines and linefeeds and a normal then the linefeeds aren't
 351%% grabbed
 352%% if the list if followed by blank lines and linefeeds and a normal with an
 353%% initial whitespace it is grabbed...
 354grab2([{normal, P2} | T], R, Acc, LO, AO, W) ->
 355    case P2 of
 356        [{{ws, _}, _} | T2] ->
 357            Li = case W of
 358                     false -> make_esc_str(T2, R);
 359                     true  -> "<p>" ++
 360                                  string:strip(make_esc_str(T2, R), right, ?LF)
 361                                  ++ "</p>"
 362                 end,
 363            grab(T, R, [Li | Acc], W);
 364        _ ->
 365            {LO, AO, false}
 366    end;
 367grab2([{linefeed, _} | T], R, Acc, LO, AO, _W) ->
 368    grab2(T, R, ["\n" | Acc], LO, AO, true);
 369grab2([{blank, _} | T], R, Acc, LO, AO, _W) ->
 370    grab2(T, R, ["\n" | Acc], LO, AO, true);
 371%% We dont want to grab this stuff so return the old list and the old acc
 372grab2(_List, _R, _Acc, LO, AO, _W) ->
 373    {LO, AO, true}.
 374
 375is_double_indent(List) -> is_double_indent1(List, 0).
 376
 377%% double indent is any combination of tabs and spaces that add
 378%% up to 8
 379is_double_indent1([], _N)                  -> false;
 380is_double_indent1(Rest, N) when N > 7      -> {true, Rest};
 381is_double_indent1([{{ws, sp}, _} | T], N)  -> is_double_indent1(T, N + 1);
 382is_double_indent1([{{ws, tab}, _} | T], N) -> is_double_indent1(T, N + 4);
 383is_double_indent1(_List, _N)               -> false.
 384
 385is_blockquote(List, T) ->
 386    case is_bq1(List, 0) of
 387        false          -> {false, T};
 388        {esc_false, R} -> {{esc_false, R}, T};
 389        {true, R}      -> {NewT, NewR} = grab2(T, R),
 390                          {{true, NewR}, NewT}
 391    end.
 392
 393is_bq1([], _N)                            -> false;
 394is_bq1([{{ws, sp}, _} | T], N)            -> is_bq1(T, N + 1);
 395is_bq1([{{ws, tab}, _} | T], N)           -> is_bq1(T, N + 4);
 396is_bq1([{{md, gt}, _},
 397        {{ws, _}, _} | T], N) when N > 3  -> {true, T};
 398is_bq1([{{punc, bslash}, _},
 399        {{md, gt}, GT},
 400        {{ws, _}, WS} | T], N) when N > 3 -> {esc_false, [GT, WS | T]};
 401is_bq1(_List, _N)                         -> false.
 402
 403grab2(List, R) -> gb2(List, reverse(R)).
 404
 405gb2([], Acc)               -> {[], flatten(reverse(Acc))};
 406gb2([{blank, _} | T], Acc) -> {T, flatten(reverse(Acc))};
 407gb2([{_Type, P} | T], Acc) -> gb2(T, [P | Acc]).
 408
 409%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 410%%%
 411%%% Make the lines from the raw tokens
 412%%%
 413%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 414make_lines(Tokens) -> ml1(Tokens, [], []).
 415
 416ml1([], [], A2)                -> reverse(A2);
 417ml1([], A1, A2)                -> ml1([], [], [reverse(A1) | A2]);
 418ml1([{{lf, _}, _} = H | T], A1, A2) -> ml1(T, [], [ml2(H, A1) | A2]);
 419ml1([H | T], A1, A2)                -> ml1(T, [H | A1], A2).
 420
 421ml2(H, List) -> reverse([H | List]).
 422
 423%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 424%%%
 425%%% Process the lines and give each line a type. The valid types are:
 426%%% * normal line
 427%%% * reference style links
 428%%% * reference style images
 429%%% * special line types
 430%%%   - blank
 431%%%   - SETEXT header lines
 432%%%   - ATX header lines
 433%%%   - unordered lists (including code blocks)
 434%%%   - ordered lists (including code blocks)
 435%%%   - blockquotes
 436%%%   - code blocks
 437%%%   - horizontal rules
 438%%%
 439%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 440type_lines(Lines) ->
 441    {Refs, TypedLines} = t_l1(Lines, [], []),
 442    % io:format("TypedLines before stripping ~p~n", [TypedLines]),
 443    {strip_lines(TypedLines), Refs}.
 444
 445t_l1([], A1, A2) -> {A1, reverse(A2)};
 446%% this clause extracts URL and Image refs
 447%% (it is the only one that uses A1 and A2...
 448%% inlines can have up to 3 spaces before it
 449t_l1([[{{ws, sp}, _},
 450       {{inline, open}, _} | T1] = H | T2], A1, A2) ->
 451    t_inline(H, T1, T2, A1, A2);
 452t_l1([[{{ws, tab}, _},
 453       {{inline, open}, _} | T1] = H | T2], A1, A2) ->
 454    t_inline(H, T1, T2, A1, A2);
 455t_l1([[{{ws, comp}, W},
 456       {{inline, open}, _} | T1] = H | T2], A1, A2) ->
 457    case gt(W, 3) of
 458        {true, _R} -> t_inline(H, T1, T2, A1, A2);
 459        false      -> t_l1(T1, A1, [{normal , H} | A2]) % same exit at the final clause!
 460    end,
 461    t_inline(H, T1, T2, A1, A2);
 462t_l1([[{{inline, open}, _} | T1] = H | T2], A1, A2) ->
 463    t_inline(H, T1, T2, A1, A2);
 464
 465%% types setext lines
 466t_l1([[{{md, eq}, _} | _T] = H | T], A1, A2) ->
 467    t_l1(T, A1, [type_setext_h1(H) | A2]);
 468%% NOTE 1: generates a ul as the default not a normal line
 469%% NOTE 2: depending on the context this might generate an <h2> header
 470%%         or an <hr />
 471%% NOTE 3: space - is typed to a bullet down in <ul> land...
 472t_l1([[{{md, dash}, _} | _T] = H | T], A1, A2) ->
 473    t_l1(T, A1, [type_setext_h2(H) | A2]);
 474
 475%% types atx lines
 476t_l1([[{{md, atx}, _} | _T] = H | T], A1, A2) ->
 477    t_l1(T, A1, [type_atx(H) | A2]);
 478
 479%% types blockquotes
 480%% a blockquote on its own or followed by a linefeed is
 481%% displayed 'as is' by showdown
 482t_l1([[{{md, gt}, _} | []] = H | T], A1, A2) ->
 483    t_l1(T, A1, [{normal, H} | A2]);
 484t_l1([[{{md, gt}, _}, {{lf, _}, _} | []] = H | T], A1, A2) ->
 485    t_l1(T, A1, [{normal, H} | A2]);
 486%% one with anything after it starts a blockquote
 487t_l1([[{{md, gt}, _} | _T1] = H | T], A1, A2) ->
 488    t_l1(T, A1, [{blockquote, H} | A2]);
 489
 490%% types unordered lists lines
 491%% NOTE 1: the dashed version is generated in type_setext_h2
 492%% NOTE 2: the asterix version also might generate a horizontal rule
 493%%         which is why it jumps to type_star2 <-- note the 2!!
 494t_l1([[{{ws, _}, _}, {{md, star}, _} = ST1,
 495       {{ws, _}, _} = WS1 | T1] = H | T], A1, A2) ->
 496    t_l1(T, A1, [{type_star2([ST1, WS1 | T1]), H} | A2]);
 497t_l1([[{{md, star}, _}, {{ws, _}, _} | _T1] = H | T], A1, A2) ->
 498    t_l1(T, A1, [{type_star2(H), H} | A2]);
 499t_l1([[{{ws, _}, _}, {{md, plus}, _},
 500       {{ws, _}, _} = W | T1] = H | T], A1, A2) ->
 501    t_l1(T, A1, [{{ul, make_list_str([W | T1])}, H} | A2]);
 502t_l1([[{{md, plus}, _}, {{ws, _}, _} = W | T1] = H | T], A1, A2) ->
 503    t_l1(T, A1, [{{ul, make_list_str([W | T1])}, H} | A2]);
 504%% UL based on dashes
 505t_l1([[{{ws, _}, _}, {{md, dash}, _},
 506       {{ws, _}, _} = W | T1] = H | T], A1, A2) ->
 507    t_l1(T, A1, [{{ul, make_list_str([W | T1])}, H} | A2]);
 508
 509%% types ordered lists...
 510t_l1([[{{ws, _}, _}, {num, _} = N1| T1] | T], A1, A2) ->
 511    t_l1(T, A1, [type_ol([N1 | T1]) | A2]);
 512t_l1([[{num, _} | _T] = H | T], A1, A2) ->
 513    t_l1(T, A1, [type_ol(H) | A2]);
 514
 515%% types horizontal rules for stars and underscores
 516%% dashes and some stars are done elsewhere...
 517t_l1([[{{md, underscore}, _} | _T1] = H | T], A1, A2) ->
 518    t_l1(T, A1, [type_underscore(H) | A2]);
 519t_l1([[{{md, star}, _} | _T1] = H | T], A1, A2) ->
 520    t_l1(T, A1, [type_star(H) | A2]);
 521
 522%% Block level tags - these are look ahead they must be
 523%% on a single line (ie directly followed by a lf and nothing else
 524t_l1([[{{{tag, _Type}, Tag}, _ } = H | T1] = List | T], A1, A2) ->
 525    case is_blank(T1) of
 526        false -> t_l1(T, A1, [{normal , List} | A2]);
 527        true  -> case is_block_tag(Tag) of
 528                     true  -> t_l1(T, A1, [{blocktag , [H]} | A2]);
 529                     false -> t_l1(T, A1, [{tag, [H | T1]} | A2])
 530                 end
 531    end;
 532
 533%% types a blank line or a code block
 534t_l1([[{{lf, _}, _}| []]  = H | T], A1, A2) ->
 535    t_l1(T, A1, [{linefeed, H} | A2]);
 536t_l1([[{{ws, _}, _} | _T1] = H | T], A1, A2) ->
 537    t_l1(T, A1, [type_ws(H) | A2]);
 538
 539%% Final clause...
 540t_l1([H | T], A1, A2) ->
 541    t_l1(T, A1, [{normal , H} | A2]).
 542
 543t_inline(H, T1, T2, A1, A2) ->
 544    case snip_ref(T1) of
 545        {Type, {Id, {Url, Title}}} -> t_l1(T2, flatten([{Id, {Url, Title}} | A1]),
 546                                           [{Type, H} | A2]);
 547        normal                     -> t_l1(T2, A1, [{normal, H} | A2])
 548    end.
 549
 550%% strips blanks from the beginning and end
 551strip_lines(List) -> reverse(strip_l1(reverse(strip_l1(List)))).
 552
 553strip_l1([{linefeed, _} | T]) -> strip_l1(T);
 554strip_l1([{blank, _} | T])    -> strip_l1(T);
 555strip_l1(List)                -> List.
 556
 557%%
 558%% Loads of type rules...
 559%%
 560
 561is_blank([])                  -> true;
 562is_blank([{{lf, _}, _} | []]) -> true;
 563is_blank([{{ws, _}, _} | T])  -> is_blank(T);
 564is_blank(_List)               -> false.
 565
 566is_block_tag("address")    -> true;
 567is_block_tag("blockquote") -> true;
 568is_block_tag("center")     -> true;
 569is_block_tag("dir")        -> true;
 570is_block_tag("div")        -> true;
 571is_block_tag("dl")         -> true;
 572is_block_tag("fieldset")   -> true;
 573is_block_tag("form")       -> true;
 574is_block_tag("h1")         -> true;
 575is_block_tag("h2")         -> true;
 576is_block_tag("h3")         -> true;
 577is_block_tag("h4")         -> true;
 578is_block_tag("h5")         -> true;
 579is_block_tag("h6")         -> true;
 580is_block_tag("hr")         -> true;
 581is_block_tag("isindex")    -> true;
 582is_block_tag("menu")       -> true;
 583is_block_tag("noframes")   -> true;
 584is_block_tag("noscript")   -> true;
 585is_block_tag("ol")         -> true;
 586is_block_tag("p")          -> true;
 587is_block_tag("pre")        -> true;
 588is_block_tag("table")      -> true;
 589is_block_tag("thead")      -> true;
 590is_block_tag("tbody")      -> true;
 591is_block_tag("tr")         -> true;
 592is_block_tag("td")         -> true;
 593is_block_tag("ul")         -> true;
 594is_block_tag(_Other)       -> false.
 595
 596type_underscore(List) ->
 597    case type_underscore1(trim_right(List)) of
 598        hr    -> {hr, List};
 599        maybe -> {type_underscore2(List), List}
 600    end.
 601
 602type_underscore1([])                          -> hr;
 603type_underscore1([{{md, underscore}, _} | T]) -> type_underscore1(T);
 604type_underscore1(_List)                       -> maybe.
 605
 606type_underscore2(List) ->
 607    case trim_right(List) of % be permissive of trailing spaces
 608        [{{md, underscore}, _}, {{ws, _}, _},
 609         {{md, underscore}, _}, {{ws, _}, _},
 610         {{md, underscore}, _}]               -> hr;
 611        _Other                                -> normal
 612    end.
 613
 614type_star(List) ->
 615    Trim = trim_right(List),
 616    case type_star1(Trim) of % be permssive of trailing spaces
 617        hr    -> {hr, trim_right(Trim)};
 618        maybe -> Type = type_star2(List),
 619                 % if it is a normal line we prepend it with a special
 620                 % non-space filling white space character
 621                 case Type of
 622                     normal -> {normal, [{{ws, none}, none} | List]};
 623                     _      -> {Type, List}
 624                 end
 625    end.
 626
 627type_star1([])                    -> hr;
 628type_star1([{{md, star}, _} | T]) -> type_star1(T);
 629type_star1(_List)                 -> maybe.
 630
 631type_star2(List) ->
 632    case trim_right(List) of
 633        [{{md, star}, _}, {{ws, _}, _},
 634         {{md, star}, _}, {{ws, _}, _},
 635         {{md, star}, _}]                -> hr;
 636        _Other ->
 637            case List of
 638                [{{md, star}, _},
 639                 {{ws, _}, _}= WS | T] -> {ul, make_list_str([WS | T])};
 640                _Other2                -> normal
 641            end
 642    end.
 643
 644type_ol(List) ->
 645    case type_ol1(List, []) of
 646        normal            -> {normal, List};
 647        {ol, Str}         -> {{ol, Str}, List};
 648        {esc_normal, Str} -> {normal, Str}
 649    end.
 650
 651
 652%% this line terminates on an escaped fullstop after a number
 653%% (but you need to drop the bslash...)
 654type_ol1([{num, _} = N,
 655          {{punc, bslash}, _},
 656          {{punc, fullstop}, _} = P | T], Acc) ->
 657    {esc_normal, flatten([reverse(Acc), N, P | T])};
 658%% we accumulate the digits in case we need to escape a full stop in a normal line
 659type_ol1([{num, _} = H | T], Acc)  -> type_ol1(T, [H | Acc]);
 660type_ol1([{{punc, fullstop}, _},
 661          {{ws, _}, _} | T], _Acc) -> {ol, T};
 662type_ol1(_List, _Acc)              -> normal.
 663
%% You need to understand what this function is trying to do...
%% '### blah' is fine
%% '### blah ###' is reduced to '### blah' because trailing #'s are
%% just for show but...
%% '##' is like applying '#' to '#' <-- applying 1 less styling to a single #
%% and '###' is like applying '##' to '#' etc, etc
%% but after you hit 6 #'s you just get this for a single hash
%% ie '#############' is like applying '######' to a single '#'
%% but/and '######## blah' is like applying '######' to '## blah'
%% strip trailing #'s as they are decorative only...
 674type_atx(List) ->
 675    {Sz, R} = get_atx_size(List),
 676    A = [{{md, atx}, "#"}],
 677    Type =
 678        case is_all_hashes(R) of
 679            true  ->
 680                if
 681                    Sz == 1 ->
 682                        normal; 
 683                    ((Sz > 1) andalso (Sz < 6)) ->
 684                        Ns = integer_to_list(Sz - 1),
 685                        Hn = list_to_atom("h" ++ Ns),
 686                        {Hn, A};
 687                    ((Sz == 6) andalso (R == [])) ->
 688                        {h5, A};
 689                    ((Sz == 6) andalso (R == [{{lf, lf}, "\n"}])) ->
 690                        {h5, A};
 691                    ((Sz == 6) andalso (R == [{{lf, crlf}, "\r\n"}])) ->
 692                        {h5, A};
 693                    ((Sz == 6) andalso (R =/= [])) ->
 694                        {h6, A}
 695                end;
 696            false ->
 697                Ns = integer_to_list(Sz),
 698                Hn = list_to_atom("h" ++ Ns),
 699                {Hn, strip_atx(R)}
 700        end,
 701    {Type, List}.
 702
 703is_all_hashes([])                   -> true;
 704is_all_hashes([{{md, atx}, _} | T]) -> is_all_hashes(T);
 705is_all_hashes([{{lf, _}, _} | []])  -> true;
 706is_all_hashes(_List)                -> false.
 707
 708get_atx_size(List) -> g_atx_size1(List, 0).
 709
 710% this function also strips whitespace to the left...
 711g_atx_size1([{{md, atx}, _} = A | T], N) when N == 6 -> {6, [A | T]};
 712g_atx_size1([{{md, atx}, _} | T], N)                 -> g_atx_size1(T, N + 1);
 713g_atx_size1([{{ws, _}, _} | T], N)                   -> g_atx_size1(T, N);
 714g_atx_size1(List, N)                                 -> {N, List}.
 715
 716strip_atx(List) -> reverse(s_atx1(reverse(List))).
 717
 718s_atx1([{{lf, _}, _}, {{md, atx}, _} | T]) -> s_atx1(T);
 719s_atx1([{{md, atx}, _} | T])               -> s_atx1(T);
 720s_atx1(List)                               -> List.
 721
 722type_setext_h1(List) -> type_s_h1_1(List, []).
 723
 724%% terminates on running out or new line
 725type_s_h1_1([{{lf, _}, _} = L | []], Acc) -> {setext_h1, reverse([L | Acc])};
 726type_s_h1_1([], Acc)                      -> {setext_h1, reverse(Acc)};
 727type_s_h1_1([[] | T], Acc)                -> type_s_h1_1(T, Acc);
 728type_s_h1_1([{{md, eq}, _} = H | T], Acc) -> type_s_h1_1(T, [H | Acc]);
 729type_s_h1_1(L, Acc)                       ->  {normal, flatten([Acc | L])}.
 730
 %% Types a line that starts with a dash. Returns {Type, List} where Type is
 %% h2_or_hr (only dashes), hr ("- - -"), {ul, Tokens} (dash followed by
 %% whitespace) or normal. The original token list is always returned.
 type_setext_h2(List) ->
     Type = case type_s_h2_1(List) of
                h2_or_hr -> h2_or_hr;
                not_h2   -> type_s_h2_2(trim_right(List))
            end,
     {Type, List}.

 %% A line is h2_or_hr when, ignoring empty fragments, it holds only dash
 %% tokens followed by nothing or by a single final line feed.
 type_s_h2_1(Tokens) ->
     Skippable = fun([])              -> true;
                    ({{md, dash}, _}) -> true;
                    (_)               -> false
                 end,
     case lists:dropwhile(Skippable, Tokens) of
         []             -> h2_or_hr;
         [{{lf, _}, _}] -> h2_or_hr;
         _Other         -> not_h2
     end.

 %% Distinguishes a spaced rule of exactly three dashes from a list item.
 type_s_h2_2([{{md, dash}, _}, {{ws, _}, _},
              {{md, dash}, _}, {{ws, _}, _},
              {{md, dash}, _}]) ->
     hr;
 type_s_h2_2([{{md, dash}, _}, {{ws, _}, _} = Ws | Rest]) ->
     {ul, make_list_str([Ws | Rest])};
 type_s_h2_2(_Tokens) ->
     normal.
 749
 %% Types a line that starts with whitespace. Returns {Type, List} where
 %% Type is blank, normal or {codeblock, Tokens}.
 type_ws(List) ->
     Type = case type_ws1(List) of
                blank ->
                    blank;
                try_codeblock ->
                    case type_ws2(List) of
                        normal                -> normal;
                        {codeblock, _} = Code -> Code
                    end
            end,
     {Type, List}.

 %% A line is blank when, ignoring empty fragments, it holds only whitespace
 %% tokens followed by nothing or by a single final line feed.
 type_ws1(Tokens) ->
     Skippable = fun([])           -> true;
                    ({{ws, _}, _}) -> true;
                    (_)            -> false
                 end,
     case lists:dropwhile(Skippable, Tokens) of
         []             -> blank;
         [{{lf, _}, _}] -> blank;
         _Other         -> try_codeblock
     end.

 %% 4 or more spaces takes you over the limit (a tab is 4...).
 %% A leading tab is consumed whole; a composite whitespace token is handed
 %% to gt/2, which returns whatever whitespace is left after the first four
 %% columns; a single space is never enough.
 type_ws2([{{ws, sp}, _} | _Rest]) ->
     normal;
 type_ws2([{{ws, tab}, _} | Rest]) ->
     {codeblock, Rest};
 type_ws2([{{ws, comp}, Ws} | Rest]) ->
     case gt(Ws, 4) of
         false             -> normal;
         {true, Remainder} -> {codeblock, [Remainder | Rest]}
     end.
 774
 %% Checks whether a whitespace string is at least Len columns wide once
 %% every tab has been expanded to four spaces.
 %%
 %% Returns false when it is too short, otherwise {true, {{ws, sp}, Rest}}
 %% where Rest is the expanded whitespace left after the first Len columns.
 %%
 %% The `global` option is required: without it re:replace/4 only expands
 %% the first tab, leaving later tabs as single characters in Rest.
 gt(String, Len) ->
     Expanded = re:replace(String, "\t", "    ", [global, {return, list}]),
     case length(Expanded) >= Len of
         true  -> {true, {{ws, sp}, lists:nthtail(Len, Expanded)}};
         false -> false
     end.
 784
 %% Turns a token list into a list of output fragments, one per token.
 %% Tag tokens contribute their original text untouched; every other token
 %% is rendered on its own through make_str/2 using the references R.
 make_tag_str(L, R) -> make_tag1(L, R, []).

 %% Acc holds already rendered fragments, most recent first.
 make_tag1(Tokens, R, Acc) ->
     Render = fun({{{tag, _Type}, _Tag}, Raw}) -> Raw;
                 (Token)                       -> make_str([Token], R)
              end,
     lists:reverse(Acc, lists:map(Render, Tokens)).
 793
 %% Replaces every non-breaking space character in String with a plain
 %% space; all other characters are kept as they are.
 esc_tag(String) -> esc_t1(String, []).

 %% Acc holds already processed characters, most recent first.
 esc_t1(Chars, Acc) ->
     Converted = [case C of
                      ?NBSP -> ?SPACE;
                      _     -> C
                  end || C <- Chars],
     lists:reverse(Acc, Converted).
 799                  
 %% Builds the content of a list item from the tokens that follow the list
 %% marker. The leading whitespace token is discarded; when
 %% is_double_indent/1 reports a double indent, the content it returns is
 %% wrapped in <pre><code> tags instead.
 make_list_str([{{ws, _}, _} | Rest] = Line) ->
     case is_double_indent(Line) of
         {true, Code} ->
             lists:flatten([{tags, "<pre><code>"},
                            Code,
                            {tags, "</code></pre>\n\n"}]);
         false ->
             Rest
     end.
 807
 %% All ref processing can ignore the original values 'cos those
 %% have already been captured at a higher level
 %%
 %% Tries to read a reference definition ("id]: url title") from a token
 %% list. Returns {inlineref, {Id, {Url, Title}}} on success, with Url
 %% trimmed and html-encoded, or `normal` when the line is not a reference
 %% definition.
 %%
 %% An id that lexes into anything other than exactly one token (for
 %% instance an id containing spaces) is also reported as `normal`, so that
 %% such input falls back to ordinary text instead of raising case_clause.
 snip_ref(List) ->
     case get_id(List) of
         {[{_, Id}], Rest} ->
             {_Rest2, Ref, Title} = parse_inline(Rest),
             Url = htmlencode(make_plain_str(trim(Ref))),
             {inlineref, {Id, {Url, make_plain_str(Title)}}};
         normal ->
             normal;
         {_UnsupportedId, _Rest} ->
             normal
     end.
 819
 %% Splits a token list at the first "]: " sequence (inline close, colon,
 %% whitespace). Returns {TokensBefore, TokensAfter}, or `normal` when the
 %% sequence never occurs.
 get_id(Tokens) -> g_id1(Tokens, []).

 %% Seen holds the tokens walked over so far, most recent first.
 g_id1([{{inline, close}, _}, {{punc, colon}, _}, {{ws, _}, _} | After],
       Seen) ->
     {lists:reverse(Seen), After};
 g_id1([Token | Rest], Seen) ->
     g_id1(Rest, [Token | Seen]);
 g_id1([], _Seen) ->
     normal.
 827
 %% Parses the "(url title)" part of an inline link or image, starting just
 %% after the opening bracket. Returns {RestTokens, UrlTokens, TitleTokens}.
 parse_inline(Tokens) -> p_in1(Tokens, []).

 %% Collects the url tokens in A (most recent first) until a title
 %% delimiter, the closing bracket or the end of input is reached.
 %%
 %% running out of input, with or without a final line feed, ends the url
 p_in1([], A) ->
     {[], lists:reverse(A), []};
 p_in1([{{lf, _}, _}], A) ->
     {[], lists:reverse(A), []};
 %% a backslash escapes brackets and quotes: the backslash is dropped and
 %% the escaped token becomes part of the url
 p_in1([{{punc, bslash}, _}, {Kind, _} = Escaped | T], A)
   when Kind =:= bra; Kind =:= ket;
        Kind =:= {punc, doubleq}; Kind =:= {punc, singleq} ->
     p_in1(T, [Escaped | A]);
 %% an unescaped quote or opening bracket starts the title
 p_in1([{{punc, Quote}, _} | T], A)
   when Quote =:= doubleq; Quote =:= singleq ->
     p_in2(T, lists:reverse(A), Quote, []);
 p_in1([{bra, _} | T], A) ->
     p_in2(T, lists:reverse(A), brackets, []);
 %% the closing bracket ends an inline that has no title
 p_in1([{ket, _} | T], A) ->
     {T, lists:reverse(A), []};
 p_in1([Token | T], A) ->
     p_in1(T, [Token | A]).
 848
 %% Collects the title of an inline link/image, delimited by single quotes,
 %% double quotes or brackets. The delimiter type is passed in as 'D'; once
 %% the closing delimiter has been seen D becomes `none`.
 %%
 %% Returns {RestTokens, Url, TitleTokens}. A holds the title tokens, most
 %% recent first, and must stay a flat reversed list: wrapping it (as the
 %% discard clause used to do with `[A]`) defeats the final reverse and
 %% yields the title tokens in reverse order.
 p_in2([], Url, _D, A) ->
     {[], Url, lists:flatten(lists:reverse(A))};
 %% brackets and quotes can be escaped: the backslash is dropped and the
 %% escaped token is kept as part of the title
 p_in2([{{punc, bslash}, _}, {Kind, _} = Escaped | T], Url, D, A)
   when Kind =:= bra; Kind =:= ket;
        Kind =:= {punc, doubleq}; Kind =:= {punc, singleq} ->
     p_in2(T, Url, D, [Escaped | A]);
 %% these clauses capture the end of the title and drop the delimiter...
 p_in2([{{punc, doubleq}, _} | T], Url, doubleq, A) ->
     p_in2(T, Url, none, A);
 p_in2([{{punc, singleq}, _} | T], Url, singleq, A) ->
     p_in2(T, Url, none, A);
 p_in2([{ket, _} | T], Url, brackets, A) ->
     p_in2(T, Url, none, A);
 %% terminator clause
 p_in2([{ket, _} | T], Url, none, A) ->
     {T, Url, lists:flatten(lists:reverse(A))};
 %% this clause silently discards stuff after the delimiter...
 p_in2([_Discarded | T], Url, none, A) ->
     p_in2(T, Url, none, A);
 p_in2([Token | T], Url, D, A) ->
     p_in2(T, Url, D, [Token | A]).
 871
 %% Strips whitespace tokens (and empty fragments) from both ends of a
 %% token list.
 trim(Tokens) ->
     RightTrimmed = trim_right(Tokens),
     trim_left(RightTrimmed).

 %% Strips whitespace tokens (and empty fragments) from the end only.
 trim_right(Tokens) ->
     lists:reverse(trim_left(lists:reverse(Tokens))).

 %% Strips whitespace tokens (and empty fragments) from the start only.
 trim_left(Tokens) ->
     IsPadding = fun({{ws, _}, _}) -> true;
                    ([])           -> true;
                    (_)            -> false
                 end,
     lists:dropwhile(IsPadding, Tokens).
 879
 %% Removes a final line-feed token from a token list, if there is one.
 snip([]) ->
     [];
 snip(Tokens) ->
     case lists:last(Tokens) of
         {{lf, _}, _} -> lists:sublist(Tokens, length(Tokens) - 1);
         _NotLf       -> Tokens
     end.
 885
 886%% end of ref processing
 887
 888%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 889%%%
 890%%% Build the Lexed Token List
 891%%% This is a two part lexer, first it chunks the input and then on the second
 892%%% pass it gathers it into lines and types the lines
 893%%%
 894%%% NOTE that there are two different styles of processing lines:
 895%%% * markdown transformed
 896%%% * block
 897%%% inside block processing the whole text is dumped and just url encoded
 898%%% and the original text is always maintained during the lexing/parsing
 899%%% so that it can be recreated if the context requires it...
 900%%%
 901%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 902
 %% Lexes a string into tokens with l1/3, then merges runs of adjacent
 %% whitespace tokens with merge_ws/1.
 lex(String) ->
     RawTokens = l1(String, [], []),
     merge_ws(RawTokens).
 904
 %% Collapses every run of adjacent whitespace tokens into one composite
 %% token {{ws, comp}, Text} whose text is the concatenation of the run.
 %% A whitespace token with no whitespace neighbour is left unchanged.
 merge_ws(Tokens) -> m_ws1(Tokens, []).

 %% Done holds the finished tokens, most recent first. A merged pair is put
 %% back at the head of the input so that it can absorb the next whitespace
 %% token as well.
 m_ws1([{{ws, _}, First}, {{ws, _}, Second} | Rest], Done) ->
     Merged = {{ws, comp}, First ++ Second},
     m_ws1([Merged | Rest], Done);
 m_ws1([Token | Rest], Done) ->
     m_ws1(Rest, [Token | Done]);
 m_ws1([], Done) ->
     lists:reverse(Done).
 911
 %% First-pass lexer. Walks the input one character at a time.
 %%   A1 - characters of the plain-text fragment being collected,
 %%        most recent first
 %%   A2 - tokens produced so far, most recent first
 %% Whenever a special character is met, the pending fragment is closed with
 %% l2/1 and pushed onto A2 followed by the token for that character.
 %%
 %% end of input: flush the pending fragment, then return the tokens
 l1([], [], A2) ->
     lists:flatten(lists:reverse(A2));
 l1([], A1, A2) ->
     l1([], [], [l2(A1) | A2]);
 %% these two heads capture closing and opening tags
 l1([$<, $/ | T], A1, A2) ->
     {Tag, NewT} = closingdiv(T, []),
     l1(NewT, [], [Tag, l2(A1) | A2]);
 l1([$< | T], A1, A2) ->
     {Tag, NewT} = openingdiv(T),
     l1(NewT, [], [Tag, l2(A1) | A2]);
 %% CR LF is the only two-character token
 l1([?CR, ?LF | T], A1, A2) ->
     l1(T, [], [{{lf, crlf}, [?CR, ?LF]}, l2(A1) | A2]);
 %% single characters: either a token of their own or part of a fragment.
 %% A lone CR has no token and is therefore collected as ordinary text.
 l1([Char | T], A1, A2) ->
     case l1_token(Char) of
         none  -> l1(T, [Char | A1], A2);
         Token -> l1(T, [], [Token, l2(A1) | A2])
     end.

 %% Maps a single character to its token, or `none` for ordinary text.
 %%
 %% note there is a special 'whitespace' {{ws, none}, ""} which is used to
 %% generate non-space filling whitespace for cases like '*bob* is great';
 %% that 'character' doesn't exist so isn't in the lexer but appears in the
 %% parser
 l1_token($=)  -> {{md, eq}, "="};
 l1_token($-)  -> {{md, dash}, "-"};
 l1_token($#)  -> {{md, atx}, "#"};
 l1_token($>)  -> {{md, gt}, ">"};
 l1_token($+)  -> {{md, plus}, "+"};
 l1_token($*)  -> {{md, star}, "*"};
 l1_token($_)  -> {{md, underscore}, "_"};
 l1_token(Digit) when is_integer(Digit), Digit >= $0, Digit =< $9 ->
     {num, [Digit]};
 l1_token($.)  -> {{punc, fullstop}, "."};
 l1_token($:)  -> {{punc, colon}, ":"};
 l1_token($')  -> {{punc, singleq}, "'"}; %'
 l1_token($")  -> {{punc, doubleq}, "\""}; %"
 l1_token($`)  -> {{punc, backtick}, "`"};
 l1_token($!)  -> {{punc, bang}, "!"};
 l1_token($\\) -> {{punc, bslash}, "\\"};
 l1_token($/)  -> {{punc, fslash}, "/"};
 l1_token($()  -> {bra, "("};
 l1_token($))  -> {ket, ")"};
 l1_token($[)  -> {{inline, open}, "["};
 l1_token($])  -> {{inline, close}, "]"};
 l1_token(?SPACE) -> {{ws, sp}, " "};
 l1_token(?TAB)   -> {{ws, tab}, "\t"};
 l1_token(?NBSP)  -> {{ws, sp}, "&nbsp"};
 l1_token(?LF)    -> {{lf, lf}, [?LF]};
 l1_token(_Other) -> none.
 962
 %% Closes a pending text fragment. The fragment arrives most recent
 %% character first; an empty fragment yields [] instead of a token.
 l2(Fragment) ->
     case Fragment of
         [] -> [];
         _  -> {string, lists:flatten(lists:reverse(Fragment))}
     end.
 965
 %% Lexes whatever follows a '<'. The text is tried, in this order, as an
 %% automatic url link, as an automatic e-mail link and finally as an
 %% opening tag. Returns {Token, RestOfInput}.
 openingdiv(String) ->
     case get_url(String) of
         not_url ->
             case get_email_addie(String) of
                 not_email               -> openingdiv1(String, []);
                 {{email, _}, _} = Email -> Email
             end;
         {{url, _}, _} = Url ->
             Url
     end.
 976
 %% Lexes the text after a '<' as an opening or self-closing tag.
 %% Acc holds the characters read so far, most recent first.
 %%
 %% Returns {Token, RestOfInput}. Token is a tag token
 %% {{{tag, open | self_closing}, LowerCaseName}, OriginalText}, or a list
 %% of ordinary tokens when the text is not a tag:
 %%   * the input ends before any '>' is found, or
 %%   * there is no tag name between '<' and '>' (nothing at all, or only
 %%     spaces, as in "<>", "< >" or "< />"). string:tokens/2 returns [] for
 %%     such text, so it must not be matched against [Tag | _].
 openingdiv1([], Acc) ->
     {lists:flatten([{{punc, bra}, "<"} | lex(lists:reverse(Acc))]), []};
 openingdiv1([$/, $> | T], Acc) ->
     Raw = lists:flatten(lists:reverse(Acc)),
     case string:tokens(string:to_lower(Raw), " ") of
         [Tag | _] ->
             {{{{tag, self_closing}, Tag}, "<" ++ Raw ++ "/>"}, T};
         [] ->
             {lists:flatten([{{punc, bra}, "<"}, lex(Raw),
                             {{punc, fslash}, "/"}, {{punc, ket}, ">"}]), T}
     end;
 openingdiv1([$> | T], Acc) ->
     Raw = lists:flatten(lists:reverse(Acc)),
     case string:tokens(string:to_lower(Raw), " ") of
         [Tag | _] ->
             {{{{tag, open}, Tag}, "<" ++ Raw ++ ">"}, T};
         [] ->
             %% special for non-tags
             {lists:flatten([{{punc, bra}, "<"}, lex(Raw),
                             {{punc, ket}, ">"}]), T}
     end;
 openingdiv1([H | T], Acc) ->
     openingdiv1(T, [H | Acc]).
 994
 %% Lexes the text after a "</" as a closing tag.
 %% Acc holds the characters read so far, most recent first.
 %%
 %% Returns {Token, RestOfInput}. Token is a tag token
 %% {{{tag, close}, LowerCaseName}, OriginalText}, or a list of ordinary
 %% tokens when the text is not a closing tag:
 %%   * the input ends before any '>' is found, or
 %%   * there is no tag name before the '>' (as in "</>" or "</ >").
 %%     string:tokens/2 returns [] for such text, so it must not be matched
 %%     against [Tag | _].
 closingdiv([], Acc) ->
     {lists:flatten([{{punc, bra}, "<"},
                     {{punc, fslash}, "/"}
                     | lex(lists:reverse(Acc))]), []};
 closingdiv([$> | T], Acc) ->
     Raw = lists:flatten(lists:reverse(Acc)),
     case string:tokens(string:to_lower(Raw), " ") of
         [Tag | _] ->
             {{{{tag, close}, Tag}, "</" ++ Raw ++ ">"}, T};
         [] ->
             {lists:flatten([{{punc, bra}, "<"}, {{punc, fslash}, "/"},
                             lex(Raw), {{punc, ket}, ">"}]), T}
     end;
 closingdiv([H | T], Acc) ->
     closingdiv(T, [H | Acc]).
1005
%% Recognises an automatic link: text that starts with "http://" or
%% "https://" (in any letter case) and runs up to the closing '>'.
%% Returns {{url, Url}, RestOfInput} or not_url.
%%
%% The scheme is matched with an optional single "s"; the previous pattern
%% used "(S|s)*" and therefore also accepted "httpss://" and the like.
get_url(String) ->
    case re:run(String, "^https?://", [caseless]) of
        nomatch    -> not_url;
        {match, _} -> get_url1(String, [])
    end.

%% Collects the url characters (most recent first in Acc) up to the first
%% unescaped '>'. When the input ends first, everything read is the url.
get_url1([], Acc) ->
    {{url, lists:flatten(lists:reverse(Acc))}, []};
%% allow escaped kets: both the backslash and the '>' stay in the url
get_url1([$\\, $> | T], Acc) ->
    get_url1(T, [$>, $\\ | Acc]);
get_url1([$> | T], Acc) ->
    {{url, lists:flatten(lists:reverse(Acc))}, T};
get_url1([H | T], Acc) ->
    get_url1(T, [H | Acc]).
1019
%% Recognises an automatic e-mail link: the text up to the first '>' must
%% be an e-mail address in its entirety.
%% Returns {{email, Address}, RestOfInput} or not_email.
%%
%% Differences from the previous pattern, all of them fixes:
%%   * dots are written "\\." - in an Erlang string "\." is just ".", which
%%     made every "dot" match any character;
%%   * the pattern is anchored at both ends, so a tag that merely contains
%%     an address (e.g. <a href="mailto:x@y.com">) is no longer an e-mail;
%%   * matching is case-insensitive and the top-level domain is any run of
%%     two or more letters, because the anchored pattern can no longer
%%     accept longer domains by matching only a prefix of them.
get_email_addie(String) ->
    case lists:splitwith(fun(C) -> C =/= $> end, String) of
        {_NoKet, []} ->
            not_email;
        {Possible, [$> | T]} ->
            EMail_regex = "^[a-z0-9!#$%&'*+/=?^_`{|}~-]+"
                ++ "(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*"
                ++ "@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+"
                ++ "[a-z]{2,}\\z",
            case re:run(Possible, EMail_regex, [caseless, {capture, none}]) of
                nomatch -> not_email;
                match   -> {{email, Possible}, T}
            end
    end.
1036
1037%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1038%%%
1039%%% Internal functions
1040%%%
1041%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Concatenates the original text of every token into one flat string.
%% The parser-generated token {{ws, none}, none} counts as a single space.
make_plain_str(List) -> m_plain(List, []).

%% Acc holds the text pieces collected so far, most recent first.
m_plain(Tokens, Acc) ->
    TextOf = fun({{ws, none}, none}) -> " ";
                ({_Type, Str})       -> Str
             end,
    lists:flatten(lists:reverse(Acc, lists:map(TextOf, Tokens))).
1047
%% Renders a token list token by token: {tags, _} tokens are passed through
%% unchanged, every other token is rendered on its own with make_str/2.
%% The result is flattened.
make_esc_str(List, Refs) -> m_esc(List, Refs, []).

%% A holds the pieces rendered so far, most recent first.
m_esc(Tokens, R, A) ->
    Render = fun({tags, _} = Tag) -> Tag;
                (Token)           -> make_str([Token], R)
             end,
    lists:flatten(lists:reverse(A, lists:map(Render, Tokens))).
1053
1054    
%% Renders a token list to a string. Inline links and images are resolved
%% (using the reference definitions in Refs), automatic links are turned
%% into anchors, lexed tags are written out escaped, and the assembled
%% output is finally passed through htmlchars/1.
make_str(List, Refs) -> m_str1(List, Refs, []).

%% A holds the output pieces, most recent first, so multi-part output is
%% pushed in reverse order.
m_str1([], _R, A) ->
    htmlchars(lists:flatten(lists:reverse(A)));
%% "![" may start an inline image
m_str1([{{punc, bang}, B}, {{inline, open}, O} | T], R, A) ->
    case get_inline(T, R, [], img) of
        {Rest, {Url, Title, Acc}} ->
            Tag = [make_img_tag(Url, Acc, Title)],
            m_str1(Rest, R, [Tag | A]);
        {Rest, Tag} ->
            %% not an image after all: put the "![" back in front of the text
            m_str1(Rest, R, [Tag, O, B | A])
    end;
%% escape inline open's...
m_str1([{{punc, bslash}, _}, {{inline, open}, O} | T], R, A) ->
    m_str1(T, R, [O | A]);
%% "[" may start an inline link
m_str1([{{inline, open}, O} | T], R, A) ->
    case get_inline(T, R, [], url) of
        {Rest, {Url, Title, Acc}} ->
            Tit = case Title of
                      [] -> [];
                      _  -> " title=\"" ++ Title ++ "\""
                  end,
            Tag = [{tags, "<a href=\"" ++ Url ++ "\"" ++ Tit ++ ">"},
                   Acc,
                   {tags, "</a>"}],
            m_str1(Rest, R, [Tag | A]);
        {Rest, Tag} ->
            %% not a link after all: put the "[" back in front of the text
            m_str1(Rest, R, [Tag, O | A])
    end;
%% automatic e-mail link: a complete anchor with the address as its text,
%% mirroring the url clause below (it used to be emitted as a self-closed
%% <a ... /> with no link text)
m_str1([{email, Addie} | T], R, A) ->
    m_str1(T, R, [{tags, "</a>"}, Addie, {tags, "\">"}, Addie,
                  {tags, "<a href=\"mailto:"} | A]);
%% automatic url link
m_str1([{url, Url} | T], R, A) ->
    m_str1(T, R, [{tags, "</a>"}, Url, {tags, "\">"}, Url,
                  {tags, "<a href=\""} | A]);
m_str1([{tags, _} = Tag | T], R, A) ->
    m_str1(T, R, [Tag | A]);
%% lexed tags are written out escaped, using the tag name only
m_str1([{{{tag, Type}, Tag}, _} | T], R, A) ->
    Tag2 = esc_tag(Tag),
    TagStr = case Type of
                 open         -> {tags, "&lt;"  ++ Tag2 ++ "&gt;"};
                 close        -> {tags, "&lt;/" ++ Tag2 ++ "&gt;"};
                 self_closing -> {tags, "&lt;"  ++ Tag2 ++ " /&gt;"}
             end,
    m_str1(T, R, [TagStr | A]);
%% every other token contributes its original text
m_str1([{_, Orig} | T], R, A) ->
    m_str1(T, R, [Orig | A]).
1100
%% Reads the remainder of an inline link or image, starting just after the
%% opening "[" (or "![").
%%   R    - the reference definitions, a list of {Id, {Url, Title}}
%%   A    - the tokens of the link text read so far, most recent first
%%   Type - url | img
%% Returns {RestTokens, {Url, Title, Text}} when an inline is recognised and
%% {RestTokens, PlainText} when it is not.
%%
%% if the inline doesn't terminate its not an inline...
get_inline([], _R, A, _) ->
    {[], make_plain_str(lists:reverse(A))};
%% a url can contain an image inline
get_inline([{{punc, bang}, _B}, {{inline, open}, _O} | T], R, A, url) ->
    case get_inline(T, R, A, img) of
        {Rest, {Url, Title, Acc}} ->
            Tag = make_img_tag(Url, Acc, Title),
            %% We double tag the tag so that it can get through the
            %% flatteners..
            get_inline(Rest, R, [{tags, Tag} | A], url);
        {_Rest, _Text} = NotAnImage ->
            %% the nested image did not resolve (unterminated, or an unknown
            %% id): report the whole inline as plain text instead of
            %% failing with a badmatch
            NotAnImage
    end;
get_inline([{{inline, close}, _}, {bra, _} | T], _R, A, _) ->
    {Rest, Url, Title} = parse_inline(T),
    Tag = {string:strip(make_plain_str(Url)),
           make_plain_str(Title),
           make_plain_str(lists:reverse(A))},
    {Rest, Tag};
%% for img's but not url's you need to allow a single space between them
%% to be compatible with showdown :(
get_inline([{{inline, close}, _}, {{ws, sp}, _}, {bra, _} | T], _R, A, img) ->
    {Rest, Url, Title} = parse_inline(T),
    Tag = {string:strip(make_plain_str(Url)),
           make_plain_str(Title),
           make_plain_str(lists:reverse(A))},
    {Rest, Tag};
%% this clause detects references to images/links...
get_inline([{{inline, close}, _}, {{inline, open}, _} | T], R, A, _) ->
    Text = make_plain_str(lists:reverse(A)),
    case get_id_diff(T) of
        normal ->
            {[], Text};
        {[{_, Id}], Rest} ->
            %% an unknown id yields an empty url and title
            {Url, Title} = case lists:keyfind(Id, 1, R) of
                               false          -> {"", ""};
                               {Id, {U, Tit}} -> {U, Tit}
                           end,
            {Rest, {Url, Title, Text}};
        _Other ->
            %% random failing id's
            {[], Text}
    end;
%% so does this one - just delete the space and rethrow it
get_inline([{{inline, close}, _} = C, {{ws, _}, _},
            {{inline, open}, _} = O | T], R, A, Type) ->
    get_inline([C, O | T], R, A, Type);
%% this is the markdown extension clause that takes an id in square brackets
%% without any additional stuff as a valid id marker
get_inline([{{inline, close}, _} | T], R, A, _) ->
    Id = make_plain_str(lists:reverse(A)),
    case lists:keyfind(Id, 1, R) of
        false              -> {T, lists:flatten([Id, $]])};
        {Id, {Url, Title}} -> {T, {Url, Title, Id}}
    end;
get_inline([H | T], R, A, Type) ->
    get_inline(T, R, [H | A], Type).
1153
%% Splits a token list at the first inline-close token ("]"). Returns
%% {TokensBefore, TokensAfter} with the "]" itself dropped, or `normal`
%% when there is no such token.
get_id_diff(List) -> g_id_diff1(List, []).

%% Acc holds tokens that precede List, most recent first.
g_id_diff1(List, Acc) ->
    NotClose = fun({{inline, close}, _}) -> false;
                  (_)                    -> true
               end,
    case lists:splitwith(NotClose, List) of
        {_All, []}                 -> normal;
        {Before, [_Close | After]} -> {lists:reverse(Acc, Before), After}
    end.
1159
%% Replaces the characters & < > and the non-breaking space (160) with
%% their html entities; every other character is kept unchanged.
htmlencode(List) ->
    htmlencode(List, []).

%% Acc holds already encoded pieces, most recent first.
htmlencode(Chars, Acc) ->
    Encode = fun($&)    -> "&amp;";
                ($<)    -> "&lt;";
                ($>)    -> "&gt;";
                (160)   -> "&nbsp;";
                (Other) -> Other
             end,
    lists:flatten(lists:reverse(Acc, lists:map(Encode, Chars))).
1172
1173htmlchars(List) -> htmlchars1(List, []).
1174 
1175htmlchars1([], Acc) -> flatten(reverse(Acc));
1176%% tags are just wheeched out unescaped
1177htmlchars1([{tags, Tag} | T], Acc)   -> htmlchars1(T, [Tag | Acc]);
1178%% line ends are pushed to a space..
1179htmlchars1([?CR, ?LF | T], Acc)      -> htmlchars1(T, ["\n" | Acc]);
1180htmlchars1([?LF | T], Acc)           -> htmlchars1(T, ["\n" | Acc]);
1181htmlchars1([?CR | T], Acc)           -> htmlchars1(T, ["\r" | Acc]);
1182%% emphasis is a bit strange - must be preceeded by or followed by
1183%% white space to work and can also be escaped
1184%% there is a non-space filling white space represented by the atom 'none'
1185%% which is created in the parser (NOT IN THE LEXER!) and which triggers
1186%% emphasis or strong tags being turned on...
1187htmlchars1([$\\, $*, $*, $* | T], A) -> htmlchars1(T, [$*, $*, $* | A]);
1188htmlchars1([$*, $*, $* | T], A)      -> {T2, NewA} = superstrong(T, $*),
1189                                        htmlchars1(T2, [NewA | A]);
1190% repeat for strong
1191htmlchars1([$\\, $*, $* | T], A)     -> htmlchars1(T, [$*, $* | A]);
1192htmlchars1([$*, $* | T], A)          -> {T2, NewA} = strong(T, $*),
1193                                        htmlchars1(T2, [NewA | A]);
1194%% likewise for strong
1195htmlchars1([$\\, $* | T], A)         -> htmlchars1(T, [$* | A]);
1196htmlchars1([$* | T], A)              -> {T2, NewA} = emphasis(T, $*),
1197                                        htmlchars1(T2, [NewA | A]);
1198%% and again for underscores
1199htmlchars1([$\\, $_, $_, $_ | T], A) -> htmlchars1(T, [$_, $_, $_ | A]);
1200%% the none atom is the non-space filling whitespace 
1201htmlchars1([$_, $_, $_ | T], A)      -> {T2, NewA} = superstrong(T, $_),
1202                                        htmlchars1(T2, [NewA | A]);
1203% and strong
1204%% and again for underscores
1205htmlchars1([$\\, $_, $_ | T], A)     -> htmlchars1(T, [$_, $…

Large files files are truncated, but you can click here to view the full file