PageRenderTime 164ms CodeModel.GetById 57ms app.highlight 89ms RepoModel.GetById 1ms app.codeStats 1ms

/lib/xmerl/src/xmerl_scan.erl

http://github.com/gebi/jungerl
Erlang | 3778 lines | 2842 code | 378 blank | 558 comment | 32 complexity | 573894cf37ee0fe26e2519ee216f684f MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1%%% The contents of this file are subject to the Erlang Public License,
   2%%% Version 1.0, (the "License"); you may not use this file except in
   3%%% compliance with the License. You may obtain a copy of the License at
   4%%% http://www.erlang.org/license/EPL1_0.txt
   5%%%
   6%%% Software distributed under the License is distributed on an "AS IS"
   7%%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
   8%%% the License for the specific language governing rights and limitations
   9%%% under the License.
  10%%%
  11%%% The Original Code is xmerl-0.15
  12%%%
  13%%% The Initial Developer of the Original Code is Ericsson Telecom
  14%%% AB. Portions created by Ericsson are Copyright (C), 1998, Ericsson
  15%%% Telecom AB. All Rights Reserved.
  16%%%
  17%%% Contributor(s):
  18%%%    Mickael Remond <mickael.remond@IDEALX.com>:
  19%%%    Johan Blom <johan.blom@mobilearts.se>
  20%%%    Richard Carlsson
  21%%%    Fredrik Linder
  22%%%
  23%%%----------------------------------------------------------------------
  24%%% #0.    BASIC INFORMATION
  25%%%----------------------------------------------------------------------
  26%%% File:       xmerl_scan.erl
  27%%% Author       : Ulf Wiger <ulf.wiger@ericsson.com>
  28%%% Description  : Single-pass XML scanner. See xmerl.hrl for data defs.
  29%%% 
  30%%% Modules used : ets, file, filename, io, lists, ucs, uri
  31%%% 
  32%%%----------------------------------------------------------------------
  33%% @doc 
  34%%     The XML parser is activated through 
  35%%     <tt>xmerl_scan:string/[1,2]</tt> or 
  36%%     <tt>xmerl_scan:file/[1,2]</tt>.
  37%%     It returns records of the type defined in xmerl.hrl.
  38%% See also <a href="xmerl_examples.html">tutorial</a> on customization
  39%% functions.
  40%% @type global_state(). <p>
  41%% The global state of the scanner, represented by the #xmerl_scanner{} record.
  42%% </p>
  43%% @type option_list(). <p>Options allow customizing the behaviour of the
  44%%     scanner.
  45%% See also <a href="xmerl_examples.html">tutorial</a> on customization
  46%% functions.
  47%% </p>
  48%% Possible options are:
  49%% <dl>
  50%%  <dt><code>{acc_fun, Fun}</code></dt>
  51%%    <dd>Call back function to accumulate contents of entity.</dd>
  52%%  <dt><code>{continuation_fun, Fun} |
  53%%            {continuation_fun, Fun, ContinuationState}</code></dt>
  54%%    <dd>Call back function to decide what to do if the scanner runs into eof
  55%%     before the document is complete.</dd>
  56%%  <dt><code>{event_fun, Fun} |
  57%%            {event_fun, Fun, EventState}</code></dt>
  58%%    <dd>Call back function to handle scanner events.</dd>
  59%%  <dt><code>{fetch_fun, Fun} |
  60%%            {fetch_fun, Fun, FetchState}</code></dt>
  61%%    <dd>Call back function to fetch an external resource.</dd>
  62%%  <dt><code>{hook_fun, Fun} |
  63%%            {hook_fun, Fun, HookState}</code></dt>
  64%%    <dd>Call back function to process the document entities once
  65%%     identified.</dd>
  66%%  <dt><code>{close_fun, Fun}</code></dt>
  67%%    <dd>Called when document has been completely parsed.</dd>
  68%%  <dt><code>{rules, ReadFun, WriteFun, RulesState} |
  69%%            {rules, Rules}</code></dt>
  70%%    <dd>Handles storing of scanner information when parsing.</dd>
  71%%  <dt><code>{user_state, UserState}</code></dt>
  72%%    <dd>Global state variable accessible from all customization functions</dd>
  73%%
  74%%  <dt><code>{fetch_path, PathList}</code></dt>
  75%%    <dd>PathList is a list of
  76%%     directories to search when fetching files. If the file in question
  77%%     is not in the fetch_path, the URI will be used as a file
  78%%     name.</dd>
  79%%  <dt><code>{space, Flag}</code></dt>
  80%%    <dd>'preserve' (default) to preserve spaces, 'normalize' to
  81%%    accumulate consecutive whitespace and replace it with one space.</dd>
  82%%  <dt><code>{line, Line}</code></dt>
  83%%    <dd>To specify starting line for scanning in document which contains
  84%%    fragments of XML.</dd>
  85%%  <dt><code>{namespace_conformant, Flag}</code></dt>
  86%%    <dd>Controls whether to behave as a namespace conformant XML parser,
  87%%    'false' (default) to not otherwise 'true'.</dd>
  88%%  <dt><code>{validation, Flag}</code></dt>
  89%%    <dd>Controls whether to process as a validating XML parser,
  90%%    'false' (default) to not otherwise 'true'.</dd>
  91%%  <dt><code>{quiet, Flag}</code></dt>
  92%%    <dd>Set to 'true' if xmerl should behave quietly and not output any info
  93%%    to standard output (default 'false').</dd>
  94%%  <dt><code>{doctype_DTD, DTD}</code></dt>
  95%%    <dd>Allows to specify DTD name when it isn't available in the XML
  96%%    document.</dd>
  97%%  <dt><code>{xmlbase, Dir}</code></dt>
  98%%    <dd>XML Base directory. If using string/1 default is current directory.
  99%%    If using file/1 default is directory of given file.</dd>
 100%%  <dt><code>{encoding, Enc}</code></dt>
 101%%    <dd>Set default character set used (default UTF-8).
 102%%    This character set is used only if not explicitly given by the XML
 103%%    declaration. </dd>
 104%% </dl>
 105%% @end 
 106%% Only used internally are:
 107%%  <dt><code>{environment,Env}</code></dt>
 108%%   <dd>What is this?</dd>
 109%% <dt><code>{text_decl,Bool}</code></dt>
 110%%   <dd>What is this?</dd>
 111
 112-module(xmerl_scan).
 113-vsn('0.19').
 114-date('03-09-16').
 115
 116
 117%% main API
 118-export([string/1, string/2,
 119	 file/1, file/2]).
 120
 121%% access functions for various states
 122-export([user_state/1, user_state/2,
 123	 event_state/1, event_state/2,
 124	 hook_state/1, hook_state/2,
 125	 rules_state/1, rules_state/2,
 126	 fetch_state/1, fetch_state/2,
 127	 cont_state/1, cont_state/2]).
 128
 129%% helper functions. To xmerl_lib ??
 130-export([accumulate_whitespace/4]).
 131
 132%-define(debug, 1).
 133-include("xmerl.hrl").		% record def, macros
 134-include_lib("kernel/include/file.hrl").
 135
 136
 137-define(fatal(Reason, S),
 138	if
 139	    S#xmerl_scanner.quiet ->
 140		ok;
 141	    true ->
 142		ok=io:format("~p- fatal: ~p~n", [?LINE, Reason])
 143	end,
 144	fatal(Reason, S)).
 145
 146
 147-define(ustate(U, S), S#xmerl_scanner{user_state = U}).
 148
 149
 150%% Functions to access the various states
 151
 152%%% @spec user_state(S::global_state()) -> global_state()
 153%%% @equiv user_state(UserState,S)
 154user_state(#xmerl_scanner{user_state = S}) -> S.
 155
 156%%% @spec event_state(S::global_state()) -> global_state()
 157%%% @equiv event_state(EventState,S)
 158event_state(#xmerl_scanner{fun_states = #xmerl_fun_states{event = S}}) -> S.
 159
 160%%% @spec hook_state(S::global_state()) -> global_state()
 161%%% @equiv hook_state(HookState,S)
 162hook_state(#xmerl_scanner{fun_states = #xmerl_fun_states{hook = S}}) -> S.
 163
 164%%% @spec rules_state(S::global_state()) -> global_state()
 165%%% @equiv rules_state(RulesState,S)
 166rules_state(#xmerl_scanner{fun_states = #xmerl_fun_states{rules = S}}) -> S.
 167
 168%%% @spec fetch_state(S::global_state()) -> global_state()
 169%%% @equiv fetch_state(FetchState,S)
 170fetch_state(#xmerl_scanner{fun_states = #xmerl_fun_states{fetch = S}}) -> S.
 171
 172%%% @spec cont_state(S::global_state()) -> global_state()
 173%%% @equiv cont_state(ContinuationState,S)
 174cont_state(#xmerl_scanner{fun_states = #xmerl_fun_states{cont = S}}) -> S.
 175
 176
 177%%%% Functions to modify the various states
 178
 179%%% @spec user_state(UserState, S::global_state()) -> global_state()
 180%%% @doc For controlling the UserState, to be used in a user function.
 181%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
 182user_state(X, S) ->
 183    S#xmerl_scanner{user_state = X}.
 184
 185%%% @spec event_state(EventState, S::global_state()) -> global_state()
 186%%% @doc For controlling the EventState, to be used in an event
 187%%% function, and called at the beginning and at the end of a parsed entity.
 188%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
 189event_state(X, S=#xmerl_scanner{fun_states = FS}) ->
 190    FS1 = FS#xmerl_fun_states{event = X},
 191    S#xmerl_scanner{fun_states = FS1}.
 192
 193%%% @spec hook_state(HookState, S::global_state()) -> global_state()
 194%%% @doc For controlling the HookState, to be used in a hook
 195%%% function, and called when the parser has parsed a complete entity.
 196%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
 197hook_state(X, S=#xmerl_scanner{fun_states = FS}) ->
 198    FS1 = FS#xmerl_fun_states{hook = X},
 199    S#xmerl_scanner{fun_states = FS1}.
 200
 201%%% @spec rules_state(RulesState, S::global_state()) -> global_state()
 202%%% @doc For controlling the RulesState, to be used in a rules
 203%%% function, and called when the parser store scanner information in a rules
 204%%% database.
 205%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
 206rules_state(X, S=#xmerl_scanner{fun_states = FS}) ->
 207    FS1 = FS#xmerl_fun_states{rules = X},
 208    S#xmerl_scanner{fun_states = FS1}.
 209
 210%%% @spec fetch_state(FetchState, S::global_state()) -> global_state()
 211%%% @doc For controlling the FetchState, to be used in a fetch
 212%%% function, and called when the parser fetch an external resource (eg. a DTD).
 213%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
 214fetch_state(X, S=#xmerl_scanner{fun_states = FS}) ->
 215    FS1 = FS#xmerl_fun_states{fetch = X},
 216    S#xmerl_scanner{fun_states = FS1}.
 217
 218%%% @spec cont_state(ContinuationState, S::global_state()) -> global_state()
 219%%% @doc For controlling the ContinuationState, to be used in a continuation
 220%%% function, and called when the parser encounters the end of the byte stream.
 221%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
 222cont_state(X, S=#xmerl_scanner{fun_states = FS}) ->
 223    FS1 = FS#xmerl_fun_states{cont = X},
 224    S#xmerl_scanner{fun_states = FS1}.
 225
 226
 227%% @spec file(Filename::string()) -> {xmlElement(),Rest}
 228%%   Rest = list()
 229%% @equiv file(Filename, [])
 230file(F) ->
 231    file(F, []).
 232
 233%% @spec file(Filename::string(), Options::option_list()) -> {xmlElement(),Rest}
 234%%   Rest = list()
 235%%% @doc Parse file containing an XML document
 236file(F, Options) ->
 237    ExtCharset=case lists:keysearch(encoding,1,Options) of
 238		   {value,{_,Val}} -> Val;
 239		   false -> undefined
 240	       end,
 241    case int_file(F,Options,ExtCharset) of
 242	{Res, Tail,S=#xmerl_scanner{close_fun=Close}} ->
 243	    Close(S), % for side effects only - final state is dropped
 244	    {Res,Tail};
 245	{error, Reason} ->
 246	    {error, Reason};
 247	Other ->
 248	    {error, Other}
 249    end.
 250
 251int_file(F, Options,_ExtCharset) ->
 252%     io:format("int_file F=~p~n",[F]),
 253    case file:read_file(F) of
 254	{ok, Bin} ->
 255	    int_string(binary_to_list(Bin), Options, filename:dirname(F));
 256	Error ->
 257	    Error
 258    end.
 259
 260int_file_decl(F, Options,_ExtCharset) ->
 261%     io:format("int_file_decl F=~p~n",[F]),
 262    case file:read_file(F) of
 263	{ok, Bin} ->
 264	    int_string_decl(binary_to_list(Bin), Options, filename:dirname(F));
 265	Error ->
 266	    Error
 267    end.
 268
 269%% @spec string(Text::list()) -> {xmlElement(),Rest}
 270%%   Rest = list()
 271%% @equiv string(Test, [])
 272string(Str) ->  
 273    string(Str, []).
 274
 275%% @spec string(Text::list(),Options::option_list()) -> {xmlElement(),Rest}
 276%%   Rest = list()
 277%%% @doc Parse string containing an XML document
 278string(Str, Options) ->
 279    case int_string(Str, Options) of
 280	{Res, Tail, S=#xmerl_scanner{close_fun = Close}} ->
 281	    Close(S),    % for side effects only - final state is dropped
 282	    {Res,Tail};
 283	{error, Reason} ->
 284	    {error, Reason};  % (This can't happen, currently)
 285	Other ->
 286	    {error, Other}
 287    end.
 288
 289int_string(Str, Options) ->
 290    {ok,  XMLBase} = file:get_cwd(),
 291    int_string(Str, Options, XMLBase).
 292
 293int_string(Str, Options, XMLBase) ->
 294    S=initial_state0(Options,XMLBase),
 295    case xmerl_lib:detect_charset(S#xmerl_scanner.encoding,Str) of
 296	{auto,'iso-10646-utf-1',Str2} ->
 297	    scan_document(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"});
 298	{external,'iso-10646-utf-1',Str2} ->
 299	    scan_document(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"});
 300	{undefined,undefined,Str2} ->
 301	    scan_document(Str2, S);
 302	{external,ExtCharset,Str2} ->
 303	    scan_document(Str2, S#xmerl_scanner{encoding=atom_to_list(ExtCharset)})
 304    end.
 305
 306int_string_decl(Str, Options, XMLBase) ->
 307    S=initial_state0(Options,XMLBase),
 308    case xmerl_lib:detect_charset(S#xmerl_scanner.encoding,Str) of
 309	{auto,'iso-10646-utf-1',Str2} ->
 310	    scan_decl(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"});
 311	{external,'iso-10646-utf-1',Str2} ->
 312	    scan_decl(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"});
 313	{undefined,undefined,Str2} ->
 314	    scan_decl(Str2, S);
 315	{external,ExtCharset,Str2} ->
 316	    scan_decl(Str2, S#xmerl_scanner{encoding=atom_to_list(ExtCharset)})
 317    end.
 318    
 319
 320
 321initial_state0(Options,XMLBase) ->
 322    initial_state(Options, #xmerl_scanner{
 323		    event_fun = fun event/2,
 324		    hook_fun = fun hook/2,
 325		    acc_fun = fun acc/3,
 326		    fetch_fun = fun fetch/2,
 327		    close_fun = fun close/1,
 328		    continuation_fun = fun cont/3,
 329		    rules_read_fun = fun rules_read/3,
 330		    rules_write_fun = fun rules_write/4,
 331		    rules_delete_fun= fun rules_delete/3,
 332		    xmlbase = XMLBase
 333		   }).
 334
 335initial_state([{event_fun, F}|T], S) ->
 336    initial_state(T, S#xmerl_scanner{event_fun = F});
 337initial_state([{event_fun, F, ES}|T], S) ->
 338    S1 = event_state(ES, S#xmerl_scanner{event_fun = F}),
 339    initial_state(T, S1);
 340initial_state([{acc_fun, F}|T], S) ->
 341    initial_state(T, S#xmerl_scanner{acc_fun = F});
 342initial_state([{hook_fun, F}|T], S) ->
 343    initial_state(T, S#xmerl_scanner{hook_fun = F});
 344initial_state([{hook_fun, F, HS}|T], S) ->
 345    S1 = hook_state(HS, S#xmerl_scanner{hook_fun = F}),
 346    initial_state(T, S1);
 347initial_state([{close_fun, F}|T], S) ->
 348    initial_state(T, S#xmerl_scanner{close_fun = F});
 349initial_state([{fetch_fun, F}|T], S) ->
 350    initial_state(T, S#xmerl_scanner{fetch_fun = F});
 351initial_state([{fetch_fun, F, FS}|T], S) ->
 352    S1 = fetch_state(FS, S#xmerl_scanner{fetch_fun = F}),
 353    initial_state(T, S1);
 354initial_state([{fetch_path, P}|T], S) ->
 355    initial_state(T, S#xmerl_scanner{fetch_path = P});
 356initial_state([{continuation_fun, F}|T], S) ->
 357    initial_state(T, S#xmerl_scanner{continuation_fun = F});
 358initial_state([{continuation_fun, F, CS}|T], S) ->
 359    S1 = cont_state(CS, S#xmerl_scanner{continuation_fun = F}),
 360    initial_state(T, S1);
 361initial_state([{rules, R}|T], S) ->
 362    initial_state(T, S#xmerl_scanner{rules = R,
 363				     keep_rules = true});
 364initial_state([{rules, Read, Write, RS}|T], S) ->
 365    S1 = rules_state(RS, S#xmerl_scanner{rules_read_fun = Read,
 366					 rules_write_fun = Write,
 367					 keep_rules = true}),
 368    initial_state(T, S1);
 369initial_state([{user_state, F}|T], S) ->
 370    initial_state(T, S#xmerl_scanner{user_state = F});
 371initial_state([{space, L}|T], S) ->
 372    initial_state(T, S#xmerl_scanner{space = L});
 373initial_state([{line, L}|T], S) ->
 374    initial_state(T, S#xmerl_scanner{line = L});
 375initial_state([{namespace_conformant, F}|T], S) when F==true; F==false ->
 376    initial_state(T, S#xmerl_scanner{namespace_conformant = F});
 377initial_state([{validation, F}|T], S) when F==true; F==false ->
 378    initial_state(T, S#xmerl_scanner{validation = F});
 379initial_state([{quiet, F}|T], S) when F==true; F==false ->
 380    initial_state(T, S#xmerl_scanner{quiet = F});
 381initial_state([{doctype_DTD,DTD}|T], S) ->
 382    initial_state(T,S#xmerl_scanner{doctype_DTD = DTD});
 383initial_state([{text_decl,Bool}|T], S) ->
 384    initial_state(T,S#xmerl_scanner{text_decl=Bool});
 385initial_state([{environment,Env}|T], S) ->
 386    initial_state(T,S#xmerl_scanner{environment=Env});
 387initial_state([{xmlbase, D}|T], S) ->
 388    initial_state(T, S#xmerl_scanner{xmlbase = D});
 389initial_state([{encoding, Enc}|T], S) ->
 390    initial_state(T, S#xmerl_scanner{encoding = Enc});
 391initial_state([], S=#xmerl_scanner{rules = undefined}) ->
 392    Tab = ets:new(rules, [set, public]),
 393    S#xmerl_scanner{rules = Tab};
 394initial_state([], S) ->
 395    S.
 396
 397
 398%%% -----------------------------------------------------
 399%%% Default modifier functions
 400
 401%%% Hooks:
 402%%% - {element, Line, Name, Attrs, Content}
 403%%% - {processing_instruction, Line, Data}
 404
 405hook(X, State) ->
 406    {X, State}.
 407
 408%%% Events:
 409%%%
 410%%% #xmerl_event{event : started | ended,
 411%%%              line  : integer(),
 412%%%		 col   : integer(),
 413%%%              data}
 414%%%
 415%%% Data		Events
 416%%% document		started, ended
 417%%% #xmlElement		started, ended
 418%%% #xmlAttribute	ended
 419%%% #xmlPI		ended
 420%%% #xmlComment		ended
 421%%% #xmlText		ended
 422event(_X, S) ->
 423    S.
 424
 425%% The acc/3 function can return either {Acc�, S'} or {Acc', Pos', S'},
 426%% where Pos' can be derived from X#xmlElement.pos, X#xmlText.pos, or
 427%% X#xmlAttribute.pos (whichever is the current object type.)
 428%% The acc/3 function is not allowed to redefine the type of object
 429%% being defined, but _is_ allowed to either ignore it or split it 
 430%% into multiple objects (in which case {Acc',Pos',S'} should be returned.)
 431%% If {Acc',S'} is returned, Pos will be incremented by 1 by default.
 432%% Below is an example of an acceptable operation
 433acc(X = #xmlText{value = Text}, Acc, S) ->
 434    {[X#xmlText{value = lists:flatten(Text)}|Acc], S};
 435acc(X, Acc, S) ->
 436    {[X|Acc], S}.
 437
 438fetch({system, URI}, S) ->
 439    fetch_URI(URI, S);
 440fetch({public, _PublicID, URI}, S) ->
 441    fetch_URI(URI, S).
 442
 443%%% Always assume an external resource can be found locally! Thus
 444%%% don't bother fetching with e.g. HTTP. Returns the path where the
 445%%% resource is found.  The path to the external resource is given by
 446%%% URI directly or the option fetch_path (additional paths) or
 447%%% directory (base path to external resource)
 448fetch_URI(URI, S) ->
 449    %% assume URI is a filename
 450    Split = filename:split(URI),
 451    Filename = fun([])->[];(X)->lists:last(X) end (Split),
 452    Fullname = 
 453	case Split of %% how about Windows systems?
 454	    ["file:"|Name]-> %% absolute path, see RFC2396 sect 3
 455		%% file:/dtd_name 
 456		filename:join(["/"|Name]);
 457	    ["/"|Rest] when Rest /= [] ->
 458		%% absolute path name
 459		URI;
 460	    ["http:"|_Rest] ->
 461		{http,URI};
 462	    [] -> %% empty systemliteral
 463		[];
 464	    _ ->
 465		filename:join(S#xmerl_scanner.xmlbase, URI)
 466	end,
 467    Path = path_locate(S#xmerl_scanner.fetch_path, Filename, Fullname),
 468    ?dbg("fetch(~p) -> {file, ~p}.~n", [URI, Path]),
 469    {ok, Path, S}.
 470
 471path_locate(_, _, {http,_}=URI) ->
 472    URI;
 473path_locate(_, _, []) ->
 474    [];
 475path_locate([Dir|Dirs], FN, FullName) ->
 476    F = filename:join(Dir, FN),
 477    case file:read_file_info(F) of
 478	{ok, #file_info{type = regular}} ->
 479	    {file,F};
 480	_ ->
 481	    path_locate(Dirs, FN, FullName)
 482    end;
 483path_locate([], _FN, FullName) ->
 484    {file,FullName}.
 485
 486
 487cont(_F, Exception, US) ->
 488    Exception(US).
 489
 490close(S) ->
 491    S.
 492
 493
 494%%% -----------------------------------------------------
 495%%% Scanner
 496
 497%%% [1] document ::= prolog element Misc*
 498scan_document(Str0, S=#xmerl_scanner{event_fun = Event,
 499				     line = L, col = C,
 500				     environment=Env,
 501				     encoding=Charset,
 502				     validation=ValidateResult}) ->
 503    S1 = Event(#xmerl_event{event = started,
 504			    line = L,
 505			    col = C,
 506			    data = document}, S),
 507    
 508    %% Transform to given character set.
 509    %% Note that if another character set is given in the encoding 
 510    %% attribute in a XML declaration that one will be used later
 511    Str=if
 512	    Charset=/=undefined -> % Default character set is UTF-8
 513		ucs:to_unicode(Str0,list_to_atom(Charset));
 514	    true ->
 515		Str0
 516	end,
 517
 518    {"<"++T2, S2} = scan_prolog(Str, S1, _StartPos = 1),
 519    {Res, T3, S3} =scan_element(T2,S2,_StartPos = 1),
 520    {Tail, S4}=scan_misc(T3, S3, _StartPos = 1),
 521    
 522    S5 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
 523					       line = S4#xmerl_scanner.line,
 524					       col = S4#xmerl_scanner.col,
 525					       data = document}, S4),
 526    
 527    S6 = case ValidateResult of
 528	     false ->
 529		 cleanup(S5);
 530	     true when Env == element; Env == prolog ->
 531		 check_decl2(S5),
 532		 case xmerl_validate:validate(S5,Res) of
 533		     {'EXIT',{error,Reason}} ->
 534			 S5b=cleanup(S5),
 535			 ?fatal({failed_validation,Reason}, S5b);
 536		     {'EXIT',Reason} ->
 537			 S5b=cleanup(S5),
 538			 ?fatal({failed_validation,Reason}, S5b);
 539		     {error,Reason} ->
 540			 S5b=cleanup(S5),
 541			 ?fatal({failed_validation,Reason}, S5b);
 542		     {error,Reason,_Next} ->
 543			 S5b=cleanup(S5),
 544			 ?fatal({failed_validation,Reason}, S5b);
 545		     _XML ->
 546			 cleanup(S5)
 547		 end;
 548	     true ->
 549		 cleanup(S5)
 550	 end,
 551
 552    {Res, Tail, S6}.
 553
 554
 555scan_decl(Str, S=#xmerl_scanner{event_fun = Event,
 556				line = L, col = C,
 557				environment=_Env,
 558				encoding=_Charset,
 559				validation=_ValidateResult}) ->
 560    S1 = Event(#xmerl_event{event = started,
 561			    line = L,
 562			    col = C,
 563			    data = document}, S),
 564    
 565    case scan_prolog(Str, S1, _StartPos = 1) of
 566	{T2="<"++_, S2} ->
 567	    {{S2#xmerl_scanner.user_state,T2},[],S2};
 568	{[], S2}->
 569	    {[],[],S2};
 570	{T2, S2} ->
 571	    {_,_,S3} = scan_content(T2,S2,[],_Attrs=[],S2#xmerl_scanner.space,
 572				    _Lang=[],_Parents=[],#xmlNamespace{}),
 573	    {T2,[],S3}
 574    end.
 575
 576
 577%%% [22] Prolog
 578%%% prolog    ::=    XMLDecl? Misc* (doctypedecl Misc*)?
 579%%%
 580%% Text declaration may be empty
 581scan_prolog([], S=#xmerl_scanner{text_decl=true},_Pos) ->
 582    {[],S};
 583scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
 584    ?dbg("cont()...~n", []),
 585    F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos) end,
 586      fun(S1) -> {[], S1} end,
 587      S);
 588scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos)
 589  when ?whitespace(hd(T)) ->
 590    {Charset,T3, S3}=
 591    if
 592	Col==1,L==1,S0#xmerl_scanner.text_decl==true -> 
 593	    ?dbg("prolog(\"<?xml\")~n", []),
 594	    ?bump_col(5),
 595	    {_,T1,S1} = mandatory_strip(T,S0),
 596	    {Decl,T2, S2}=scan_text_decl(T1,S1),
 597	    Encoding=Decl#xmlDecl.encoding,
 598	    {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}};
 599	Col==1,L==1 -> 
 600	    ?dbg("prolog(\"<?xml\")~n", []),
 601	    ?bump_col(5),
 602	    {Decl,T2, S2}=scan_xml_decl(T, S0),
 603	    Encoding=Decl#xmlDecl.encoding,
 604	    {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}};
 605	true ->
 606	    ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S0)
 607    end,
 608    %% Now transform to declared character set.
 609    if
 610	Charset==Charset0 -> % Document already transformed to this charset!
 611	    scan_prolog(T3, S3, Pos);
 612	Charset0=/=undefined -> % Document transformed to other bad charset!
 613	    ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S3);
 614	Charset=/=undefined -> % Document not previously transformed
 615	    T4=ucs:to_unicode(T3,list_to_atom(Charset)),
 616	    scan_prolog(T4, S3, Pos);
 617	true -> % No encoding info given
 618	    scan_prolog(T3, S3, Pos)
 619    end;
 620scan_prolog("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog,
 621						encoding=Charset}, Pos) ->
 622    ?dbg("prolog(\"<!DOCTYPE\")~n", []),
 623    ?bump_col(9),
 624    %% If no known character set assume it is UTF-8
 625    T1=if
 626	   Charset==undefined -> ucs:to_unicode(T,'utf-8');
 627	   true -> T
 628       end,
 629    {T2, S1} = scan_doctype(T1, S),
 630    scan_misc(T2, S1, Pos);
 631scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=Charset},Pos) ->
 632    ?dbg("prolog(\"<\")~n", []),
 633    
 634    %% Check for Comments, PI before possible DOCTYPE declaration
 635    ?bump_col(1),
 636    %% If no known character set assume it is UTF-8
 637    T=if
 638	  Charset==undefined -> ucs:to_unicode(Str,'utf-8');
 639	  true -> Str
 640      end,
 641    {T1, S1}=scan_misc(T, S, Pos),
 642    scan_prolog2(T1,S1,Pos).
 643
 644
 645
 646scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
 647    ?dbg("cont()...~n", []),
 648    F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos) end,
 649      fun(S1) -> {[], S1} end,
 650      S);
 651scan_prolog2("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog}, Pos) ->
 652    ?dbg("prolog(\"<!DOCTYPE\")~n", []),
 653    ?bump_col(9),
 654    {T1, S1} = scan_doctype(T, S),
 655    scan_misc(T1, S1, Pos);
 656scan_prolog2(Str = "<!" ++ _, S, _Pos) ->
 657    ?dbg("prolog(\"<!\")~n", []),
 658    %% In e.g. a DTD, we jump directly to markup declarations
 659    scan_ext_subset(Str, S);
 660scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos) ->
 661    ?dbg("prolog(\"<\")~n", []),
 662    
 663    %% Check for more Comments and PI after DOCTYPE declaration
 664    ?bump_col(1),
 665    scan_misc(Str, S, Pos).
 666
 667
 668
 669
 670%%% [27] Misc ::=   	Comment | PI | S
 671%% Note:
 672%% - Neither of Comment and PI are returned in the resulting parsed
 673%%   structure.
 674%% - scan_misc/3 implements Misc* as that is how the rule is always used
 675scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
 676    ?dbg("cont()...~n", []),
 677    F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos) end,
 678      fun(S1) -> {[], S1} end,
 679      S);
 680scan_misc("<!--" ++ T, S, Pos) -> % Comment
 681    {_, T1, S1} = scan_comment(T, S, Pos, _Parents = [], _Lang = []),
 682    scan_misc(T1,S1,Pos);
 683scan_misc("<?" ++ T, S0, Pos) -> % PI
 684    ?dbg("prolog(\"<?\")~n", []),
 685    ?bump_col(2),
 686    {_PI, T1, S1} = scan_pi(T, S, Pos),
 687    scan_misc(T1,S1,Pos);
 688scan_misc([H|T], S, Pos) when ?whitespace(H) ->
 689    ?dbg("prolog(whitespace)~n", []),
 690    scan_misc(T,S,Pos);
 691scan_misc(T,S,_Pos) ->
 692    {T,S}.
 693
 694
 695cleanup(S=#xmerl_scanner{keep_rules = false,
 696			 rules = Rules}) ->
 697    ets:delete(Rules),
 698    S#xmerl_scanner{rules = undefined};
 699cleanup(S) ->
 700    S.
 701
 702%%% Prolog and Document Type Declaration XML 1.0 Section 2.8
 703%% [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
 704%% [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
 705scan_xml_decl(T, S) ->
 706    %% VersionInfo [24] is mandatory
 707    {_,T1,S2} = mandatory_strip(T,S),
 708    "version" ++ T2 = T1,
 709    {T3, S3} = scan_eq(T2, S2),
 710    {Vsn, T4, S4} = scan_xml_vsn(T3, S3),
 711    Attr = #xmlAttribute{name = version,
 712			 parents = [{xml, _XMLPos = 1}],
 713			 value = Vsn},
 714    scan_xml_decl(T4, S4, #xmlDecl{attributes = [Attr]}).
 715
 716scan_xml_decl([], S=#xmerl_scanner{continuation_fun = F}, Decl) ->
 717    ?dbg("cont()...~n", []),
 718    F(fun(MoreBytes, S1) -> scan_xml_decl(MoreBytes, S1, Decl) end,
 719      fun(S1) -> {[], [], S1} end,
 720      S);
 721scan_xml_decl("?>" ++ T, S0, Decl) ->
 722    ?bump_col(2),
 723    return_xml_decl(T,S,Decl);
 724scan_xml_decl(T,S=#xmerl_scanner{event_fun = _Event},Decl) ->
 725    {_,T1,S1}=mandatory_strip(T,S),
 726    scan_xml_decl2(T1,S1,Decl).
 727
 728
 729scan_xml_decl2("?>" ++ T, S0,Decl) ->
 730    ?bump_col(2),
 731    return_xml_decl(T,S,Decl);
 732scan_xml_decl2("encoding" ++ T, S0 = #xmerl_scanner{event_fun = Event},
 733	      Decl0 = #xmlDecl{attributes = Attrs}) ->
 734    %% [80] EncodingDecl
 735    ?bump_col(8),
 736    {T1, S1} = scan_eq(T, S),
 737    {EncName, T2, S2} = scan_enc_name(T1, S1),
 738    LowEncName=httpd_util:to_lower(EncName),
 739    Attr = #xmlAttribute{name = encoding,
 740			 parents = [{xml, _XMLPos = 1}],
 741			 value = LowEncName},
 742    Decl = Decl0#xmlDecl{encoding = LowEncName,
 743			 attributes = [Attr|Attrs]},
 744    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, 
 745					       line = S0#xmerl_scanner.line, 
 746					       col = S0#xmerl_scanner.col,
 747					       data = Attr}, S2),
 748    case T2 of
 749	"?>" ++ _T3 ->
 750	    scan_xml_decl3(T2,S3,Decl);
 751	_ ->
 752	    {_,T3,S4} = mandatory_strip(T2,S3),
 753	    scan_xml_decl3(T3, S4, Decl)
 754    end;
 755scan_xml_decl2(T="standalone" ++ _T,S,Decl) ->
 756    scan_xml_decl3(T,S,Decl).
 757
 758scan_xml_decl3("?>" ++ T, S0,Decl) ->
 759    ?bump_col(2),
 760    return_xml_decl(T,S,Decl);
 761scan_xml_decl3("standalone" ++ T,S0 = #xmerl_scanner{event_fun = Event},
 762	      Decl0 = #xmlDecl{attributes = Attrs}) ->
 763    %% [32] SDDecl 
 764    ?bump_col(10),
 765    {T1, S1} = scan_eq(T, S),
 766    {StValue,T2,S2}=scan_standalone_value(T1,S1),
 767    Attr = #xmlAttribute{name = standalone,
 768			 parents = [{xml, _XMLPos = 1}],
 769			 value = StValue},
 770    Decl = Decl0#xmlDecl{standalone = StValue,
 771			 attributes = [Attr|Attrs]},
 772    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, 
 773					       line = S0#xmerl_scanner.line, 
 774					       col = S0#xmerl_scanner.col,
 775					       data = Attr}, S2),
 776    {_,T3,S4} = strip(T2,S3),
 777    "?>" ++ T4 = T3,
 778    return_xml_decl(T4, S4#xmerl_scanner{col=S4#xmerl_scanner.col+2}, Decl).
 779
 780
 781return_xml_decl(T,S=#xmerl_scanner{hook_fun = Hook,
 782				   event_fun = Event},
 783		Decl0 = #xmlDecl{attributes = Attrs}) ->
 784    ?strip1,
 785    Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
 786    S2 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
 787					       line = S#xmerl_scanner.line,
 788					       col = S#xmerl_scanner.col,
 789					       data = Decl}, S1),
 790    {Ret, S3} = Hook(Decl, S2),
 791    {Ret, T1, S3}.
 792    
 793
 794scan_standalone_value("'yes'" ++T,S0)->
 795    ?bump_col(5),
 796    {'yes',T,S#xmerl_scanner{standalone=yes}};
 797scan_standalone_value("\"yes\"" ++T,S0)->
 798    ?bump_col(5),
 799    {'yes',T,S#xmerl_scanner{standalone=yes}};
 800scan_standalone_value("'no'" ++T,S0) ->
 801    ?bump_col(4),
 802    {'no',T,S};
 803scan_standalone_value("\"no\"" ++T,S0) ->
 804    ?bump_col(4),
 805    {'no',T,S}.
 806
 807%%%
 808%%% Text declaration XML 1.0 section 4.3.1
 809%%% [77] TextDecl  ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
 810scan_text_decl(T,S=#xmerl_scanner{event_fun = Event}) ->
 811    {#xmlDecl{attributes=Attrs}=Decl0,T1,S1} = scan_optional_version(T,S),
 812    "encoding" ++ T2 = T1,
 813    S2 = S1#xmerl_scanner{col = S1#xmerl_scanner.col + 8},
 814    {T3, S3} = scan_eq(T2, S2),
 815    {EncName, T4, S4} = scan_enc_name(T3, S3),
 816    LowEncName=httpd_util:to_lower(EncName),
 817    ?strip5,
 818    Attr = #xmlAttribute{name = encoding,
 819			 parents = [{xml,1}],
 820			 value = LowEncName},
 821    Decl = Decl0#xmlDecl{encoding = LowEncName,
 822 			 attributes = [Attr|Attrs]},
 823    S6=#xmerl_scanner{} = Event(#xmerl_event{event = ended,
 824					     line = S5#xmerl_scanner.line,
 825					     col = S5#xmerl_scanner.col,
 826					     data = Attr}, S5),
 827    scan_text_decl(T5,S6,Decl).
 828
 829scan_text_decl("?>"++T,S0 = #xmerl_scanner{hook_fun = Hook,
 830					   event_fun = Event}, 
 831	       Decl0 = #xmlDecl{attributes = Attrs}) ->
 832    ?bump_col(2),
 833    ?strip1,
 834    Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
 835    S2 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
 836					       line = S0#xmerl_scanner.line,
 837					       col = S0#xmerl_scanner.col,
 838					       data = Decl}, S1),
 839    {Ret, S3} = Hook(Decl, S2),
 840    {Ret, T1, S3}.
 841
 842scan_optional_version("version"++T,S0) ->
 843    ?bump_col(7),
 844    ?strip1,
 845    {T2, S2} = scan_eq(T1, S1),
 846    {Vsn, T3, S3} = scan_xml_vsn(T2, S2),
 847    {_,T4,S4} = mandatory_strip(T3,S3),
 848    Attr = #xmlAttribute{name = version,parents = [{xml,1}],value = Vsn},
 849    {#xmlDecl{attributes=[Attr]},T4,S4};
 850scan_optional_version(T,S) ->
 851    {#xmlDecl{attributes=[]},T,S}.
 852    
 853
 854
 855%%%%%%% [81] EncName
 856scan_enc_name([], S=#xmerl_scanner{continuation_fun = F}) ->
 857    ?dbg("cont()...~n", []),
 858    F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1) end,
 859      fun(S1) -> ?fatal(expected_encoding_name, S1) end,
 860      S);
 861scan_enc_name([H|T], S0) when H >= $"; H =< $' -> 
 862    ?bump_col(1),
 863    scan_enc_name(T, S, H, []).
 864
 865
 866scan_enc_name([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
 867    ?dbg("cont()...~n", []),
 868    F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1, Delim, Acc) end,
 869      fun(S1) -> ?fatal(expected_encoding_name, S1) end,
 870      S);
 871scan_enc_name([H|T], S0, Delim, Acc) when H >= $a, H =< $z ->
 872    ?bump_col(1),
 873    scan_enc_name2(T, S, Delim, [H|Acc]);
 874scan_enc_name([H|T], S0, Delim, Acc) when H >= $A, H =< $Z ->
 875    ?bump_col(1),
 876    scan_enc_name2(T, S, Delim, [H|Acc]);
 877scan_enc_name([H|_T],S,_Delim,_Acc) ->
 878    ?fatal({error,{unexpected_character_in_Enc_Name,H}},S).
 879
 880scan_enc_name2([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
 881    ?dbg("cont()...~n", []),
 882    F(fun(MoreBytes, S1) -> scan_enc_name2(MoreBytes, S1, Delim, Acc) end,
 883      fun(S1) -> ?fatal(expected_encoding_name, S1) end,
 884      S);
 885scan_enc_name2([H|T], S0, H, Acc) ->
 886    ?bump_col(1),
 887    {lists:reverse(Acc), T, S};
 888scan_enc_name2([H|T], S0, Delim, Acc) when H >= $a, H =< $z ->
 889    ?bump_col(1),
 890    scan_enc_name2(T, S, Delim, [H|Acc]);
 891scan_enc_name2([H|T], S0, Delim, Acc) when H >= $A, H =< $Z ->
 892    ?bump_col(1),
 893    scan_enc_name2(T, S, Delim, [H|Acc]);
 894scan_enc_name2([H|T], S0, Delim, Acc) when H >= $0, H =< $9 ->
 895    ?bump_col(1),
 896    scan_enc_name2(T, S, Delim, [H|Acc]);
 897scan_enc_name2([H|T], S0, Delim, Acc) when H == $.; H == $_; H == $- ->
 898    ?bump_col(1),
 899    scan_enc_name2(T, S, Delim, [H|Acc]).
 900
 901
 902%%%%%%% [26] VersionNum
 903%%% VersionNum    ::=    ([a-zA-Z0-9_.:] | '-')+
 904scan_xml_vsn([], S=#xmerl_scanner{continuation_fun = F}) ->
 905    ?dbg("cont()...~n", []),
 906    F(fun(MoreBytes, S1) -> scan_xml_vsn(MoreBytes, S1) end,
 907      fun(S1) -> ?fatal(unexpected_end, S1) end,
 908      S);
 909scan_xml_vsn([H|T], S) when H==$"; H==$'->
 910    xml_vsn(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}, H, []).
 911
 912xml_vsn([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
 913    ?dbg("cont()...~n", []),
 914    F(fun(MoreBytes, S1) -> xml_vsn(MoreBytes, S1, Delim, Acc) end, 
 915      fun(S1) -> ?fatal(unexpected_end, S1) end,
 916      S);
 917xml_vsn([H|T], S=#xmerl_scanner{col = C}, H, Acc) ->
 918    {lists:reverse(Acc), T, S#xmerl_scanner{col = C+1}};
 919xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $a, H =< $z ->
 920    xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
 921xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $A, H =< $Z ->
 922    xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
 923xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $0, H =< $9 ->
 924    xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
 925xml_vsn([H|T], S=#xmerl_scanner{col = C}, Delim, Acc) ->
 926    case lists:member(H, "_.:-") of
 927	true ->
 928	    xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
 929	false ->
 930	    ?fatal({invalid_vsn_char, H}, S)
 931    end.
 932
 933%%%%%%% [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
 934
 935scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
 936    ?dbg("cont()...~n", []),
 937    F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Pos) end,
 938      fun(S1) -> ?fatal(unexpected_end, S1) end,
 939      S);
 940scan_pi(Str = [H1,H2,H3 | T],S=#xmerl_scanner{line = L, col = C}, Pos)
 941  when H1==$x;H1==$X ->
 942    %% names beginning with [xX][mM][lL] are reserved for future use.
 943    if 
 944	((H2==$m) or (H2==$M)) and
 945	((H3==$l) or (H3==$L)) ->
 946	    scan_wellknown_pi(T,S,Pos);
 947	true ->
 948	    {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
 949	    scan_pi(T1, S1, Target, L, C, Pos, [])
 950    end;
 951scan_pi(Str, S=#xmerl_scanner{line = L, col = C}, Pos) ->
 952    {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
 953    scan_pi(T1, S1, Target, L, C, Pos,[]).
 954
 955
 956%%% More info on xml-stylesheet can be found at:
 957%%%   "Associating Style Sheets with XML documents", Version 1.0,
 958%%%   W3C Recommendation 29 June 1999 (http://www.w3.org/TR/xml-stylesheet/)
 959scan_wellknown_pi("-stylesheet"++T, S0=#xmerl_scanner{line=L,col=C},Pos) ->
 960    ?dbg("prolog(\"<?xml-stylesheet\")~n", []),
 961    ?bump_col(16),
 962    scan_pi(T, S, "xml-stylesheet",L,C,Pos,[]);
 963scan_wellknown_pi(Str,S,_Pos) ->
 964    ?fatal({invalid_target_name, lists:sublist(Str, 1, 10)}, S).
 965
 966
 967% scan_pi(Str="?>"++_T,S,Target, L, C, Pos) ->
 968%     scan_pi(Str,S,Target, L, C, Pos,[]);
 969% scan_pi(Str=[],S,Target, L, C, Pos) ->
 970%     scan_pi(Str,S,Target, L, C, Pos,[]);
 971% scan_pi(T,S,Target, L, C, Pos) ->
 972%     {_,T1,S1} = mandatory_strip(T,S),
 973%     scan_pi(T1,S1,Target, L, C, Pos,[]).
 974
 975
 976scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Target,L, C, Pos, Acc) ->
 977    ?dbg("cont()...~n", []),
 978    F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Target, L, C, Pos, Acc) end,
 979      fun(S1) -> ?fatal(unexpected_end, S1) end,
 980      S);
 981scan_pi("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
 982				       event_fun = Event}, 
 983	Target, L, C, Pos, Acc) ->
 984    ?bump_col(2),
 985    PI = #xmlPI{name = Target,
 986		pos = Pos,
 987		value = lists:reverse(Acc)},
 988    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
 989					       line = L,
 990					       col = C,
 991					       data = PI}, S),
 992    {Ret, S2} = Hook(PI, S1),
 993    {Ret, T, S2};
 994scan_pi([H|T], S, Target, L, C, Pos, Acc) when ?whitespace(H) ->
 995    ?strip1,
 996    scan_pi2(T1, S1, Target, L, C, Pos, Acc).
 997
 998
 999scan_pi2([], S=#xmerl_scanner{continuation_fun = F}, Target,L, C, Pos, Acc) ->
1000    ?dbg("cont()...~n", []),
1001    F(fun(MoreBytes, S1) -> scan_pi2(MoreBytes, S1, Target, L, C, Pos, Acc) end,
1002      fun(S1) -> ?fatal(unexpected_end, S1) end,
1003      S);
1004scan_pi2("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
1005				       event_fun = Event}, 
1006	Target, L, C, Pos, Acc) ->
1007    ?bump_col(2),
1008    PI = #xmlPI{name = Target,
1009		pos = Pos,
1010		value = lists:reverse(Acc)},
1011    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
1012					       line = L,
1013					       col = C,
1014					       data = PI}, S),
1015    {Ret, S2} = Hook(PI, S1),
1016    {Ret, T, S2};
1017scan_pi2([H|T], S0, Target, L, C, Pos, Acc) ->
1018    ?bump_col(1),
1019    wfc_legal_char(H,S),
1020    scan_pi2(T, S, Target, L, C, Pos, [H|Acc]).
1021
1022
1023
1024%% [28] doctypedecl ::= 
1025%%   '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
%% Scan a document type declaration after '<!DOCTYPE'.
%% Whitespace before the name is mandatory. The scanned name is stored in
%% the scanner's `doctype_name' field and the rest of the declaration is
%% handled by scan_doctype1/2.
scan_doctype([], S = #xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_doctype(T, S) ->
    {_, T1, S1} = mandatory_strip(T, S),
    {DTName, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    %% ?strip3 is defined elsewhere; from its use here it binds T3/S3
    %% from T2/S2.
    ?strip3,
    Named = S3#xmerl_scanner{doctype_name = DTName},
    scan_doctype1(T3, Named).
1036
1037
1038%% [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1039%%		     | 'PUBLIC' S PubidLiteral S SystemLiteral
%% Scan the optional ExternalID of a document type declaration.
%% The DTD reference passed on to scan_doctype2/3 is
%%   {public, PubidLiteral, SystemLiteral} for PUBLIC,
%%   {system, SystemLiteral}               for SYSTEM,
%%   undefined                             when no ExternalID is present.
scan_doctype1([], S = #xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype1(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_doctype1("PUBLIC" ++ T, S0) ->
    ?bump_col(6),
    {_, T1, S1} = mandatory_strip(T, S),
    {PIDL, T2, S2} = scan_pubid_literal(T1, S1),
    {_, T3, S3} = mandatory_strip(T2, S2),
    {SL, T4, S4} = scan_system_literal(T3, S3),
    ?strip5,
    DTD = {public, PIDL, SL},
    scan_doctype2(T5, S5, DTD);
scan_doctype1("SYSTEM" ++ T, S0) ->
    ?bump_col(6),
    {_, T1, S1} = mandatory_strip(T, S),
    {SL, T2, S2} = scan_system_literal(T1, S1),
    ?strip3,
    DTD = {system, SL},
    scan_doctype2(T3, S3, DTD);
scan_doctype1(T, S) ->
    scan_doctype2(T, S, undefined).
1061
1062
%% Continue a document type declaration after the optional ExternalID.
%%   '[' starts the internal subset, handled by scan_doctype3/3.
%%   '>' ends the declaration: the external DTD (if any) is fetched and
%%       the collected declarations are checked.
%% Any other character is reported through ?fatal rather than crashing
%% with function_clause.
%% Returns {RestOfInput, Scanner}.
scan_doctype2([], S = #xmerl_scanner{continuation_fun = F}, DTD) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype2(MoreBytes, S1, DTD) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_doctype2("[" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    scan_doctype3(T1, S1, DTD);
scan_doctype2(">" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    S2 = fetch_DTD(DTD, S1),
    check_decl(S2),
    {T1, S2};
scan_doctype2(_T, S, _DTD) ->
    ?fatal(expected_end_of_DOCTYPE_declaration, S).
1078
1079%% [28a] DeclSep   ::= PEReference | S
1080%% [28b] intSubset ::= (markupdecl | DeclSep)*
%% Scan the internal DTD subset, i.e. the part between '[' and ']'.
%%   '%'  a parameter-entity reference. An external reference is fetched
%%        and parsed as a DTD; a textual replacement is pushed back in
%%        front of the remaining input.
%%   ']'  end of the subset: the external DTD named in the DOCTYPE (if
%%        any) is fetched, declarations are checked and the closing '>'
%%        is consumed (a missing '>' fails the match).
%%   anything else is scanned as a markup declaration.
%% Returns {RestOfInput, Scanner}.
scan_doctype3([], S = #xmerl_scanner{continuation_fun = F}, DTD) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype3(MoreBytes, S1, DTD) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_doctype3("%" ++ T, S0, DTD) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ?strip2,
    case expand_pe_reference(PERefName, S2, as_PE) of
        {system, _} = Name ->
            scan_doctype3(T2, fetch_DTD(Name, S2), DTD);
        {public, _} = Name ->
            scan_doctype3(T2, fetch_DTD(Name, S2), DTD);
        {public, _, _} = Name ->
            scan_doctype3(T2, fetch_DTD(Name, S2), DTD);
        ExpRef when is_list(ExpRef) -> % Space added, see Section 4.4.8
            {_, T3, S3} = strip(ExpRef ++ T2, S2),
            scan_doctype3(T3, S3, DTD)
    end;
scan_doctype3("]" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    S2 = fetch_DTD(DTD, S1),
    check_decl(S2),
    ">" ++ T2 = T1,
    {T2, S2};
scan_doctype3(T, S, DTD) ->
    {_, T1, S1} = scan_markup_decl(T, S),
    scan_doctype3(T1, S1, DTD).
1114
1115
1116
%% Fetch and parse the external DTD subset, if there is one.
%%   undefined - the document names no DTD. If the scanner's doctype_DTD
%%               field holds a list it is used as a system identifier,
%%               otherwise the scanner is returned unchanged.
%%   DTDSpec   - fetched and parsed through fetch_and_parse/3 as an
%%               external subset with a text declaration allowed.
%% Returns the scanner to continue with.
fetch_DTD(undefined, S = #xmerl_scanner{doctype_DTD = URI}) when is_list(URI) ->
    %% allow to specify DTD name when it isn't available in xml stream
    fetch_DTD({system, URI}, S);
fetch_DTD(undefined, S) ->
    S;
fetch_DTD(DTDSpec, S) ->
    Options = [{text_decl, true},
               {environment, {external, subset}}],
    case fetch_and_parse(DTDSpec, S, Options) of
        #xmerl_scanner{} = NewS ->
            NewS;
        {_Res, _Tail, _Sx} -> % Continue with old scanner data, result in Rules
            S
    end.
1130
%% Fetch an external resource through the scanner's fetch fun and, when
%% data was actually fetched, parse it with a scanner that inherits the
%% settings of the scanner returned by the fetch fun.
%%   ExtSpec  - external identifier handed to the fetch fun
%%   Options0 - options placed in front of the inherited ones
%% Results of the fetch fun:
%%   {ok, NewS}              - legacy form, NewS is used as the result
%%   {ok, not_fetched, NewS} - nothing fetched, NewS is the result
%%   {ok, {file, F}, NewS}   - parsed with int_file_decl/3
%%   {ok, {string, Str}, NewS} - parsed with int_string_decl/3
%%   {ok, Other, NewS}       - unknown scheme, {Other, [], NewS} is returned
%%   anything else           - fatal error
%% If the overall result is a scanner record, its text_decl flag is
%% cleared and the environment of the calling scanner is restored.
fetch_and_parse(ExtSpec,
                S = #xmerl_scanner{fetch_fun = Fetch,
                                   rules = Rules,
                                   xmlbase = XMLBase},
                Options0) ->
    Fetched =
        case Fetch(ExtSpec, S) of
            {ok, NewS} ->
                %% For backward compatibility only. This will be removed later!!
                NewS;
            {ok, not_fetched, NewS} ->
                NewS;
            {ok, DataRet, NewS = #xmerl_scanner{}} ->
                EventState = event_state(NewS),
                HookState = hook_state(NewS),
                FetchState = fetch_state(NewS),
                ContState = cont_state(NewS),
                Charset = NewS#xmerl_scanner.encoding,
                %% The order of the inherited options is kept as is.
                Inherited =
                    [{user_state, NewS#xmerl_scanner.user_state},
                     {rules, Rules},
                     {event_fun, NewS#xmerl_scanner.event_fun, EventState},
                     {hook_fun, NewS#xmerl_scanner.hook_fun, HookState},
                     {fetch_fun, NewS#xmerl_scanner.fetch_fun, FetchState},
                     {close_fun, NewS#xmerl_scanner.close_fun},
                     {continuation_fun,
                      NewS#xmerl_scanner.continuation_fun, ContState},
                     {rules,
                      NewS#xmerl_scanner.rules_read_fun,
                      NewS#xmerl_scanner.rules_write_fun, ""},
                     {acc_fun, NewS#xmerl_scanner.acc_fun},
                     {validation, NewS#xmerl_scanner.validation},
                     {quiet, NewS#xmerl_scanner.quiet},
                     {encoding, Charset}],
                Options = Options0 ++ Inherited,
                case DataRet of
                    {file, F} ->
                        int_file_decl(F, Options, Charset);
                    {string, String} ->
                        int_string_decl(String, Options, XMLBase);
                    _ ->
                        %% other scheme
                        {DataRet, [], NewS}
                end;
            Error ->
                ?fatal({error_fetching_DTD, {ExtSpec, Error}}, S)
        end,
    case Fetched of
        #xmerl_scanner{} ->
            Fetched#xmerl_scanner{text_decl = false,
                                  environment = S#xmerl_scanner.environment};
        _ ->
            Fetched
    end.
1190
1191
%% Fetch an external resource through the scanner's fetch fun and return
%% its content without parsing it.
%% Returns {Content, NewScanner}:
%%   {file, F}     - the file is read with get_file/2
%%   {string, Str} - a list is returned as is; anything else is converted
%%                   with binary_to_list/1
%%   other data    - returned unchanged
%% A `not_fetched' answer or any unexpected reply is a fatal error.
fetch_not_parse(ExtSpec, S = #xmerl_scanner{fetch_fun = Fetch}) ->
    case Fetch(ExtSpec, S) of
        {ok, not_fetched, _NewS} ->
            ?fatal({error_fetching_external_source,ExtSpec},S);
        {ok, DataRet, NewS} ->
            String =
                case DataRet of
                    {file, F} ->
                        get_file(F, S);
                    {string, Str} when is_list(Str) ->
                        %% fetch_and_parse/3 passes the payload of this
                        %% tag on as a string, so a list is accepted here
                        %% too instead of failing in binary_to_list/1.
                        %% NOTE(review): confirm against the fetch_fun
                        %% contract.
                        Str;
                    {string, Bin} ->
                        binary_to_list(Bin);
                    _ ->
                        DataRet
                end,
            {String, NewS};
        _ ->
            ?fatal({error_fetching_external_resource,ExtSpec},S)
    end.
1209
%% Read the whole file F and return its content as a list of bytes.
%% A failing read is reported as a fatal scanner error that carries the
%% file name and the value returned by file:read_file/1.
get_file(F, S) ->
    case file:read_file(F) of
        {ok, Contents} ->
            binary_to_list(Contents);
        Failure ->
            ?fatal({error_reading_file,F,Failure},S)
    end.
%% check_decl/1
%% XML types may be referenced before they are declared, so once the DTD
%% has been read it must be verified that everything referenced was
%% eventually declared. Nothing is checked when validation is off.
%% Notations are checked first, then elements (including their attribute
%% definitions), then entities.
check_decl(#xmerl_scanner{validation = false}) ->
    ok;
check_decl(S = #xmerl_scanner{rules = Rules}) ->
    check_notations(Rules, S),
    check_elements(Rules, S), %% check also attribute defs for element
    check_entities(Rules, S).
1228	    
%% Verify that no notation is still marked `undeclared' in the rules
%% table Tab. Returns ok when there is none. A single undeclared notation
%% is reported by name; any other match result is reported as a whole.
check_notations(Tab, S) ->
    case ets:match(Tab, {{notation, '$1'}, undeclared}) of
        [[]] ->
            ok;
        [] ->
            ok;
        [[Name | _]] ->
            ?fatal({error_missing_declaration_in_DTD,Name},S);
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err},S)
    end.
1238
%% Check every element definition stored in the rules table Tab.
%% The table is traversed in chunks of ten matches; each match is passed
%% to check_elements2/2. Returns ok.
check_elements(Tab, S) ->
    case ets:match(Tab, {{elem_def,'_'},'$2'}, 10) of
        {_, _} = Chunk ->
            check_elements_chunk(Chunk, S);
        '$end_of_table' ->
            ok;
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err},S)
    end.

%% Process one chunk returned by ets:match/1,3 and continue with the next
%% one until the table is exhausted.
check_elements_chunk('$end_of_table', _S) ->
    ok;
check_elements_chunk({Matches, Cont}, S) ->
    lists:foreach(fun(X) -> check_elements2(X, S) end, Matches),
    case Cont of
        '$end_of_table' ->
            ok;
        _ ->
            check_elements_chunk(ets:match(Cont), S)
    end.
1257
%% Check the attribute definitions of one matched table entry.
%% Declaring attributes for an element that is itself never declared is
%% not an error, so anything that is not a single #xmlElement{} is
%% silently accepted.
check_elements2(Match, S) ->
    case Match of
        [#xmlElement{attributes = Attrs}] ->
            check_attributes(Attrs, S);
        _ ->
            ok
    end.
1264
%% Run the validity checks on a list of attribute definitions, given as
%% 5-tuples whose second element is the attribute type.
%%   'ID'               - at most one ID attribute per element, then the
%%                        ID default check
%%   {enumeration, _}   - enumeration check
%%   'ENTITY'/'ENTITIES'- entity name check
%% All other definitions are skipped. Returns ok.
check_attributes([], _S) ->
    ok;
check_attributes([Attr | Rest], S) ->
    case Attr of
        {N1, 'ID', _, _, _} ->
            %% Only later definitions need to be searched: an earlier ID
            %% definition would already have reported the clash.
            case lists:keysearch('ID', 2, Rest) of
                {value, Att2} ->
                    ?fatal({error_more_than_one_ID_def,N1,element(1,Att2)},S);
                _ ->
                    ok
            end,
            vc_ID_Attribute_Default(Attr, S);
        {_, {enumeration, _}, _, _, _} ->
            vc_Enumeration(Attr, S);
        {_, Ent, _, _, _} when Ent == 'ENTITY'; Ent == 'ENTITIES' ->
            vc_Entity_Name(Attr, S);
        _ ->
            ok
    end,
    check_attributes(Rest, S).
1285
%% Verify that no entity is still marked `undeclared' in the rules table
%% Tab. The check is only made when validation is on; otherwise ok is
%% returned at once. A single undeclared entity is reported by name; any
%% other match result is reported as a whole.
check_entities(Tab, S = #xmerl_scanner{validation = true}) ->
    case ets:match(Tab, {{entity, '$1'}, undeclared}) of
        [[]] ->
            ok;
        [] ->
            ok;
        [[Name | _]] ->
            ?fatal({error_missing_declaration_in_DTD,Name},S);
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err},S)
    end;
check_entities(_, _) ->
    ok.
1297
1298
%% check_decl2/1
%% Verifies, via check_referenced_ids/2, that every ID referenced in the
%% rules table has been declared.
check_decl2(#xmerl_scanner{rules = Rules} = S) ->
    check_referenced_ids(Rules, S).
1302
1303
%% Verify that no ID is still marked `undeclared' in the rules table Tab.
%% Returns ok when there is none. A single undeclared ID is reported by
%% name; any other match result is reported as a whole.
check_referenced_ids(Tab, S) ->
    case ets:match(Tab, {{id, '$1'}, undeclared}) of
        [[]] ->
            ok;
        [] ->
            ok;
        [[Name | _]] ->
            ?fatal({error_missing_declaration_in_DTD,Name},S);
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err},S)
    end.
1313
1314%%%%%%% [30] extSubSet ::= TextDecl? extSubsetDecl
1315
%% Scan the declarations of an external subset until the input ends.
%%   '%'    parameter-entity reference between declarations
%%   '<!['  conditional section
%%   whitespace is skipped
%%   anything else is scanned as a markup declaration.
%% When the input is exhausted and the continuation fun has nothing more
%% to deliver, {[], Scanner} is returned.
scan_ext_subset([], S = #xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_ext_subset(MoreBytes, S1) end,
      fun(S1) -> {[], S1} end,
      S);
scan_ext_subset("%" ++ T, S0) ->
    %% DeclSep [28a]: WFC: PE Between Declarations.
    %% The replacement text of a parameter entity reference in a
    %% DeclSep must match the production extSubsetDecl.
    ?bump_col(1),
    {_, Rest, SAfterSep} = scan_decl_sep(T, S),
    scan_ext_subset(Rest, SAfterSep);
scan_ext_subset("<![" ++ T, S0) ->
    ?bump_col(3),
    ?strip1,
    {_, Rest, SAfterSect} = scan_conditional_sect(T1, S1),
    scan_ext_subset(Rest, SAfterSect);
scan_ext_subset([H | _] = T, S) when ?whitespace(H) ->
    {_, Rest, SStripped} = strip(T, S),
    scan_ext_subset(Rest, SStripped);
scan_ext_subset(T, S) ->
    {_, Rest, SAfterDecl} = scan_markup_decl(T, S),
    scan_ext_subset(Rest, SAfterDecl).
1339
1340
1341%%%%%%% [28a] DeclSep ::= PEReference | S
1342
%% Handle a parameter-entity reference used as a declaration separator.
%% The reference is expanded and its replacement text is scanned as
%% external-subset declarations; scanning then resumes (after whitespace)
%% with the input that followed the reference.
%% When the expansion is a tuple (an external identifier) the resource is
%% fetched unparsed, scanned as an entity value and stored under the
%% entity's name through the rules funs before being used.
%% Returns the result of strip/2 on the input following the reference.
scan_decl_sep(T, S = #xmerl_scanner{rules_read_fun = Read,
                                    rules_write_fun = Write,
                                    rules_delete_fun = Delete}) ->
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    {ExpandedRef, S2} =
        case expand_pe_reference(PERefName, S1, as_PE) of
            ExtSpec when is_tuple(ExtSpec) ->
                %% {system,URI} or {public,URI}
                {Fetched, _Sx} = fetch_not_parse(ExtSpec, S1),
                {EntV, _, SEnt} = scan_entity_value(Fetched, S1, no_delim,
                                                    PERefName, parameter),
                %% should do an update Write(parameter_entity) so next
                %% expand_pe_reference is faster
                Delete(parameter_entity, PERefName, SEnt),
                SWritten = Write(parameter_entity, PERefName, EntV, SEnt),
                EntV2 = Read(parameter_entity, PERefName, SWritten),
                {" " ++ EntV2 ++ " ", SWritten};
            Replacement ->
                {Replacement, S1}
        end,
    {_, T3, S3} = strip(ExpandedRef, S2),
    {_T4, S4} = scan_ext_subset(T3, S3),
    strip(T1, S4).
1366
1367%%%%%%% [61] ConditionalSect ::= includeSect | ignoreSect
1368
%% Scan a conditional section after '<![' and optional whitespace.
%%   IGNORE  - the section body is skipped by scan_ignore/2
%%   INCLUDE - the section body is scanned by scan_include/2
%%   '%'     - a parameter-entity reference whose replacement text is
%%             pushed back in front of the input before trying again
%% The keyword must be followed by '[' (optionally preceded by
%% whitespace); a missing '[' fails the match.
scan_conditional_sect([], S = #xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_conditional_sect(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_conditional_sect("IGNORE" ++ T, S0) ->
    ?bump_col(6),
    ?strip1,
    "[" ++ T2 = T1,
    {_, T3, S3} = strip(T2, S1),
    scan_ignore(T3, S3);
scan_conditional_sect("INCLUDE" ++ T, S0) ->
    ?bump_col(7),
    ?strip1,
    "[" ++ T2 = T1,
    {_, T3, S3} = strip(T2, S1),
    scan_include(T3, S3);
scan_conditional_sect("%" ++ T, S0) ->
    %% One column for the '%'. The statement used to appear twice; the
    %% second copy only re-matched the same value and was redundant.
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1, as_PE),
    {_, T2, S2} = strip(ExpRef ++ T1, S1),
    scan_conditional_sect(T2, S2).
1393
1394
1395%%%% [63] ignoreSect	 ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>'
1396%%%% [64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)*
1397%%%% [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)
%% Skip the body of an IGNORE section, starting at nesting level 0.
scan_ignore(Str, S) ->
    InitialLevel = 0,
    scan_ignore(Str, S, InitialLevel).
1400
%% Skip ignored content while tracking nested conditional sections.
%% Level counts the '<![' openers seen inside the ignored section; the
%% ']]>' found at level 0 ends it.
%% Returns {[], RestOfInput, Scanner}.
scan_ignore([], S = #xmerl_scanner{continuation_fun = F}, Level) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_ignore(MoreBytes, S1, Level) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_ignore("<![" ++ T, S0, Level) ->
    %% nested conditional section. Topmost condition is ignore, though
    ?bump_col(3),
    scan_ignore(T, S, Level + 1);
scan_ignore("]]>" ++ T, S0, Level) ->
    ?bump_col(3),
    case Level of
        0 ->
            {[], T, S};
        _ ->
            scan_ignore(T, S, Level - 1)
    end;
scan_ignore([_Skipped | T], S0, Level) ->
    ?bump_col(1),
    scan_ignore(T, S, Level).
1419
1420
1421%%%%%%% [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'
%% Scan the body of an INCLUDE section up to its closing ']]>'.
%%   '%'    a parameter-entity reference; its replacement text is pushed
%%          back in front of the input
%%   '<!['  a nested conditional section
%%   anything else is scanned as a markup declaration.
%% Returns {[], RestOfInput, Scanner}.
scan_include([], S = #xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_include(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_include("]]>" ++ T, S0) ->
    ?bump_col(3),
    {[], T, S};
scan_include("%" ++ T, S0) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    Replacement = expand_pe_reference(PERefName, S1, as_PE),
    {_, T2, S2} = strip(Replacement ++ T1, S1),
    scan_include(T2, S2);
scan_include("<![" ++ T, S0) ->
    ?bump_col(3),
    ?strip1,
    {_, T2, S2} = scan_conditional_sect(T1, S1),
    ?strip3,
    scan_include(T3, S3);
scan_include(T, S) ->
    {_, Rest, SAfterDecl} = scan_markup_decl(T, S),
    scan_include(Rest, SAfterDecl).
1445
1446
1447%%%%%%% [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | 
1448%%%%%%%                     NotationDecl | PI |Comment
1449%%%%%%% [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
1450
1451%% Validity constraint: Unique Type Declaration: No element type may be
1452%% declared more than once.
1453%%
1454scan_markup_decl([], S=#xmerl_scanner{continuation_fun = F}) ->
1455    ?dbg("cont()...~n", []),
1456    F(fun(MoreBytes, S1) -> scan_markup_decl(MoreBytes, S1) end,
1457      fun(S1) -> {[], [], S1} end,
1458      S);
1459scan_markup_decl("<!--" ++ T, S0) ->
1460    ?bump_col(4),
1461    {_, T1, S1} = scan_comment(T, S),
1462    ?strip2;
1463scan_markup_decl("<?" ++ T, S0) ->
1464    ?bump_col(2),
1465    {_PI, T1, S1} = scan_pi(T, S,_Pos=markup),
1466    ?strip2;
1467scan_markup_decl("<!ELEMENT" ++ T, 
1468		 #xmerl_scanner{rules_read_fun = Read,
1469				rules_write_fun = Write,
1470				rules_delete_fun = Delete} = S0) ->
1471    ?bump_col(9),
1472    {_,T1,S1} = mandatory_strip(T,S),
1473    {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
1474    Element  = 
1475	case Read(elem_def, Ename, S2) of
1476	    El = #xmlElement{elementdef=Decl} when Decl /= undeclared ->
1477		case S2#xmerl_scanner.validation of
1478		    true ->
1479			?fatal({already_defined, Ename}, S2);
1480		    _ ->
1481			Delete(elem_def,Ename,S2),
1482			El
1483		end;
1484	    El = #xmlElement{} ->
1485		Delete(elem_def,Ename,S2),
1486		El;
1487	    undefined ->
1488		#xmlElement{}
1489	

Large files files are truncated, but you can click here to view the full file