xmerl_scan.erl | searchcode

/lib/xmerl/src/xmerl_scan.erl

https://github.com/bmizerany/jungerl · Erlang · 3778 lines · 2842 code · 378 blank · 558 comment · 32 complexity · 573894cf37ee0fe26e2519ee216f684f MD5 · raw file
Large files are truncated click here to view the full file

%%% The contents of this file are subject to the Erlang Public License,
%%% Version 1.0, (the "License"); you may not use this file except in
%%% compliance with the License. You may obtain a copy of the License at
%%% http://www.erlang.org/license/EPL1_0.txt
%%%
%%% Software distributed under the License is distributed on an "AS IS"
%%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%%% the License for the specific language governing rights and limitations
%%% under the License.
%%%
%%% The Original Code is xmerl-0.15
%%%
%%% The Initial Developer of the Original Code is Ericsson Telecom
%%% AB. Portions created by Ericsson are Copyright (C), 1998, Ericsson
%%% Telecom AB. All Rights Reserved.
%%%
%%% Contributor(s):
%%%    Mickael Remond <mickael.remond@IDEALX.com>:
%%%    Johan Blom <johan.blom@mobilearts.se>
%%%    Richard Carlsson
%%%    Fredrik Linder
%%%
%%%----------------------------------------------------------------------
%%% #0.    BASIC INFORMATION
%%%----------------------------------------------------------------------
%%% File:       xmerl_scan.erl
%%% Author       : Ulf Wiger <ulf.wiger@ericsson.com>
%%% Description  : Single-pass XML scanner. See xmerl.hrl for data defs.
%%% 
%%% Modules used : ets, file, filename, io, lists, ucs, uri
%%% 
%%%----------------------------------------------------------------------
%% @doc 
%%     The XML parser is activated through 
%%     <tt>xmerl_scan:string/[1,2]</tt> or 
%%     <tt>xmerl_scan:file/[1,2]</tt>.
%%     It returns records of the type defined in xmerl.hrl.
%% See also <a href="xmerl_examples.html">tutorial</a> on customization
%% functions.
%% @type global_state(). <p>
%% The global state of the scanner, represented by the #xmerl_scanner{} record.
%% </p>
%% @type option_list(). <p>Options allow the behaviour of the scanner
%%     to be customized.
%% See also <a href="xmerl_examples.html">tutorial</a> on customization
%% functions.
%% </p>
%% Possible options are:
%% <dl>
%%  <dt><code>{acc_fun, Fun}</code></dt>
%%    <dd>Call back function to accumulate contents of entity.</dd>
%%  <dt><code>{continuation_fun, Fun} |
%%            {continuation_fun, Fun, ContinuationState}</code></dt>
%%    <dd>Call back function to decide what to do if the scanner runs into eof
%%     before the document is complete.</dd>
%%  <dt><code>{event_fun, Fun} |
%%            {event_fun, Fun, EventState}</code></dt>
%%    <dd>Call back function to handle scanner events.</dd>
%%  <dt><code>{fetch_fun, Fun} |
%%            {fetch_fun, Fun, FetchState}</code></dt>
%%    <dd>Call back function to fetch an external resource.</dd>
%%  <dt><code>{hook_fun, Fun} |
%%            {hook_fun, Fun, HookState}</code></dt>
%%    <dd>Call back function to process the document entities once
%%     identified.</dd>
%%  <dt><code>{close_fun, Fun}</code></dt>
%%    <dd>Called when document has been completely parsed.</dd>
%%  <dt><code>{rules, ReadFun, WriteFun, RulesState} |
%%            {rules, Rules}</code></dt>
%%    <dd>Handles storing of scanner information when parsing.</dd>
%%  <dt><code>{user_state, UserState}</code></dt>
%%    <dd>Global state variable accessible from all customization functions</dd>
%%
%%  <dt><code>{fetch_path, PathList}</code></dt>
%%    <dd>PathList is a list of
%%     directories to search when fetching files. If the file in question
%%     is not in the fetch_path, the URI will be used as a file
%%     name.</dd>
%%  <dt><code>{space, Flag}</code></dt>
%%    <dd>'preserve' (default) to preserve spaces, 'normalize' to
%%    accumulate consecutive whitespace and replace it with one space.</dd>
%%  <dt><code>{line, Line}</code></dt>
%%    <dd>To specify starting line for scanning in document which contains
%%    fragments of XML.</dd>
%%  <dt><code>{namespace_conformant, Flag}</code></dt>
%%    <dd>Controls whether to behave as a namespace conformant XML parser,
%%    'false' (default) to not otherwise 'true'.</dd>
%%  <dt><code>{validation, Flag}</code></dt>
%%    <dd>Controls whether to process as a validating XML parser,
%%    'false' (default) to not otherwise 'true'.</dd>
%%  <dt><code>{quiet, Flag}</code></dt>
%%    <dd>Set to 'true' if xmerl should behave quietly and not output any info
%%    to standard output (default 'false').</dd>
%%  <dt><code>{doctype_DTD, DTD}</code></dt>
%%    <dd>Specifies the DTD name to use when it isn't available in the XML
%%    document.</dd>
%%  <dt><code>{xmlbase, Dir}</code></dt>
%%    <dd>XML Base directory. If using string/1 default is current directory.
%%    If using file/1 default is directory of given file.</dd>
%%  <dt><code>{encoding, Enc}</code></dt>
%%    <dd>Set default character set used (default UTF-8).
%%    This character set is used only if not explicitly given by the XML
%%    declaration. </dd>
%% </dl>
%% @end 
%% Only used internally are:
%%  <dt><code>{environment,Env}</code></dt>
%%   <dd>What is this?</dd>
%% <dt><code>{text_decl,Bool}</code></dt>
%%   <dd>What is this?</dd>

-module(xmerl_scan).
-vsn('0.19').
-date('03-09-16').


%% main API
-export([string/1, string/2,
	 file/1, file/2]).

%% access functions for various states
-export([user_state/1, user_state/2,
	 event_state/1, event_state/2,
	 hook_state/1, hook_state/2,
	 rules_state/1, rules_state/2,
	 fetch_state/1, fetch_state/2,
	 cont_state/1, cont_state/2]).

%% helper functions. To xmerl_lib ??
-export([accumulate_whitespace/4]).

%-define(debug, 1).
-include("xmerl.hrl").		% record def, macros
-include_lib("kernel/include/file.hrl").


%% Report a fatal scanner error.
%% Unless the scanner state S has its `quiet' field set, the source line
%% of the macro call (?LINE) and Reason are printed with io:format/2.
%% In either case fatal/2 (defined outside the visible part of this file)
%% is then called with Reason and S, and its result is the macro's value.
-define(fatal(Reason, S),
	if
	    S#xmerl_scanner.quiet ->
		ok;
	    true ->
		ok=io:format("~p- fatal: ~p~n", [?LINE, Reason])
	end,
	fatal(Reason, S)).


%% Expands to a copy of the scanner state S whose user_state field is U.
-define(ustate(U, S), S#xmerl_scanner{user_state = U}).


%% Functions to access the various states

%%% @spec user_state(S::global_state()) -> term()
%%% @doc Returns the user state stored in the scanner's global state
%%% (the <code>user_state</code> field of the #xmerl_scanner{} record).
user_state(#xmerl_scanner{user_state = UserState}) ->
    UserState.

%%% @spec event_state(S::global_state()) -> term()
%%% @doc Returns the state kept for the event function (the
%%% <code>event</code> field of the scanner's #xmerl_fun_states{} record).
event_state(#xmerl_scanner{
               fun_states = #xmerl_fun_states{event = EventState}
              }) ->
    EventState.

%%% @spec hook_state(S::global_state()) -> term()
%%% @doc Returns the state kept for the hook function (the
%%% <code>hook</code> field of the scanner's #xmerl_fun_states{} record).
hook_state(#xmerl_scanner{
              fun_states = #xmerl_fun_states{hook = HookState}
             }) ->
    HookState.

%%% @spec rules_state(S::global_state()) -> term()
%%% @doc Returns the state kept for the rules functions (the
%%% <code>rules</code> field of the scanner's #xmerl_fun_states{} record).
rules_state(#xmerl_scanner{
               fun_states = #xmerl_fun_states{rules = RulesState}
              }) ->
    RulesState.

%%% @spec fetch_state(S::global_state()) -> term()
%%% @doc Returns the state kept for the fetch function (the
%%% <code>fetch</code> field of the scanner's #xmerl_fun_states{} record).
fetch_state(#xmerl_scanner{
               fun_states = #xmerl_fun_states{fetch = FetchState}
              }) ->
    FetchState.

%%% @spec cont_state(S::global_state()) -> term()
%%% @doc Returns the state kept for the continuation function (the
%%% <code>cont</code> field of the scanner's #xmerl_fun_states{} record).
cont_state(#xmerl_scanner{
              fun_states = #xmerl_fun_states{cont = ContState}
             }) ->
    ContState.


%%%% Functions to modify the various states

%%% @spec user_state(UserState, S::global_state()) -> global_state()
%%% @doc Stores UserState in the global state and returns the updated
%%% global state. Intended to be called from a user (customization) function.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
user_state(UserState, Scanner) ->
    Scanner#xmerl_scanner{user_state = UserState}.

%%% @spec event_state(EventState, S::global_state()) -> global_state()
%%% @doc Stores EventState as the event function's state and returns the
%%% updated global state. To be used in an event function, which is called
%%% at the beginning and at the end of a parsed entity.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
event_state(EventState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{event = EventState}
     }.

%%% @spec hook_state(HookState, S::global_state()) -> global_state()
%%% @doc Stores HookState as the hook function's state and returns the
%%% updated global state. To be used in a hook function, which is called
%%% when the parser has parsed a complete entity.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
hook_state(HookState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{hook = HookState}
     }.

%%% @spec rules_state(RulesState, S::global_state()) -> global_state()
%%% @doc Stores RulesState as the rules functions' state and returns the
%%% updated global state. To be used in a rules function, which is called
%%% when the parser stores scanner information in a rules database.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
rules_state(RulesState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{rules = RulesState}
     }.

%%% @spec fetch_state(FetchState, S::global_state()) -> global_state()
%%% @doc Stores FetchState as the fetch function's state and returns the
%%% updated global state. To be used in a fetch function, which is called
%%% when the parser fetches an external resource (e.g. a DTD).
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
fetch_state(FetchState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{fetch = FetchState}
     }.

%%% @spec cont_state(ContinuationState, S::global_state()) -> global_state()
%%% @doc Stores ContinuationState as the continuation function's state and
%%% returns the updated global state. To be used in a continuation function,
%%% which is called when the parser encounters the end of the byte stream.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
cont_state(ContState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{cont = ContState}
     }.


%% @spec file(Filename::string()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @equiv file(Filename, [])
file(Filename) ->
    NoOptions = [],
    file(Filename, NoOptions).

%% @spec file(Filename::string(), Options::option_list()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @doc Parse file containing an XML document.
%% On success the close function is called with the final scanner state
%% (for its side effects only) and <code>{Element, Rest}</code> is returned.
%% Any other result of the scan is returned as <code>{error, Reason}</code>.
file(Filename, Options) ->
    %% The value of an explicit {encoding, Enc} option, if any, is handed
    %% to int_file/3 as the external character set.
    ExternalCharset =
        case lists:keysearch(encoding, 1, Options) of
            false ->
                undefined;
            {value, {_, Enc}} ->
                Enc
        end,
    case int_file(Filename, Options, ExternalCharset) of
        {Document, Rest, #xmerl_scanner{close_fun = CloseFun} = FinalState} ->
            %% Called for side effects only; the returned state is dropped.
            CloseFun(FinalState),
            {Document, Rest};
        {error, _Reason} = Error ->
            Error;
        Unexpected ->
            {error, Unexpected}
    end.

%% Read the whole file and scan its contents as an XML document, using the
%% file's directory as the XML base. The third argument is ignored here.
%% A failed read is returned unchanged (whatever file:read_file/1 gave).
int_file(Filename, Options, _ExtCharset) ->
    case file:read_file(Filename) of
        {ok, Contents} ->
            Text = binary_to_list(Contents),
            BaseDir = filename:dirname(Filename),
            int_string(Text, Options, BaseDir);
        ReadError ->
            ReadError
    end.

%% Like int_file/3, but hands the file contents to int_string_decl/3
%% instead of int_string/3. The third argument is ignored here.
%% A failed read is returned unchanged (whatever file:read_file/1 gave).
int_file_decl(Filename, Options, _ExtCharset) ->
    case file:read_file(Filename) of
        {ok, Contents} ->
            Text = binary_to_list(Contents),
            BaseDir = filename:dirname(Filename),
            int_string_decl(Text, Options, BaseDir);
        ReadError ->
            ReadError
    end.

%% @spec string(Text::list()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @equiv string(Text, [])
string(Text) ->
    NoOptions = [],
    string(Text, NoOptions).

%% @spec string(Text::list(),Options::option_list()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @doc Parse string containing an XML document.
%% On success the close function is called with the final scanner state
%% (for its side effects only) and <code>{Element, Rest}</code> is returned.
%% Any other result of the scan is returned as <code>{error, Reason}</code>.
string(Text, Options) ->
    case int_string(Text, Options) of
        {Document, Rest, #xmerl_scanner{close_fun = CloseFun} = FinalState} ->
            %% Called for side effects only; the returned state is dropped.
            CloseFun(FinalState),
            {Document, Rest};
        {error, _Reason} = Error ->
            %% (This can't happen, currently)
            Error;
        Unexpected ->
            {error, Unexpected}
    end.

%% Scan Str using the current working directory as the XML base.
%% Crashes with badmatch if the working directory cannot be determined.
int_string(Str, Options) ->
    {ok, Cwd} = file:get_cwd(),
    int_string(Str, Options, Cwd).

%% Build the initial scanner state from Options and XMLBase, let
%% xmerl_lib:detect_charset/2 inspect the input, record the detected
%% character set (as a string) in the state, and scan the document.
%% Any other result shape from detect_charset/2 raises case_clause.
int_string(Str, Options, XMLBase) ->
    Scanner = initial_state0(Options, XMLBase),
    case xmerl_lib:detect_charset(Scanner#xmerl_scanner.encoding, Str) of
        {undefined, undefined, Rest} ->
            scan_document(Rest, Scanner);
        {auto, 'iso-10646-utf-1', Rest} ->
            scan_document(Rest,
                          Scanner#xmerl_scanner{encoding = "iso-10646-utf-1"});
        {external, Charset, Rest} ->
            %% Also covers 'iso-10646-utf-1', whose atom_to_list/1 result
            %% is the same string as the literal used for the auto case.
            scan_document(Rest,
                          Scanner#xmerl_scanner{encoding = atom_to_list(Charset)})
    end.

%% Same charset handling as int_string/3, but the input is passed to
%% scan_decl/2 instead of scan_document/2.
%% Any other result shape from detect_charset/2 raises case_clause.
int_string_decl(Str, Options, XMLBase) ->
    Scanner = initial_state0(Options, XMLBase),
    case xmerl_lib:detect_charset(Scanner#xmerl_scanner.encoding, Str) of
        {undefined, undefined, Rest} ->
            scan_decl(Rest, Scanner);
        {auto, 'iso-10646-utf-1', Rest} ->
            scan_decl(Rest,
                      Scanner#xmerl_scanner{encoding = "iso-10646-utf-1"});
        {external, Charset, Rest} ->
            %% Also covers 'iso-10646-utf-1', whose atom_to_list/1 result
            %% is the same string as the literal used for the auto case.
            scan_decl(Rest,
                      Scanner#xmerl_scanner{encoding = atom_to_list(Charset)})
    end.
    


%% Create the scanner state: start from a record whose callback fields
%% point at this module's default functions and whose xmlbase is XMLBase,
%% then apply the user supplied Options on top of it.
initial_state0(Options, XMLBase) ->
    Defaults = #xmerl_scanner{
                  xmlbase = XMLBase,
                  event_fun = fun event/2,
                  hook_fun = fun hook/2,
                  acc_fun = fun acc/3,
                  fetch_fun = fun fetch/2,
                  close_fun = fun close/1,
                  continuation_fun = fun cont/3,
                  rules_read_fun = fun rules_read/3,
                  rules_write_fun = fun rules_write/4,
                  rules_delete_fun = fun rules_delete/3
                 },
    initial_state(Options, Defaults).

%% Apply the option list to the scanner state, one option at a time, in
%% list order (a later option overrides an earlier one for the same field).
%% An option that matches none of the clauses raises function_clause.
%% When the list is exhausted and no rules table has been supplied, a new
%% public ETS table of type set is created and stored in the state.
initial_state([], St = #xmerl_scanner{rules = undefined}) ->
    RulesTab = ets:new(rules, [set, public]),
    St#xmerl_scanner{rules = RulesTab};
initial_state([], St) ->
    St;
%% Callback functions, optionally together with their private state.
initial_state([{event_fun, Fun} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{event_fun = Fun});
initial_state([{event_fun, Fun, EventState} | Opts], St) ->
    initial_state(Opts,
                  event_state(EventState, St#xmerl_scanner{event_fun = Fun}));
initial_state([{acc_fun, Fun} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{acc_fun = Fun});
initial_state([{hook_fun, Fun} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{hook_fun = Fun});
initial_state([{hook_fun, Fun, HookState} | Opts], St) ->
    initial_state(Opts,
                  hook_state(HookState, St#xmerl_scanner{hook_fun = Fun}));
initial_state([{close_fun, Fun} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{close_fun = Fun});
initial_state([{fetch_fun, Fun} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{fetch_fun = Fun});
initial_state([{fetch_fun, Fun, FetchState} | Opts], St) ->
    initial_state(Opts,
                  fetch_state(FetchState, St#xmerl_scanner{fetch_fun = Fun}));
initial_state([{fetch_path, Path} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{fetch_path = Path});
initial_state([{continuation_fun, Fun} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{continuation_fun = Fun});
initial_state([{continuation_fun, Fun, ContState} | Opts], St) ->
    initial_state(Opts,
                  cont_state(ContState,
                             St#xmerl_scanner{continuation_fun = Fun}));
%% Rules storage: either a ready-made table or custom read/write functions.
%% In both cases the rules are kept when scanning is finished.
initial_state([{rules, Rules} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{rules = Rules, keep_rules = true});
initial_state([{rules, ReadFun, WriteFun, RulesState} | Opts], St) ->
    WithFuns = St#xmerl_scanner{rules_read_fun = ReadFun,
                                rules_write_fun = WriteFun,
                                keep_rules = true},
    initial_state(Opts, rules_state(RulesState, WithFuns));
%% Plain values.
initial_state([{user_state, UserState} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{user_state = UserState});
initial_state([{space, Mode} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{space = Mode});
initial_state([{line, Line} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{line = Line});
%% Boolean flags: any value other than true/false is rejected.
initial_state([{namespace_conformant, Flag} | Opts], St)
  when is_boolean(Flag) ->
    initial_state(Opts, St#xmerl_scanner{namespace_conformant = Flag});
initial_state([{validation, Flag} | Opts], St) when is_boolean(Flag) ->
    initial_state(Opts, St#xmerl_scanner{validation = Flag});
initial_state([{quiet, Flag} | Opts], St) when is_boolean(Flag) ->
    initial_state(Opts, St#xmerl_scanner{quiet = Flag});
initial_state([{doctype_DTD, DTD} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{doctype_DTD = DTD});
initial_state([{text_decl, Bool} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{text_decl = Bool});
initial_state([{environment, Env} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{environment = Env});
initial_state([{xmlbase, Dir} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{xmlbase = Dir});
initial_state([{encoding, Enc} | Opts], St) ->
    initial_state(Opts, St#xmerl_scanner{encoding = Enc}).


%%% -----------------------------------------------------
%%% Default modifier functions

%%% Default hook function: returns the object and the state unchanged.
%%% Hooks:
%%% - {element, Line, Name, Attrs, Content}
%%% - {processing_instruction, Line, Data}

hook(Object, Scanner) ->
    {Object, Scanner}.

%%% Default event function: ignores the event and returns the state
%%% unchanged.
%%%
%%% Events:
%%%
%%% #xmerl_event{event : started | ended,
%%%              line  : integer(),
%%%              col   : integer(),
%%%              data}
%%%
%%% Data                Events
%%% document            started, ended
%%% #xmlElement         started, ended
%%% #xmlAttribute       ended
%%% #xmlPI              ended
%%% #xmlComment         ended
%%% #xmlText            ended
event(_Event, Scanner) ->
    Scanner.

%% The acc/3 function can return either {Acc', S'} or {Acc', Pos', S'},
%% where Pos' can be derived from X#xmlElement.pos, X#xmlText.pos, or
%% X#xmlAttribute.pos (whichever is the current object type.)
%% The acc/3 function is not allowed to redefine the type of object
%% being defined, but _is_ allowed to either ignore it or split it
%% into multiple objects (in which case {Acc',Pos',S'} should be returned.)
%% If {Acc',S'} is returned, Pos will be incremented by 1 by default.
%%
%% This default implementation prepends the object to the accumulator;
%% for text objects the value is flattened first.
acc(#xmlText{value = Text} = TextNode, Acc, S) ->
    FlatText = lists:flatten(Text),
    {[TextNode#xmlText{value = FlatText} | Acc], S};
acc(Object, Acc, S) ->
    {[Object | Acc], S}.

%% Default fetch function: resolves the system identifier of an external
%% resource with fetch_URI/2. The public identifier, if any, is ignored.
fetch({system, SystemURI}, S) ->
    fetch_URI(SystemURI, S);
fetch({public, _PublicID, SystemURI}, S) ->
    fetch_URI(SystemURI, S).

%%% Always assume an external resource can be found locally! Thus
%%% don't bother fetching with e.g. HTTP. Returns the path where the
%%% resource is found.  The path to the external resource is given by
%%% URI directly or the option fetch_path (additional paths) or
%%% directory (base path to external resource)
%%%
%%% The result is {ok, Path, S}, where Path is whatever path_locate/3
%%% returns for the fetch_path, the last component of URI and the
%%% candidate full name computed below.
fetch_URI(URI, S) ->
    %% assume URI is a filename
    Parts = filename:split(URI),
    Basename =
        case Parts of
            [] -> [];
            _ -> lists:last(Parts)
        end,
    %% how about Windows systems?
    Candidate =
        case Parts of
            ["file:" | Name] ->
                %% absolute path, see RFC2396 sect 3 (file:/dtd_name)
                filename:join(["/" | Name]);
            ["/" | Rest] when Rest /= [] ->
                %% absolute path name
                URI;
            ["http:" | _] ->
                {http, URI};
            [] ->
                %% empty systemliteral
                [];
            _ ->
                %% anything else is taken relative to the XML base
                filename:join(S#xmerl_scanner.xmlbase, URI)
        end,
    Path = path_locate(S#xmerl_scanner.fetch_path, Basename, Candidate),
    ?dbg("fetch(~p) -> {file, ~p}.~n", [URI, Path]),
    {ok, Path, S}.

%% Decide where a resource lives.
%% - An {http, URI} candidate is returned as is.
%% - An empty candidate yields [].
%% - Otherwise the directories are tried in order; the first one that
%%   contains a regular file called Name wins and {file, Path} is returned.
%% - If no directory has such a file, {file, Fallback} is returned.
path_locate(_Dirs, _Name, {http, _} = Remote) ->
    Remote;
path_locate(_Dirs, _Name, []) ->
    [];
path_locate([], _Name, Fallback) ->
    {file, Fallback};
path_locate([Dir | MoreDirs], Name, Fallback) ->
    Candidate = filename:join(Dir, Name),
    case file:read_file_info(Candidate) of
        {ok, #file_info{type = regular}} ->
            {file, Candidate};
        _NotARegularFile ->
            path_locate(MoreDirs, Name, Fallback)
    end.


%% Default continuation function: there are never more bytes to fetch, so
%% the "continue" fun is ignored and the exception fun is applied to the
%% state straight away.
cont(_Continue, OnEof, State) ->
    OnEof(State).

%% Default close function: returns the final scanner state unchanged.
close(FinalState) ->
    FinalState.


%%% -----------------------------------------------------
%%% Scanner

%%% [1] document ::= prolog element Misc*
%%%
%%% Scans a complete document and returns {Res, Tail, S6}: the result of
%%% scan_element/3, the input left after the trailing Misc*, and the final
%%% scanner state.
%%% The event function is called with a 'started' document event before
%%% scanning and with an 'ended' document event afterwards.
%%% Crashes with badmatch if the input left by scan_prolog/3 does not begin
%%% with "<".
scan_document(Str0, S=#xmerl_scanner{event_fun = Event,
				     line = L, col = C,
				     environment=Env,
				     encoding=Charset,
				     validation=ValidateResult}) ->
    S1 = Event(#xmerl_event{event = started,
			    line = L,
			    col = C,
			    data = document}, S),
    
    %% Transform to given character set.
    %% Note that if another character set is given in the encoding 
    %% attribute in a XML declaration that one will be used later
    %% NOTE(review): Charset is turned into an atom with list_to_atom/1;
    %% confirm that it cannot be an arbitrary attacker-chosen string here.
    Str=if
	    Charset=/=undefined -> % Default character set is UTF-8
		ucs:to_unicode(Str0,list_to_atom(Charset));
	    true ->
		Str0
	end,

    %% prolog, then the root element (its "<" already consumed), then Misc*
    {"<"++T2, S2} = scan_prolog(Str, S1, _StartPos = 1),
    {Res, T3, S3} =scan_element(T2,S2,_StartPos = 1),
    {Tail, S4}=scan_misc(T3, S3, _StartPos = 1),
    
    S5 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
					       line = S4#xmerl_scanner.line,
					       col = S4#xmerl_scanner.col,
					       data = document}, S4),
    
    %% Validation runs only when requested and when the environment is
    %% element or prolog. In every branch cleanup/1 is applied to the state;
    %% every error shape returned by the validator ends in ?fatal with
    %% {failed_validation, Reason}.
    S6 = case ValidateResult of
	     false ->
		 cleanup(S5);
	     true when Env == element; Env == prolog ->
		 check_decl2(S5),
		 case xmerl_validate:validate(S5,Res) of
		     {'EXIT',{error,Reason}} ->
			 S5b=cleanup(S5),
			 ?fatal({failed_validation,Reason}, S5b);
		     {'EXIT',Reason} ->
			 S5b=cleanup(S5),
			 ?fatal({failed_validation,Reason}, S5b);
		     {error,Reason} ->
			 S5b=cleanup(S5),
			 ?fatal({failed_validation,Reason}, S5b);
		     {error,Reason,_Next} ->
			 S5b=cleanup(S5),
			 ?fatal({failed_validation,Reason}, S5b);
		     _XML ->
			 cleanup(S5)
		 end;
	     true ->
		 cleanup(S5)
	 end,

    {Res, Tail, S6}.


%% Scan the prolog of Str and report what follows it.
%% The event function is first called with a 'started' document event.
%% - If markup ("<"...) follows, the result is
%%   {{UserState, Rest}, [], State}.
%% - If nothing follows, the result is {[], [], State}.
%% - Otherwise the remaining input is scanned as content (for the sake of
%%   the resulting state) and {Rest, [], State} is returned.
scan_decl(Str, State = #xmerl_scanner{event_fun = Event,
                                      line = Line,
                                      col = Col}) ->
    Started = #xmerl_event{event = started,
                           line = Line,
                           col = Col,
                           data = document},
    State1 = Event(Started, State),
    case scan_prolog(Str, State1, 1) of
        {[$< | _] = Rest, State2} ->
            {{State2#xmerl_scanner.user_state, Rest}, [], State2};
        {[], State2} ->
            {[], [], State2};
        {Rest, State2} ->
            Space = State2#xmerl_scanner.space,
            {_, _, State3} = scan_content(Rest, State2, [], [], Space,
                                          [], [], #xmlNamespace{}),
            {Rest, [], State3}
    end.


%%% [22] Prolog
%%% prolog    ::=    XMLDecl? Misc* (doctypedecl Misc*)?
%%%
%%% Returns {Rest, S}. The macros ?dbg, ?bump_col and ?whitespace come from
%%% xmerl.hrl, which is not visible here; in the clauses that take the state
%%% as S0, the variable S is presumably bound by ?bump_col (S0 with the
%%% column advanced) -- confirm against xmerl.hrl.
%%%
%% Text declaration may be empty
scan_prolog([], S=#xmerl_scanner{text_decl=true},_Pos) ->
    {[],S};
%% Out of input: ask the continuation function for more bytes; if there are
%% none, the prolog ends here with an empty rest.
scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos) end,
      fun(S1) -> {[], S1} end,
      S);
%% "<?xml" followed by whitespace: an XML declaration, or a text declaration
%% when the text_decl flag is set. Only accepted at line 1, column 1.
scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos)
  when ?whitespace(hd(T)) ->
    {Charset,T3, S3}=
    if
	Col==1,L==1,S0#xmerl_scanner.text_decl==true -> 
	    ?dbg("prolog(\"<?xml\")~n", []),
	    ?bump_col(5),
	    {_,T1,S1} = mandatory_strip(T,S0),
	    {Decl,T2, S2}=scan_text_decl(T1,S1),
	    Encoding=Decl#xmlDecl.encoding,
	    {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}};
	Col==1,L==1 -> 
	    ?dbg("prolog(\"<?xml\")~n", []),
	    ?bump_col(5),
	    {Decl,T2, S2}=scan_xml_decl(T, S0),
	    Encoding=Decl#xmlDecl.encoding,
	    {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}};
	true ->
	    ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S0)
    end,
    %% Now transform to declared character set.
    if
	Charset==Charset0 -> % Document already transformed to this charset!
	    scan_prolog(T3, S3, Pos);
	%% NOTE(review): this branch is a character set mismatch, yet it
	%% reports xml_declaration_must_be_first_in_doc -- confirm intended.
	Charset0=/=undefined -> % Document transformed to other bad charset!
	    ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S3);
	Charset=/=undefined -> % Document not previously transformed
	    T4=ucs:to_unicode(T3,list_to_atom(Charset)),
	    scan_prolog(T4, S3, Pos);
	true -> % No encoding info given
	    scan_prolog(T3, S3, Pos)
    end;
%% Document type declaration, only recognised in the prolog environment.
scan_prolog("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog,
						encoding=Charset}, Pos) ->
    ?dbg("prolog(\"<!DOCTYPE\")~n", []),
    ?bump_col(9),
    %% If no known character set assume it is UTF-8
    T1=if
	   Charset==undefined -> ucs:to_unicode(T,'utf-8');
	   true -> T
       end,
    {T2, S1} = scan_doctype(T1, S),
    scan_misc(T2, S1, Pos);
%% Anything else: skip Misc* and continue with scan_prolog2/3.
scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=Charset},Pos) ->
    ?dbg("prolog(\"<\")~n", []),
    
    %% Check for Comments, PI before possible DOCTYPE declaration
    ?bump_col(1),
    %% If no known character set assume it is UTF-8
    T=if
	  Charset==undefined -> ucs:to_unicode(Str,'utf-8');
	  true -> Str
      end,
    {T1, S1}=scan_misc(T, S, Pos),
    scan_prolog2(T1,S1,Pos).



%% Second half of the prolog, entered after the leading Misc* has been
%% skipped. Returns {Rest, S}.
%% In the clauses that take the state as S0, the variable S is presumably
%% bound by ?bump_col (defined in xmerl.hrl, not visible here) -- confirm.

%% Out of input: ask the continuation function for more bytes; if there are
%% none, the prolog ends here with an empty rest.
scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos) end,
      fun(S1) -> {[], S1} end,
      S);
%% Document type declaration (prolog environment only), then Misc*.
scan_prolog2("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog}, Pos) ->
    ?dbg("prolog(\"<!DOCTYPE\")~n", []),
    ?bump_col(9),
    {T1, S1} = scan_doctype(T, S),
    scan_misc(T1, S1, Pos);
scan_prolog2(Str = "<!" ++ _, S, _Pos) ->
    ?dbg("prolog(\"<!\")~n", []),
    %% In e.g. a DTD, we jump directly to markup declarations
    scan_ext_subset(Str, S);
scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos) ->
    ?dbg("prolog(\"<\")~n", []),
    
    %% Check for more Comments and PI after DOCTYPE declaration
    ?bump_col(1),
    scan_misc(Str, S, Pos).




%%% [27] Misc ::=   	Comment | PI | S
%% Note:
%% - Neither of Comment and PI are returned in the resulting parsed
%%   structure.
%% - scan_misc/3 implements Misc* as that is how the rule is always used
%% - Returns {Rest, S}, where Rest starts at the first character that is
%%   not part of a comment, a PI or whitespace.
%% - The whitespace clause passes the state on unchanged, i.e. line and
%%   column are not updated for whitespace skipped here.
scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos) end,
      fun(S1) -> {[], S1} end,
      S);
scan_misc("<!--" ++ T, S, Pos) -> % Comment
    {_, T1, S1} = scan_comment(T, S, Pos, _Parents = [], _Lang = []),
    scan_misc(T1,S1,Pos);
%% S below is presumably bound by ?bump_col (xmerl.hrl, not visible here).
scan_misc("<?" ++ T, S0, Pos) -> % PI
    ?dbg("prolog(\"<?\")~n", []),
    ?bump_col(2),
    {_PI, T1, S1} = scan_pi(T, S, Pos),
    scan_misc(T1,S1,Pos);
scan_misc([H|T], S, Pos) when ?whitespace(H) ->
    ?dbg("prolog(whitespace)~n", []),
    scan_misc(T,S,Pos);
scan_misc(T,S,_Pos) ->
    {T,S}.


%% Release the rules table unless the rules are to be kept: when keep_rules
%% is false the ETS table is deleted and the rules field reset to undefined;
%% otherwise the state is returned unchanged.
cleanup(#xmerl_scanner{keep_rules = false, rules = RulesTab} = Scanner) ->
    ets:delete(RulesTab),
    Scanner#xmerl_scanner{rules = undefined};
cleanup(Scanner) ->
    Scanner.

%%% Prolog and Document Type Declaration XML 1.0 Section 2.8
%% [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
%% [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
%%
%% Scans the mandatory VersionInfo [24] that follows "<?xml" and hands the
%% rest of the declaration to scan_xml_decl/3 together with an #xmlDecl{}
%% that so far holds only the version attribute.
%% Crashes with badmatch if "version" does not follow the whitespace.
scan_xml_decl(T, S) ->
    {_, AfterSpace, S1} = mandatory_strip(T, S),
    "version" ++ AfterKeyword = AfterSpace,
    {AfterEq, S2} = scan_eq(AfterKeyword, S1),
    {Version, Rest, S3} = scan_xml_vsn(AfterEq, S2),
    VersionAttr = #xmlAttribute{name = version,
                                parents = [{xml, 1}],
                                value = Version},
    scan_xml_decl(Rest, S3, #xmlDecl{attributes = [VersionAttr]}).

%% Continue an XML declaration after VersionInfo. Decl is the #xmlDecl{}
%% collected so far.
%% - On end of input the continuation function is consulted; if no more
%%   bytes are available the result is {[], [], S}.
%% - "?>" ends the declaration (see return_xml_decl/3).
%% - Otherwise whitespace is mandatory before the next part, which is
%%   handled by scan_xml_decl2/3.
scan_xml_decl([], S=#xmerl_scanner{continuation_fun = F}, Decl) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_xml_decl(MoreBytes, S1, Decl) end,
      fun(S1) -> {[], [], S1} end,
      S);
%% S below is presumably bound by ?bump_col (xmerl.hrl, not visible here).
scan_xml_decl("?>" ++ T, S0, Decl) ->
    ?bump_col(2),
    return_xml_decl(T,S,Decl);
scan_xml_decl(T,S=#xmerl_scanner{event_fun = _Event},Decl) ->
    {_,T1,S1}=mandatory_strip(T,S),
    scan_xml_decl2(T1,S1,Decl).


%% Part of an XML declaration that follows VersionInfo and whitespace:
%% the end of the declaration, an EncodingDecl [80], or (delegated to
%% scan_xml_decl3/3) a standalone declaration.
%% Any other input raises function_clause.
%% In the clauses that take the state as S0, the variable S is presumably
%% bound by ?bump_col (defined in xmerl.hrl, not visible here) -- confirm.
scan_xml_decl2("?>" ++ T, S0,Decl) ->
    ?bump_col(2),
    return_xml_decl(T,S,Decl);
scan_xml_decl2("encoding" ++ T, S0 = #xmerl_scanner{event_fun = Event},
	      Decl0 = #xmlDecl{attributes = Attrs}) ->
    %% [80] EncodingDecl
    ?bump_col(8),
    {T1, S1} = scan_eq(T, S),
    {EncName, T2, S2} = scan_enc_name(T1, S1),
    %% The encoding name is stored in lower case. EncName consists of ASCII
    %% characters only, so stdlib's string:to_lower/1 is sufficient and no
    %% dependency on the inets application (httpd_util) is needed.
    LowEncName=string:to_lower(EncName),
    Attr = #xmlAttribute{name = encoding,
			 parents = [{xml, _XMLPos = 1}],
			 value = LowEncName},
    Decl = Decl0#xmlDecl{encoding = LowEncName,
			 attributes = [Attr|Attrs]},
    %% The 'ended' event for the attribute carries the position at which
    %% the "encoding" keyword started.
    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, 
					       line = S0#xmerl_scanner.line, 
					       col = S0#xmerl_scanner.col,
					       data = Attr}, S2),
    %% Whitespace is mandatory unless the declaration ends right here.
    case T2 of
	"?>" ++ _T3 ->
	    scan_xml_decl3(T2,S3,Decl);
	_ ->
	    {_,T3,S4} = mandatory_strip(T2,S3),
	    scan_xml_decl3(T3, S4, Decl)
    end;
scan_xml_decl2(T="standalone" ++ _T,S,Decl) ->
    scan_xml_decl3(T,S,Decl).

%% Last part of an XML declaration: either its end, or a standalone
%% declaration followed by optional whitespace and "?>".
%% Any other input raises function_clause; a missing "?>" after the
%% standalone value raises badmatch.
%% In both clauses the variable S is presumably bound by ?bump_col
%% (defined in xmerl.hrl, not visible here) -- confirm.
scan_xml_decl3("?>" ++ T, S0,Decl) ->
    ?bump_col(2),
    return_xml_decl(T,S,Decl);
scan_xml_decl3("standalone" ++ T,S0 = #xmerl_scanner{event_fun = Event},
	      Decl0 = #xmlDecl{attributes = Attrs}) ->
    %% [32] SDDecl 
    ?bump_col(10),
    {T1, S1} = scan_eq(T, S),
    {StValue,T2,S2}=scan_standalone_value(T1,S1),
    Attr = #xmlAttribute{name = standalone,
			 parents = [{xml, _XMLPos = 1}],
			 value = StValue},
    Decl = Decl0#xmlDecl{standalone = StValue,
			 attributes = [Attr|Attrs]},
    %% The 'ended' event for the attribute carries the position at which
    %% the "standalone" keyword started.
    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, 
					       line = S0#xmerl_scanner.line, 
					       col = S0#xmerl_scanner.col,
					       data = Attr}, S2),
    {_,T3,S4} = strip(T2,S3),
    "?>" ++ T4 = T3,
    %% Account for the two characters of "?>".
    return_xml_decl(T4, S4#xmerl_scanner{col=S4#xmerl_scanner.col+2}, Decl).


%% Finish an XML declaration: put the collected attributes back in document
%% order, report an 'ended' event for the declaration and pass it through
%% the hook function. Returns {Ret, T1, S3}, where Ret is what the hook
%% returned for the declaration.
%% T1 and S1 are not bound explicitly in this function; presumably the
%% ?strip1 macro (xmerl.hrl, not visible here) binds them from T and S --
%% confirm.
return_xml_decl(T,S=#xmerl_scanner{hook_fun = Hook,
				   event_fun = Event},
		Decl0 = #xmlDecl{attributes = Attrs}) ->
    ?strip1,
    Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
    S2 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
					       line = S#xmerl_scanner.line,
					       col = S#xmerl_scanner.col,
					       data = Decl}, S1),
    {Ret, S3} = Hook(Decl, S2),
    {Ret, T1, S3}.
    

%% Scan the quoted value of a standalone declaration. Returns
%% {yes | no, Rest, S}. For 'yes' the standalone field of the state is set
%% to yes; for 'no' that field is left as it is.
%% Any other input raises function_clause.
%% The variable S is presumably bound by ?bump_col (defined in xmerl.hrl,
%% not visible here) -- confirm.
scan_standalone_value("'yes'" ++T,S0)->
    ?bump_col(5),
    {'yes',T,S#xmerl_scanner{standalone=yes}};
scan_standalone_value("\"yes\"" ++T,S0)->
    ?bump_col(5),
    {'yes',T,S#xmerl_scanner{standalone=yes}};
scan_standalone_value("'no'" ++T,S0) ->
    ?bump_col(4),
    {'no',T,S};
scan_standalone_value("\"no\"" ++T,S0) ->
    ?bump_col(4),
    {'no',T,S}.

%%%
%%% Text declaration XML 1.0 section 4.3.1
%%% [77] TextDecl  ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
%%%
%%% Scans the optional version and the mandatory encoding declaration,
%%% reports an 'ended' event for the encoding attribute and leaves the
%%% closing "?>" to scan_text_decl/3.
%%% Crashes with badmatch if "encoding" is missing.
%%% T5 and S5 are not bound explicitly in this function; presumably the
%%% ?strip5 macro (xmerl.hrl, not visible here) binds them from T4 and S4
%%% -- confirm.
scan_text_decl(T,S=#xmerl_scanner{event_fun = Event}) ->
    {#xmlDecl{attributes=Attrs}=Decl0,T1,S1} = scan_optional_version(T,S),
    "encoding" ++ T2 = T1,
    %% Account for the eight characters of "encoding".
    S2 = S1#xmerl_scanner{col = S1#xmerl_scanner.col + 8},
    {T3, S3} = scan_eq(T2, S2),
    {EncName, T4, S4} = scan_enc_name(T3, S3),
    %% The encoding name is stored in lower case. EncName consists of ASCII
    %% characters only, so stdlib's string:to_lower/1 is sufficient and no
    %% dependency on the inets application (httpd_util) is needed.
    LowEncName=string:to_lower(EncName),
    ?strip5,
    Attr = #xmlAttribute{name = encoding,
			 parents = [{xml,1}],
			 value = LowEncName},
    Decl = Decl0#xmlDecl{encoding = LowEncName,
 			 attributes = [Attr|Attrs]},
    S6=#xmerl_scanner{} = Event(#xmerl_event{event = ended,
					     line = S5#xmerl_scanner.line,
					     col = S5#xmerl_scanner.col,
					     data = Attr}, S5),
    scan_text_decl(T5,S6,Decl).

%% Finish a text declaration at its closing "?>": put the collected
%% attributes back in document order, report an 'ended' event for the
%% declaration and pass it through the hook function.
%% Returns {Ret, T1, S3}, where Ret is what the hook returned.
%% Input that does not start with "?>" raises function_clause.
%% S, T1 and S1 are not bound explicitly in this function; presumably
%% ?bump_col binds S and ?strip1 binds T1 and S1 (both macros come from
%% xmerl.hrl, not visible here) -- confirm.
scan_text_decl("?>"++T,S0 = #xmerl_scanner{hook_fun = Hook,
					   event_fun = Event}, 
	       Decl0 = #xmlDecl{attributes = Attrs}) ->
    ?bump_col(2),
    ?strip1,
    Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
    S2 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
					       line = S0#xmerl_scanner.line,
					       col = S0#xmerl_scanner.col,
					       data = Decl}, S1),
    {Ret, S3} = Hook(Decl, S2),
    {Ret, T1, S3}.

%% Scan the optional VersionInfo of a text declaration.
%% Returns {Decl, Rest, S}: Decl is an #xmlDecl{} holding the version
%% attribute when "version" is present (whitespace is then mandatory after
%% the version value), or an #xmlDecl{} without attributes otherwise, in
%% which case the input and state are returned untouched.
%% S, T1 and S1 are not bound explicitly in the first clause; presumably
%% ?bump_col binds S and ?strip1 binds T1 and S1 (both macros come from
%% xmerl.hrl, not visible here) -- confirm.
scan_optional_version("version"++T,S0) ->
    ?bump_col(7),
    ?strip1,
    {T2, S2} = scan_eq(T1, S1),
    {Vsn, T3, S3} = scan_xml_vsn(T2, S2),
    {_,T4,S4} = mandatory_strip(T3,S3),
    Attr = #xmlAttribute{name = version,parents = [{xml,1}],value = Vsn},
    {#xmlDecl{attributes=[Attr]},T4,S4};
scan_optional_version(T,S) ->
    {#xmlDecl{attributes=[]},T,S}.
    


%%%%%%% [81] EncName
%% Scans a quoted encoding name. Returns {EncName, Rest, S} (see
%% scan_enc_name2/4).
%% - On end of input the continuation function is consulted; if no more
%%   bytes are available the scan fails with expected_encoding_name.
%% - The name must start with a single or double quote, which becomes the
%%   delimiter looked for at the end of the name. Any other character is a
%%   fatal error. (The guard used to read `H >= $"; H =< $'', which is true
%%   for every character, so any character was accepted as a delimiter.)
%% The variable S in the second clause is presumably bound by ?bump_col
%% (defined in xmerl.hrl, not visible here) -- confirm.
scan_enc_name([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1) end,
      fun(S1) -> ?fatal(expected_encoding_name, S1) end,
      S);
scan_enc_name([H|T], S0) when H == $"; H == $' -> 
    ?bump_col(1),
    scan_enc_name(T, S, H, []);
scan_enc_name([H|_T], S) ->
    ?fatal({error,{unexpected_character_in_Enc_Name,H}},S).


%% First character of an encoding name: it must be an ASCII letter
%% ([A-Za-z]); anything else is a fatal error. The remaining characters
%% are handled by scan_enc_name2/4.
scan_enc_name([], S=#xmerl_scanner{continuation_fun = Cont}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1, Delim, Acc) end,
    GiveUp = fun(S1) -> ?fatal(expected_encoding_name, S1) end,
    Cont(Resume, GiveUp, S);
scan_enc_name([H|T], S0, Delim, Acc)
  when H >= $a, H =< $z;
       H >= $A, H =< $Z ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name([H|_T],S,_Delim,_Acc) ->
    ?fatal({error,{unexpected_character_in_Enc_Name,H}},S).

%% Remaining characters of an encoding name ([A-Za-z0-9._-]*), up to the
%% closing delimiter. Returns {EncName, Rest, S} once the delimiter is
%% found. Any other character raises function_clause.
scan_enc_name2([], S=#xmerl_scanner{continuation_fun = Cont}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_enc_name2(MoreBytes, S1, Delim, Acc) end,
    GiveUp = fun(S1) -> ?fatal(expected_encoding_name, S1) end,
    Cont(Resume, GiveUp, S);
%% Closing delimiter: the accumulated characters are in reverse order.
scan_enc_name2([H|T], S0, H, Acc) ->
    ?bump_col(1),
    {lists:reverse(Acc), T, S};
scan_enc_name2([H|T], S0, Delim, Acc)
  when H >= $a, H =< $z;
       H >= $A, H =< $Z;
       H >= $0, H =< $9;
       H == $.; H == $_; H == $- ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]).


%%%%%%% [26] VersionNum
%%% VersionNum    ::=    ([a-zA-Z0-9_.:] | '-')+
%% Scans a quoted version number; the opening quote (single or double)
%% becomes the delimiter that xml_vsn/4 looks for. Input that starts with
%% any other character raises function_clause.
scan_xml_vsn([], S=#xmerl_scanner{continuation_fun = Cont}) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_xml_vsn(MoreBytes, S1) end,
    GiveUp = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, GiveUp, S);
scan_xml_vsn([Quote|Rest], S) when Quote==$"; Quote==$' ->
    AfterQuote = S#xmerl_scanner{col = S#xmerl_scanner.col+1},
    xml_vsn(Rest, AfterQuote, Quote, []).

%% xml_vsn/4: collects version number characters ([a-zA-Z0-9_.:-]) until
%% the closing delimiter. Returns {VersionNum, Rest, S}; the column counter
%% is advanced by one for every character consumed, delimiter included.
%% Any other character raises a fatal {invalid_vsn_char, Char} error.
xml_vsn([], S=#xmerl_scanner{continuation_fun = Cont}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> xml_vsn(MoreBytes, S1, Delim, Acc) end,
    Abort = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, Abort, S);
xml_vsn([Delim|Rest], S=#xmerl_scanner{col = Col}, Delim, Acc) ->
    {lists:reverse(Acc), Rest, S#xmerl_scanner{col = Col+1}};
xml_vsn([Ch|Rest], S=#xmerl_scanner{col = Col}, Delim, Acc)
  when Ch >= $a, Ch =< $z; Ch >= $A, Ch =< $Z; Ch >= $0, Ch =< $9;
       Ch =:= $_; Ch =:= $.; Ch =:= $:; Ch =:= $- ->
    xml_vsn(Rest, S#xmerl_scanner{col = Col+1}, Delim, [Ch|Acc]);
xml_vsn([Ch|_Rest], S=#xmerl_scanner{}, _Delim, _Acc) ->
    ?fatal({invalid_vsn_char, Ch}, S).

%%%%%%% [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'

%% scan_pi/3: scans the target of a processing instruction.
%% Targets whose first three characters are [xX][mM][lL] are reserved and
%% are delegated to scan_wellknown_pi/3 with those characters removed.
%% Every other target is read with scan_name/2 and the PI body is scanned
%% by scan_pi/7, which also receives the line and column where the target
%% started.
scan_pi([], S=#xmerl_scanner{continuation_fun = Cont}, Pos) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Pos) end,
    Abort = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, Abort, S);
scan_pi([C1,C2,C3 | Rest], S=#xmerl_scanner{}, Pos)
  when ((C1==$x) or (C1==$X)) and
       ((C2==$m) or (C2==$M)) and
       ((C3==$l) or (C3==$L)) ->
    %% names beginning with [xX][mM][lL] are reserved for future use.
    scan_wellknown_pi(Rest, S, Pos);
scan_pi(Str, S=#xmerl_scanner{line = L, col = C}, Pos) ->
    {Target, _NamespaceInfo, AfterName, SName} = scan_name(Str, S),
    scan_pi(AfterName, SName, Target, L, C, Pos, []).


%%% More info on xml-stylesheet can be found at:
%%%   "Associating Style Sheets with XML documents", Version 1.0,
%%%   W3C Recommendation 29 June 1999 (http://www.w3.org/TR/xml-stylesheet/)
%% scan_wellknown_pi/3: handles a PI target with the reserved
%% [xX][mM][lL] prefix; scan_pi/3 calls it with those three characters
%% already removed from the input.
%% "-stylesheet" (i.e. the target "xml-stylesheet") is the only accepted
%% continuation; its body is scanned by scan_pi/7. Anything else raises a
%% fatal invalid_target_name error carrying at most the first 10 characters
%% of the remaining input.
scan_wellknown_pi("-stylesheet"++T, S0=#xmerl_scanner{line=L,col=C},Pos) ->
    ?dbg("prolog(\"<?xml-stylesheet\")~n", []),
    %% NOTE(review): 16 equals the length of "<?xml-stylesheet", while only
    %% "-stylesheet" (11 characters) is consumed in this clause - confirm
    %% how much of the prefix callers have already counted.
    ?bump_col(16),
    scan_pi(T, S, "xml-stylesheet",L,C,Pos,[]);
scan_wellknown_pi(Str,S,_Pos) ->
    ?fatal({invalid_target_name, lists:sublist(Str, 1, 10)}, S).


% scan_pi(Str="?>"++_T,S,Target, L, C, Pos) ->
%     scan_pi(Str,S,Target, L, C, Pos,[]);
% scan_pi(Str=[],S,Target, L, C, Pos) ->
%     scan_pi(Str,S,Target, L, C, Pos,[]);
% scan_pi(T,S,Target, L, C, Pos) ->
%     {_,T1,S1} = mandatory_strip(T,S),
%     scan_pi(T1,S1,Target, L, C, Pos,[]).


%% scan_pi/7: what follows a PI target.
%% Either the PI ends immediately with "?>", or whitespace separates the
%% target from its body, which is then collected by scan_pi2/7.
%% When the PI ends, an 'ended' event carrying the #xmlPI{} is sent to the
%% event fun and the PI is passed through the hook fun; the result is
%% {HookResult, Rest, S}.
scan_pi([], S=#xmerl_scanner{continuation_fun = Cont}, Target, L, C, Pos, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) ->
                     scan_pi(MoreBytes, S1, Target, L, C, Pos, Acc)
             end,
    Abort = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, Abort, S);
scan_pi([$?, $> | Rest], S0 = #xmerl_scanner{hook_fun = Hook,
                                             event_fun = Event},
        Target, L, C, Pos, Acc) ->
    ?bump_col(2),
    PI = #xmlPI{name = Target, pos = Pos, value = lists:reverse(Acc)},
    Ended = #xmerl_event{event = ended, line = L, col = C, data = PI},
    S1 = #xmerl_scanner{} = Event(Ended, S),
    {Ret, S2} = Hook(PI, S1),
    {Ret, Rest, S2};
scan_pi([H|T], S, Target, L, C, Pos, Acc) when ?whitespace(H) ->
    ?strip1,
    scan_pi2(T1, S1, Target, L, C, Pos, Acc).


%% scan_pi2/7: body of a processing instruction.
%% Characters are checked with wfc_legal_char/2 and accumulated until "?>"
%% is found. Then an 'ended' event carrying the #xmlPI{} is sent to the
%% event fun, the PI is passed through the hook fun, and
%% {HookResult, Rest, S} is returned.
scan_pi2([], S=#xmerl_scanner{continuation_fun = Cont}, Target, L, C, Pos, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) ->
                     scan_pi2(MoreBytes, S1, Target, L, C, Pos, Acc)
             end,
    Abort = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, Abort, S);
scan_pi2([$?, $> | Rest], S0 = #xmerl_scanner{hook_fun = Hook,
                                              event_fun = Event},
         Target, L, C, Pos, Acc) ->
    ?bump_col(2),
    PI = #xmlPI{name = Target, pos = Pos, value = lists:reverse(Acc)},
    Ended = #xmerl_event{event = ended, line = L, col = C, data = PI},
    S1 = #xmerl_scanner{} = Event(Ended, S),
    {Ret, S2} = Hook(PI, S1),
    {Ret, Rest, S2};
scan_pi2([Ch|Rest], S0, Target, L, C, Pos, Acc) ->
    ?bump_col(1),
    wfc_legal_char(Ch,S),
    scan_pi2(Rest, S, Target, L, C, Pos, [Ch|Acc]).



%% [28] doctypedecl ::= 
%%   '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
%% scan_doctype/2: scans the name part of a document type declaration.
%% Requires leading whitespace (mandatory_strip/2), reads the document type
%% name with scan_name/2, stores it in the scanner's doctype_name field and
%% continues with scan_doctype1/2.
scan_doctype([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    %% No input left: hand the continuation fun a fun that resumes here with
    %% more bytes and a fun that raises a fatal unexpected_end error.
    F(fun(MoreBytes, S1) -> scan_doctype(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_doctype(T, S) ->
    {_,T1,S1} = mandatory_strip(T,S),
    {DTName, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    %% NOTE(review): ?strip3 presumably strips whitespace from T2/S2 and
    %% binds T3/S3 - confirm against the macro definition.
    ?strip3,
    scan_doctype1(T3, S3#xmerl_scanner{doctype_name =  DTName}).


%% [75] ExternalID ::= 'SYSTEM' S SystemLiteral
%%		     | 'PUBLIC' S PubidLiteral S SystemLiteral
%% scan_doctype1/2: optional ExternalID of a document type declaration.
%% Passes one of the following on to scan_doctype2/3 as the DTD spec:
%%   {public, PubidLiteral, SystemLiteral}  after the keyword PUBLIC
%%   {system, SystemLiteral}                after the keyword SYSTEM
%%   undefined                              when neither keyword is present
scan_doctype1([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype1(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_doctype1("PUBLIC" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {PIDL, T2, S2} = scan_pubid_literal(T1, S1),
    {_,T3,S3} = mandatory_strip(T2,S2),
    {SL, T4, S4} = scan_system_literal(T3, S3),
    %% NOTE(review): ?strip5 presumably strips whitespace from T4/S4 and
    %% binds T5/S5 - confirm against the macro definition.
    ?strip5,
    scan_doctype2(T5, S5, {public, PIDL, SL});
scan_doctype1("SYSTEM" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {SL, T2, S2} = scan_system_literal(T1, S1),
    %% NOTE(review): ?strip3 presumably strips whitespace from T2/S2 and
    %% binds T3/S3 - confirm against the macro definition.
    ?strip3,
    scan_doctype2(T3, S3, {system, SL});
scan_doctype1(T, S) ->
    %% No external identifier: continue without a DTD spec.
    scan_doctype2(T, S, undefined).


%% scan_doctype2/3: what follows the (optional) external identifier.
%%   '['  opens the internal subset, scanned by scan_doctype3/3
%%   '>'  ends the declaration: the DTD given by DTD is fetched, the
%%        declarations are checked with check_decl/1 and {Rest, S} is
%%        returned.
scan_doctype2([], S=#xmerl_scanner{continuation_fun = Cont}, DTD) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_doctype2(MoreBytes, S1, DTD) end,
    Abort = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, Abort, S);
scan_doctype2([$[ | T], S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    scan_doctype3(T1, S1, DTD);
scan_doctype2([$> | T], S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    WithDTD = fetch_DTD(DTD, S1),
    check_decl(WithDTD),
    {T1, WithDTD}.

%% [28a] DeclSep   ::= PEReference | S
%% [28b] intSubset ::= (markupdecl | DeclSep)*
%% scan_doctype3/3: internal subset of a document type declaration
%% (productions [28a] and [28b]).
%%   '%'  parameter entity reference: an external reference is fetched with
%%        fetch_DTD/2, an internal one has its replacement text put in
%%        front of the remaining input
%%   ']'  ends the subset: the DTD given by DTD is fetched, the declarations
%%        are checked with check_decl/1, the closing '>' is consumed and
%%        {Rest, S} is returned
%%   anything else is scanned as a markup declaration.
scan_doctype3([], S=#xmerl_scanner{continuation_fun = F},DTD) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype3(MoreBytes, S1,DTD) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_doctype3("%" ++ T, S0, DTD) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ?strip2,
    case expand_pe_reference(PERefName, S2,as_PE) of
        {system, _} = Name ->
            S3 = fetch_DTD(Name, S2),
            scan_doctype3(T2, S3, DTD);
        {public, _} = Name ->
            S3 = fetch_DTD(Name, S2),
            scan_doctype3(T2, S3, DTD);
        {public, _, _} = Name ->
            S3 = fetch_DTD(Name, S2),
            scan_doctype3(T2, S3, DTD);
        %% is_list/1 replaces the obsolete guard test list/1.
        ExpRef when is_list(ExpRef) -> % Space added, see Section 4.4.8
            {_,T3,S3} = strip(ExpRef++T2,S2),
            scan_doctype3(T3,S3,DTD)
    end;
scan_doctype3("]" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    S2 = fetch_DTD(DTD, S1),
    check_decl(S2),
    %% The declaration must close right here; anything else is a badmatch.
    ">" ++ T2 = T1,
    {T2, S2};
scan_doctype3(T, S, DTD) ->
    {_, T1, S1} = scan_markup_decl(T, S),
    scan_doctype3(T1, S1, DTD).



%% fetch_DTD/2: fetches and parses the external DTD subset.
%%   DTDSpec - undefined, or a spec that is passed to fetch_and_parse/3
%%   S       - current scanner state
%% Returns the scanner produced by parsing the DTD. When fetch_and_parse/3
%% returns a 3-tuple instead of a scanner, the original S is returned.
%% With DTDSpec undefined, the scanner's doctype_DTD field is used as a
%% system identifier if it is a list; otherwise S is returned unchanged.
fetch_DTD(undefined, S=#xmerl_scanner{doctype_DTD=URI}) when is_list(URI)->
    %% allow to specify DTD name when it isn't available in xml stream
    fetch_DTD({system,URI},S);
fetch_DTD(undefined, S) ->
    S;
fetch_DTD(DTDSpec, S)->
    case fetch_and_parse(DTDSpec,S,[{text_decl,true},
                                    {environment,{external,subset}}]) of
        %% A record pattern replaces the obsolete guard test record/2.
        #xmerl_scanner{} = NewS ->
            NewS;
        {_Res,_Tail,_Sx} -> % Continue with old scanner data, result in Rules
            S
    end.

%% fetch_and_parse/3: fetches an external resource through the scanner's
%% fetch fun and, when data comes back, parses it with options derived from
%% the scanner state the fetch fun returned.
%%   ExtSpec  - specification passed unchanged to the fetch fun
%%   S        - current scanner; its rules and xmlbase fields are reused
%%   Options0 - options placed in front of the generated option list
%% Returns a #xmerl_scanner{} (with text_decl reset to false and the
%% environment of S restored), or whatever non-scanner term the chosen
%% branch produced. A fetch result of any other shape is a fatal error.
fetch_and_parse(ExtSpec,S=#xmerl_scanner{fetch_fun=Fetch,
					 rules=Rules,
					 xmlbase = XMLBase},
		Options0) ->
    RetS =
    case Fetch(ExtSpec, S) of
	{ok, NewS} -> 
	    %% For backward compatibility only. This will be removed later!!
	    NewS;
	{ok, not_fetched,NewS} ->
	    %% Nothing was fetched: continue with the returned scanner.
	    NewS;
	{ok, DataRet, NewS = #xmerl_scanner{user_state = UState,
					    event_fun = Event,
					    hook_fun = Hook,
					    fetch_fun = Fetch1,
					    close_fun = Close1,
					    continuation_fun = Cont,
					    acc_fun = Acc,
					    rules_read_fun = Read,
					    rules_write_fun = Write,
					    validation = Valid,
					    quiet = Quiet,
					    encoding = Charset
					   }} ->
	    %% Callback funs and their states are taken from NewS; the rules
	    %% value passed on is the one bound from S in the function head.
	    EvS = event_state(NewS),
	    HoS = hook_state(NewS),
	    FeS = fetch_state(NewS),
	    CoS = cont_state(NewS),
	    Options = Options0++[{user_state, UState},
				 {rules, Rules},
				 {event_fun, Event, EvS},
				 {hook_fun, Hook, HoS},
				 {fetch_fun, Fetch1, FeS},
				 {close_fun, Close1},
				 {continuation_fun, Cont, CoS},
				 {rules, Read, Write, ""},
				 {acc_fun, Acc},
				 {validation,Valid},
				 {quiet,Quiet},
				 {encoding,Charset}],

	    %% Dispatch on the kind of data the fetch fun delivered.
	    case DataRet of
		{file, F} ->
		    int_file_decl(F, Options,Charset);
		{string, String} ->
		    int_string_decl(String, Options,XMLBase);
		 _ ->
		    %% other scheme
		    {DataRet,[],NewS}
	    end;
	Error ->
	    ?fatal({error_fetching_DTD, {ExtSpec, Error}}, S)
    end,
    %% Only a scanner result is post-processed; anything else is returned
    %% to the caller as is.
    case RetS of
	#xmerl_scanner{} ->
	    RetS#xmerl_scanner{text_decl=false,
			       environment=S#xmerl_scanner.environment};
	_ -> RetS
    end.


%% fetch_not_parse/2: fetches an external resource without parsing it.
%% Returns {Data, NewS} where Data is
%%   - the file contents as a list, for {file, F}
%%   - binary_to_list(Str), for {string, Str}
%%   - the fetched term itself, for any other shape.
%% A 'not_fetched' answer or an unexpected fetch result is a fatal error.
fetch_not_parse(ExtSpec,S=#xmerl_scanner{fetch_fun=Fetch}) ->
    case Fetch(ExtSpec,S) of
        {ok, not_fetched, _NewS} ->
            ?fatal({error_fetching_external_source,ExtSpec},S);
        {ok, {file,File}, NewS} ->
            {get_file(File,S), NewS};
        {ok, {string,Str}, NewS} ->
            {binary_to_list(Str), NewS};
        {ok, Other, NewS} ->
            {Other, NewS};
        _ ->
            ?fatal({error_fetching_external_resource,ExtSpec},S)
    end.

%% get_file/2: reads the whole file F and returns its contents as a list
%% of bytes. Any result other than {ok, Binary} from file:read_file/1 is
%% reported as a fatal {error_reading_file, F, Result} error.
get_file(F,S) ->
    ReadResult = file:read_file(F),
    case ReadResult of
        {ok,Contents} ->
            binary_to_list(Contents);
        Failure ->
            ?fatal({error_reading_file,F,Failure},S)
    end.
%% check_decl/1
%% It is legal to reference some XML types before they are declared, so
%% this check verifies that every referenced type has been declared.
%% Nothing is checked when validation is switched off.
check_decl(#xmerl_scanner{validation=Validation, rules=Tab} = S) ->
    case Validation of
        false ->
            ok;
        _ ->
            check_notations(Tab,S),
            check_elements(Tab,S), %% check also attribute defs for element
            check_entities(Tab,S)
    end.
	    
%% check_notations/2: looks in the rules table Tab for notation entries
%% still marked 'undeclared'. Returns ok when there are none; otherwise a
%% fatal error_missing_declaration_in_DTD error is raised, carrying the
%% notation name when exactly one entry matched and the whole match result
%% otherwise.
check_notations(Tab,S) ->
    case ets:match(Tab,{{notation,'$1'},undeclared}) of
        [[]] -> ok;
        [] ->  ok;
        %% is_list/1 replaces the obsolete guard test list/1.
        [L] when is_list(L) ->
            ?fatal({error_missing_declaration_in_DTD,hd(L)},S);
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err},S)
    end.

%% check_elements/2: runs check_elements2/2 on every elem_def entry in the
%% rules table Tab. The table is read in batches of 10 matches; each match
%% is the list of bindings for '$2', i.e. a one-element list holding the
%% stored value. Returns ok, or raises a fatal error when ets:match/3
%% returns something unexpected.
check_elements(Tab,S) ->
    case ets:match(Tab,{{elem_def,'_'},'$2'},10) of
	{_,_}=M ->
	    %% Fun receives itself as second argument so that it can recurse
	    %% over the continuations returned by ets:match.
	    Fun = fun({Match,'$end_of_table'},_F) ->
			  lists:foreach(fun(X)->check_elements2(X,S) end,
					Match),
			  ok;
		     ('$end_of_table',_) ->
			  ok;
		     ({Match,Cont},F) ->
			  lists:foreach(fun(X)->check_elements2(X,S) end,
					Match),
			  %% Fetch the next batch through the continuation.
			  F(ets:match(Cont),F)
		  end,
	    Fun(M,Fun);
	'$end_of_table' -> ok;
	Err -> ?fatal({error_missing_declaration_in_DTD,Err},S)
    end.

% Checks the attribute definitions of one element entry. Declaring
% attributes for an element that is itself not declared is not an error,
% so anything that is not a single #xmlElement{} is accepted silently.
check_elements2(Entry,S) ->
    case Entry of
        [#xmlElement{attributes=Attrs}] ->
            check_attributes(Attrs,S);
        _ ->
            ok
    end.

%% check_attributes/2: validity checks on a list of attribute definitions
%% (5-tuples whose second element is the attribute type).
%%   'ID'               - fatal if a later definition in the list is also
%%                        of type 'ID'; then vc_ID_Attribute_Default/2
%%   {enumeration, _}   - vc_Enumeration/2
%%   'ENTITY'/'ENTITIES' - vc_Entity_Name/2
%% Everything else is skipped. Returns ok.
check_attributes([],_S) ->
    ok;
check_attributes([Attr|Rest],S) ->
    case Attr of
        {Name,'ID',_,_,_} ->
            case lists:keysearch('ID',2,Rest) of
                {value,OtherID} ->
                    ?fatal({error_more_than_one_ID_def,Name,element(1,OtherID)},S);
                _ ->
                    ok
            end,
            vc_ID_Attribute_Default(Attr,S);
        {_,{enumeration,_},_,_,_} ->
            vc_Enumeration(Attr,S);
        {_,Type,_,_,_} when Type=='ENTITY';Type=='ENTITIES' ->
            vc_Entity_Name(Attr,S);
        _ ->
            ok
    end,
    check_attributes(Rest,S).

%% check_entities/2: when validation is on, looks in the rules table Tab
%% for entity entries still marked 'undeclared'. Returns ok when there are
%% none; otherwise a fatal error_missing_declaration_in_DTD error is raised,
%% carrying the entity name when exactly one entry matched and the whole
%% match result otherwise. Without validation nothing is checked.
check_entities(Tab,S=#xmerl_scanner{validation=true}) ->
    case ets:match(Tab,{{entity,'$1'},undeclared}) of
        [[]] -> ok;
        [] ->  ok;
        %% is_list/1 replaces the obsolete guard test list/1.
        [L] when is_list(L) ->
            ?fatal({error_missing_declaration_in_DTD,hd(L)},S);
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err},S)
    end;
check_entities(_,_) ->
    ok.


%% check_decl2/1: verifies, via check_referenced_ids/2 on the scanner's
%% rules table, that every referenced ID attribute has been declared.
check_decl2(#xmerl_scanner{rules=Rules} = Scanner) ->
    check_referenced_ids(Rules,Scanner).


%% check_referenced_ids/2: looks in the rules table Tab for id entries
%% still marked 'undeclared'. Returns ok when there are none; otherwise a
%% fatal error_missing_declaration_in_DTD error is raised, carrying the id
%% when exactly one entry matched and the whole match result otherwise.
check_referenced_ids(Tab,S) ->
    case ets:match(Tab,{{id,'$1'},undeclared}) of
        [[]] -> ok;
        [] ->  ok;
        %% is_list/1 replaces the obsolete guard test list/1.
        [L] when is_list(L) ->
            ?fatal({error_missing_declaration_in_DTD,hd(L)},S);
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err},S)
    end.

%%%%%%% [30] extSubSet ::= TextDecl? extSubsetDecl

%% scan_ext_subset/2: scans the declarations of an external subset until
%% the input is exhausted.
%%   '%'    parameter entity reference between declarations
%%   '<!['  conditional section
%%   whitespace is skipped
%%   anything else is scanned as a markup declaration.
%% At end of input the continuation fun's fallback yields {[], S}.
scan_ext_subset([], S=#xmerl_scanner{continuation_fun = Cont}) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_ext_subset(MoreBytes, S1) end,
    Finish = fun(S1) -> {[], S1} end,
    Cont(Resume, Finish, S);
scan_ext_subset([$% | T], S0) ->
    %% DeclSep [28a]: WFC: PE Between Declarations.
    %% The replacement text of a parameter entity reference in a
    %% DeclSep must match the production extSubsetDecl.
    ?bump_col(1),
    {_, AfterSep, SSep} = scan_decl_sep(T,S),
    scan_ext_subset(AfterSep, SSep);
scan_ext_subset([$<, $!, $[ | T], S0) ->
    ?bump_col(3),
    ?strip1,
    {_, AfterSect, SSect} = scan_conditional_sect(T1, S1),
    scan_ext_subset(AfterSect, SSect);
scan_ext_subset([H|_] = T, S) when ?whitespace(H) ->
    {_, Stripped, SStripped} = strip(T,S),
    scan_ext_subset(Stripped, SStripped);
scan_ext_subset(T, S) ->
    {_, AfterDecl, SDecl} = scan_markup_decl(T, S),
    scan_ext_subset(AfterDecl, SDecl).


%%%%%%% [28a] DeclSep ::= PEReference | S

%% scan_decl_sep/2: handles a parameter entity reference used as a
%% declaration separator.
%% The reference is expanded; a tuple result is treated as an external
%% identifier whose content is fetched, scanned as an entity value and
%% stored back in the rules under the same name. The replacement text is
%% then scanned with scan_ext_subset/2, and the input following the
%% reference is returned stripped, in the {_, Rest, S} shape of strip/2.
scan_decl_sep(T,S=#xmerl_scanner{rules_read_fun=Read,
                                 rules_write_fun=Write,
                                 rules_delete_fun=Delete}) ->
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    {ExpandedRef,S2} =
        %% is_tuple/1 replaces the obsolete guard test tuple/1.
        case expand_pe_reference(PERefName,S1,as_PE) of
            Tuple when is_tuple(Tuple) ->
                %% {system,URI} or {public,URI}
                {ExtText,_Sx} = fetch_not_parse(Tuple,S1),
                {EntV,_,SEnt} = scan_entity_value(ExtText, S1, no_delim,
                                                  PERefName,parameter),
                %% should do an update Write(parameter_entity) so next
                %% expand_pe_reference is faster
                Delete(parameter_entity,PERefName,SEnt),
                SWritten = Write(parameter_entity,PERefName,EntV,SEnt),
                EntV2 = Read(parameter_entity,PERefName,SWritten),
                {" " ++ EntV2 ++ " ",SWritten};
            ExpRef ->
                {ExpRef,S1}
        end,
    {_, T3, S3} = strip(ExpandedRef,S2),
    {_T4,S4} = scan_ext_subset(T3,S3),
    strip(T1,S4).

%%%%%%% [61] ConditionalSect ::= includeSect | ignoreSect

%% scan_conditional_sect/2: scans the keyword of a conditional section.
%%   IGNORE   - the section content is skipped by scan_ignore/2
%%   INCLUDE  - the section content is scanned by scan_include/2
%%   '%'      - a parameter entity reference is expanded in front of the
%%              remaining input and the keyword is scanned again.
%% After the keyword and optional whitespace a '[' must follow; anything
%% else is a badmatch.
scan_conditional_sect([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_conditional_sect(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_conditional_sect("IGNORE" ++ T, S0) ->
    ?bump_col(6),
    ?strip1,
    "[" ++ T2 = T1,
    {_,T3,S3} = strip(T2,S1),
    scan_ignore(T3,S3);
scan_conditional_sect("INCLUDE" ++ T, S0) ->
    ?bump_col(7),
    ?strip1,
    "[" ++ T2 = T1,
    {_,T3,S3} = strip(T2,S1),
    scan_include(T3, S3);
scan_conditional_sect("%"++T,S0) ->
    %% ?bump_col(1) used to be written twice here; one character is
    %% consumed, so it is invoked once.
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_conditional_sect(T2,S2).


%%%% [63] ignoreSect	 ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>'
%%%% [64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)*
%%%% [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)
%% scan_ignore/2: skips the content of an IGNORE section, starting at
%% nesting level 0.
scan_ignore(Input,Scanner) ->
    scan_ignore(Input,Scanner,0).

%% scan_ignore/3: skips characters of an ignored section while tracking
%% nested conditional sections in Level. Every "<![" raises the level and
%% every "]]>" lowers it; the "]]>" seen at level 0 ends the section and
%% {[], Rest, S} is returned.
scan_ignore([], S=#xmerl_scanner{continuation_fun = Cont},Level) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_ignore(MoreBytes, S1,Level) end,
    Abort = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, Abort, S);
scan_ignore([$<, $!, $[ | Rest], S0,Level) ->
    %% nested conditional section. Topmost condition is ignore, though
    ?bump_col(3),
    scan_ignore(Rest, S,Level+1);
scan_ignore([$], $], $> | Rest], S0,Level) ->
    ?bump_col(3),
    case Level of
        0 -> {[], Rest, S};
        _ -> scan_ignore(Rest, S,Level-1)
    end;
scan_ignore([_Skipped|Rest],S0,Level) ->
    ?bump_col(1),
    scan_ignore(Rest,S,Level).


%%%%%%% [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'
%% scan_include/2: scans the content of an INCLUDE section up to "]]>".
%%   '%'    a parameter entity reference is expanded in front of the
%%          remaining input
%%   '<!['  a nested conditional section
%%   anything else is scanned as a markup declaration.
%% Returns {[], Rest, S} once the closing "]]>" has been consumed.
scan_include([], S=#xmerl_scanner{continuation_fun = Cont}) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_include(MoreBytes, S1) end,
    Abort = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, Abort, S);
scan_include([$], $], $> | Rest], S0) ->
    ?bump_col(3),
    {[], Rest, S};
scan_include([$% | T], S0) ->
    ?bump_col(1),
    {PERefName, AfterRef, SRef} = scan_pe_reference(T, S),
    Replacement = expand_pe_reference(PERefName, SRef,as_PE),
    {_, Stripped, SStripped} = strip(Replacement ++ AfterRef,SRef),
    scan_include(Stripped, SStripped);
scan_include([$<, $!, $[ | T], S0) ->
    ?bump_col(3),
    ?strip1,
    {_, T2, S2} = scan_conditional_sect(T1, S1),
    ?strip3,
    scan_include(T3,S3);
scan_include(T, S) ->
    {_, AfterDecl, SDecl} = scan_markup_decl(T, S),
    scan_include(AfterDecl, SDecl).


%%%%%%% [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | 
%%%%%%%                     NotationDecl | PI |Comment
%%%%%%% [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'

%% Validity constraint: Unique Type Declaration: No element type may be
%% declared more than once.
%%
scan_markup_decl([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_markup_decl(MoreBytes, S1) end,
      fun(S1) -> {[], [], S1} end,
      S);
scan_markup_decl("<!--" ++ T, S0) ->
    ?bump_col(4),
    {_, T1, S1} = scan_comment(T, S),
    ?strip2;
scan_markup_decl("<?" ++ T, S0) ->
    ?bump_col(2),
    {_PI, T1, S1} = scan_pi(T, S,_Pos=markup),
    ?strip2;
scan_markup_decl("<!ELEMENT" ++ T, 
		 #xmerl_scanner{rules_read_fun = Read,
				rules_write_fun = Write,
				rules_delete_fun = Delete} = S0) ->
    ?bump_col(9),
    {_,T1,S1} = mandatory_strip(T,S),
    {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    Element  = 
	case Read(elem_def, Ename, S2) of
	    El = #xmlElement{elementdef=Decl} when Decl /= undeclared ->
		case S2#xmerl_scanner.validation of
		    true ->
			?fatal({already_defined, Ename}, S2);
		    _ ->
			Delete(elem_def,Ename,S2),
			El
		end;
	    El = #xmlElement{} ->
		Delete(elem_def,Ename,S2),
		El;
	    undefined ->
		#xmlElement{}…