/lib/xmerl/src/xmerl_scan.erl
Erlang | 3778 lines | 2842 code | 378 blank | 558 comment | 32 complexity | 573894cf37ee0fe26e2519ee216f684f MD5 | raw file
Possible License(s): LGPL-2.1, BSD-3-Clause, AGPL-1.0
Large files are truncated, but you can click here to view the full file
- %%% The contents of this file are subject to the Erlang Public License,
- %%% Version 1.0, (the "License"); you may not use this file except in
- %%% compliance with the License. You may obtain a copy of the License at
- %%% http://www.erlang.org/license/EPL1_0.txt
- %%%
- %%% Software distributed under the License is distributed on an "AS IS"
- %%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
- %%% the License for the specific language governing rights and limitations
- %%% under the License.
- %%%
- %%% The Original Code is xmerl-0.15
- %%%
- %%% The Initial Developer of the Original Code is Ericsson Telecom
- %%% AB. Portions created by Ericsson are Copyright (C), 1998, Ericsson
- %%% Telecom AB. All Rights Reserved.
- %%%
- %%% Contributor(s):
- %%% Mickael Remond <mickael.remond@IDEALX.com>:
- %%% Johan Blom <johan.blom@mobilearts.se>
- %%% Richard Carlsson
- %%% Fredrik Linder
- %%%
- %%%----------------------------------------------------------------------
- %%% #0. BASIC INFORMATION
- %%%----------------------------------------------------------------------
- %%% File: xmerl_scan.erl
- %%% Author : Ulf Wiger <ulf.wiger@ericsson.com>
- %%% Description : Single-pass XML scanner. See xmerl.hrl for data defs.
- %%%
- %%% Modules used : ets, file, filename, io, lists, ucs, uri
- %%%
- %%%----------------------------------------------------------------------
- %% @doc
- %% The XML parser is activated through
- %% <tt>xmerl_scan:string/[1,2]</tt> or
- %% <tt>xmerl_scan:file/[1,2]</tt>.
- %% It returns records of the type defined in xmerl.hrl.
- %% See also <a href="xmerl_examples.html">tutorial</a> on customization
- %% functions.
- %% @type global_state(). <p>
- %% The global state of the scanner, represented by the #xmerl_scanner{} record.
- %% </p>
- %% @type option_list(). <p>Options allow customizing the behaviour of the
- %% scanner.
- %% See also <a href="xmerl_examples.html">tutorial</a> on customization
- %% functions.
- %% </p>
- %% Possible options are:
- %% <dl>
- %% <dt><code>{acc_fun, Fun}</code></dt>
- %% <dd>Call back function to accumulate contents of entity.</dd>
- %% <dt><code>{continuation_fun, Fun} |
- %% {continuation_fun, Fun, ContinuationState}</code></dt>
- %% <dd>Call back function to decide what to do if the scanner runs into eof
- %% before the document is complete.</dd>
- %% <dt><code>{event_fun, Fun} |
- %% {event_fun, Fun, EventState}</code></dt>
- %% <dd>Call back function to handle scanner events.</dd>
- %% <dt><code>{fetch_fun, Fun} |
- %% {fetch_fun, Fun, FetchState}</code></dt>
- %% <dd>Call back function to fetch an external resource.</dd>
- %% <dt><code>{hook_fun, Fun} |
- %% {hook_fun, Fun, HookState}</code></dt>
- %% <dd>Call back function to process the document entities once
- %% identified.</dd>
- %% <dt><code>{close_fun, Fun}</code></dt>
- %% <dd>Called when document has been completely parsed.</dd>
- %% <dt><code>{rules, ReadFun, WriteFun, RulesState} |
- %% {rules, Rules}</code></dt>
- %% <dd>Handles storing of scanner information when parsing.</dd>
- %% <dt><code>{user_state, UserState}</code></dt>
- %% <dd>Global state variable accessible from all customization functions</dd>
- %%
- %% <dt><code>{fetch_path, PathList}</code></dt>
- %% <dd>PathList is a list of
- %% directories to search when fetching files. If the file in question
- %% is not in the fetch_path, the URI will be used as a file
- %% name.</dd>
- %% <dt><code>{space, Flag}</code></dt>
- %% <dd>'preserve' (default) to preserve spaces, 'normalize' to
- %% accumulate consecutive whitespace and replace it with one space.</dd>
- %% <dt><code>{line, Line}</code></dt>
- %% <dd>To specify starting line for scanning in document which contains
- %% fragments of XML.</dd>
- %% <dt><code>{namespace_conformant, Flag}</code></dt>
- %% <dd>Controls whether to behave as a namespace conformant XML parser,
- %% 'false' (default) to not otherwise 'true'.</dd>
- %% <dt><code>{validation, Flag}</code></dt>
- %% <dd>Controls whether to process as a validating XML parser,
- %% 'false' (default) to not otherwise 'true'.</dd>
- %% <dt><code>{quiet, Flag}</code></dt>
- %% <dd>Set to 'true' if xmerl should behave quietly and not output any info
- %% to standard output (default 'false').</dd>
- %% <dt><code>{doctype_DTD, DTD}</code></dt>
- %% <dd>Allows to specify DTD name when it isn't available in the XML
- %% document.</dd>
- %% <dt><code>{xmlbase, Dir}</code></dt>
- %% <dd>XML Base directory. If using string/1 default is current directory.
- %% If using file/1 default is directory of given file.</dd>
- %% <dt><code>{encoding, Enc}</code></dt>
- %% <dd>Set default character set used (default UTF-8).
- %% This character set is used only if not explicitly given by the XML
- %% declaration. </dd>
- %% </dl>
- %% @end
- %% Only used internally are:
- %% <dt><code>{environment,Env}</code></dt>
- %% <dd>What is this?</dd>
- %% <dt><code>{text_decl,Bool}</code></dt>
- %% <dd>What is this?</dd>
- -module(xmerl_scan).
- -vsn('0.19').
- -date('03-09-16').
- %% main API
- -export([string/1, string/2,
- file/1, file/2]).
- %% access functions for various states
- -export([user_state/1, user_state/2,
- event_state/1, event_state/2,
- hook_state/1, hook_state/2,
- rules_state/1, rules_state/2,
- fetch_state/1, fetch_state/2,
- cont_state/1, cont_state/2]).
- %% helper functions. To xmerl_lib ??
- -export([accumulate_whitespace/4]).
- %-define(debug, 1).
- -include("xmerl.hrl"). % record def, macros
- -include_lib("kernel/include/file.hrl").
%% ?fatal(Reason, S): unless the scanner state S has its `quiet' field set,
%% print the source line of the macro call together with Reason; then call
%% fatal/2. The macro expands to two comma-separated expressions, so it can
%% only be used where an expression sequence is allowed.
-define(fatal(Reason, S),
        if
            S#xmerl_scanner.quiet ->
                ok;
            true ->
                ok = io:format("~p- fatal: ~p~n", [?LINE, Reason])
        end,
        fatal(Reason, S)).

%% ?ustate(U, S): the scanner state S with its user_state field set to U.
-define(ustate(U, S), S#xmerl_scanner{user_state = U}).
- %% Functions to access the various states
%%% @spec user_state(S::global_state()) -> UserState
%%% @doc Returns the user state stored in the scanner state <code>S</code>.
user_state(#xmerl_scanner{} = Scanner) ->
    Scanner#xmerl_scanner.user_state.
%%% @spec event_state(S::global_state()) -> EventState
%%% @doc Returns the event function state stored in the scanner state
%%% <code>S</code>.
event_state(#xmerl_scanner{fun_states = #xmerl_fun_states{} = FunStates}) ->
    FunStates#xmerl_fun_states.event.
%%% @spec hook_state(S::global_state()) -> HookState
%%% @doc Returns the hook function state stored in the scanner state
%%% <code>S</code>.
hook_state(#xmerl_scanner{fun_states = #xmerl_fun_states{} = FunStates}) ->
    FunStates#xmerl_fun_states.hook.
%%% @spec rules_state(S::global_state()) -> RulesState
%%% @doc Returns the rules function state stored in the scanner state
%%% <code>S</code>.
rules_state(#xmerl_scanner{fun_states = #xmerl_fun_states{} = FunStates}) ->
    FunStates#xmerl_fun_states.rules.
%%% @spec fetch_state(S::global_state()) -> FetchState
%%% @doc Returns the fetch function state stored in the scanner state
%%% <code>S</code>.
fetch_state(#xmerl_scanner{fun_states = #xmerl_fun_states{} = FunStates}) ->
    FunStates#xmerl_fun_states.fetch.
%%% @spec cont_state(S::global_state()) -> ContinuationState
%%% @doc Returns the continuation function state stored in the scanner
%%% state <code>S</code>.
cont_state(#xmerl_scanner{fun_states = #xmerl_fun_states{} = FunStates}) ->
    FunStates#xmerl_fun_states.cont.
- %%%% Functions to modify the various states
%%% @spec user_state(UserState, S::global_state()) -> global_state()
%%% @doc For controlling the UserState, to be used in a user function.
%%% Returns the scanner state with its user state replaced by
%%% <code>UserState</code>.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
user_state(UserState, Scanner) ->
    Scanner#xmerl_scanner{user_state = UserState}.
%%% @spec event_state(EventState, S::global_state()) -> global_state()
%%% @doc For controlling the EventState, to be used in an event
%%% function, and called at the beginning and at the end of a parsed entity.
%%% Returns the scanner state with the event state replaced.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
event_state(EventState, Scanner = #xmerl_scanner{fun_states = FunStates}) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{event = EventState}}.
%%% @spec hook_state(HookState, S::global_state()) -> global_state()
%%% @doc For controlling the HookState, to be used in a hook
%%% function, and called when the parser has parsed a complete entity.
%%% Returns the scanner state with the hook state replaced.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
hook_state(HookState, Scanner = #xmerl_scanner{fun_states = FunStates}) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{hook = HookState}}.
%%% @spec rules_state(RulesState, S::global_state()) -> global_state()
%%% @doc For controlling the RulesState, to be used in a rules
%%% function, and called when the parser stores scanner information in a
%%% rules database.
%%% Returns the scanner state with the rules state replaced.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
rules_state(RulesState, Scanner = #xmerl_scanner{fun_states = FunStates}) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{rules = RulesState}}.
%%% @spec fetch_state(FetchState, S::global_state()) -> global_state()
%%% @doc For controlling the FetchState, to be used in a fetch
%%% function, and called when the parser fetches an external resource
%%% (e.g. a DTD).
%%% Returns the scanner state with the fetch state replaced.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
fetch_state(FetchState, Scanner = #xmerl_scanner{fun_states = FunStates}) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{fetch = FetchState}}.
%%% @spec cont_state(ContinuationState, S::global_state()) -> global_state()
%%% @doc For controlling the ContinuationState, to be used in a continuation
%%% function, and called when the parser encounters the end of the byte stream.
%%% Returns the scanner state with the continuation state replaced.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
cont_state(ContState, Scanner = #xmerl_scanner{fun_states = FunStates}) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{cont = ContState}}.
%% @spec file(Filename::string()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @equiv file(Filename, [])
file(Filename) ->
    NoOptions = [],
    file(Filename, NoOptions).
%% @spec file(Filename::string(), Options::option_list()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @doc Parse file containing an XML document.
%% On success the scanner's close function is called and
%% <code>{Result, Rest}</code> is returned; any other outcome of the
%% internal scan is returned as <code>{error, Reason}</code>.
file(Filename, Options) ->
    %% The value of an {encoding, Enc} option, or 'undefined' if absent.
    ExtCharset =
        case lists:keysearch(encoding, 1, Options) of
            false ->
                undefined;
            {value, {_, Enc}} ->
                Enc
        end,
    case int_file(Filename, Options, ExtCharset) of
        {error, _Reason} = Error ->
            Error;
        {Res, Tail, Final = #xmerl_scanner{close_fun = Close}} ->
            %% Called for its side effects only; the state it returns is
            %% discarded.
            Close(Final),
            {Res, Tail};
        Other ->
            {error, Other}
    end.
%% Reads the whole file Path and scans its contents as an XML document via
%% int_string/3, using the file's directory as XML base. A failed read is
%% returned unchanged. The third argument is ignored.
int_file(Path, Options, _ExtCharset) ->
    case file:read_file(Path) of
        {ok, Contents} ->
            Text = binary_to_list(Contents),
            XMLBase = filename:dirname(Path),
            int_string(Text, Options, XMLBase);
        ReadError ->
            ReadError
    end.
%% Same as int_file/3 but hands the file contents to int_string_decl/3.
%% A failed read is returned unchanged. The third argument is ignored.
int_file_decl(Path, Options, _ExtCharset) ->
    case file:read_file(Path) of
        {ok, Contents} ->
            Text = binary_to_list(Contents),
            XMLBase = filename:dirname(Path),
            int_string_decl(Text, Options, XMLBase);
        ReadError ->
            ReadError
    end.
%% @spec string(Text::list()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @equiv string(Text, [])
string(Text) ->
    NoOptions = [],
    string(Text, NoOptions).
%% @spec string(Text::list(),Options::option_list()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @doc Parse string containing an XML document.
%% On success the scanner's close function is called and
%% <code>{Result, Rest}</code> is returned; any other outcome of the
%% internal scan is returned as <code>{error, Reason}</code>.
string(Text, Options) ->
    case int_string(Text, Options) of
        {error, _Reason} = Error ->
            %% (This can't happen, currently)
            Error;
        {Res, Tail, Final = #xmerl_scanner{close_fun = Close}} ->
            %% Called for its side effects only; the state it returns is
            %% discarded.
            Close(Final),
            {Res, Tail};
        Other ->
            {error, Other}
    end.
%% Scans Text with the current working directory as XML base.
%% Crashes with badmatch if the working directory cannot be determined.
int_string(Text, Options) ->
    {ok, Cwd} = file:get_cwd(),
    int_string(Text, Options, Cwd).
%% Builds the initial scanner state from Options and XMLBase, lets
%% xmerl_lib:detect_charset/2 inspect the input, records the reported
%% character set (if any) in the state's encoding field and scans the
%% remaining input as a complete document.
int_string(Str, Options, XMLBase) ->
    S = initial_state0(Options, XMLBase),
    {Str2, S1} =
        case xmerl_lib:detect_charset(S#xmerl_scanner.encoding, Str) of
            {auto, 'iso-10646-utf-1', Rest} ->
                {Rest, S#xmerl_scanner{encoding = "iso-10646-utf-1"}};
            {external, 'iso-10646-utf-1', Rest} ->
                {Rest, S#xmerl_scanner{encoding = "iso-10646-utf-1"}};
            {undefined, undefined, Rest} ->
                {Rest, S};
            {external, ExtCharset, Rest} ->
                {Rest, S#xmerl_scanner{encoding = atom_to_list(ExtCharset)}}
        end,
    scan_document(Str2, S1).
%% Same set-up as int_string/3, but the input is handed to scan_decl/2
%% instead of scan_document/2.
int_string_decl(Str, Options, XMLBase) ->
    S = initial_state0(Options, XMLBase),
    {Str2, S1} =
        case xmerl_lib:detect_charset(S#xmerl_scanner.encoding, Str) of
            {auto, 'iso-10646-utf-1', Rest} ->
                {Rest, S#xmerl_scanner{encoding = "iso-10646-utf-1"}};
            {external, 'iso-10646-utf-1', Rest} ->
                {Rest, S#xmerl_scanner{encoding = "iso-10646-utf-1"}};
            {undefined, undefined, Rest} ->
                {Rest, S};
            {external, ExtCharset, Rest} ->
                {Rest, S#xmerl_scanner{encoding = atom_to_list(ExtCharset)}}
        end,
    scan_decl(Str2, S1).
-
%% Creates a scanner state whose callbacks are this module's default
%% implementations and whose XML base is XMLBase, then applies the user
%% supplied Options on top of it.
initial_state0(Options, XMLBase) ->
    Defaults = #xmerl_scanner{
                  event_fun        = fun event/2,
                  hook_fun         = fun hook/2,
                  acc_fun          = fun acc/3,
                  fetch_fun        = fun fetch/2,
                  close_fun        = fun close/1,
                  continuation_fun = fun cont/3,
                  rules_read_fun   = fun rules_read/3,
                  rules_write_fun  = fun rules_write/4,
                  rules_delete_fun = fun rules_delete/3,
                  xmlbase          = XMLBase
                 },
    initial_state(Options, Defaults).
%% Folds the option list into the scanner state, one option per clause.
%% An option that matches no clause (including a non-boolean flag for
%% namespace_conformant, validation or quiet) raises function_clause.
%% When the list is exhausted and no rules table was supplied, a fresh
%% public ets table is created for the rules.

%% --- callbacks, optionally with their private state ---
initial_state([{event_fun, Fun} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{event_fun = Fun});
initial_state([{event_fun, Fun, EventState} | Opts], S) ->
    initial_state(Opts, event_state(EventState, S#xmerl_scanner{event_fun = Fun}));
initial_state([{acc_fun, Fun} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{acc_fun = Fun});
initial_state([{hook_fun, Fun} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{hook_fun = Fun});
initial_state([{hook_fun, Fun, HookState} | Opts], S) ->
    initial_state(Opts, hook_state(HookState, S#xmerl_scanner{hook_fun = Fun}));
initial_state([{close_fun, Fun} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{close_fun = Fun});
initial_state([{fetch_fun, Fun} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{fetch_fun = Fun});
initial_state([{fetch_fun, Fun, FetchState} | Opts], S) ->
    initial_state(Opts, fetch_state(FetchState, S#xmerl_scanner{fetch_fun = Fun}));
initial_state([{fetch_path, Path} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{fetch_path = Path});
initial_state([{continuation_fun, Fun} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{continuation_fun = Fun});
initial_state([{continuation_fun, Fun, ContState} | Opts], S) ->
    initial_state(Opts, cont_state(ContState,
                                   S#xmerl_scanner{continuation_fun = Fun}));
%% --- rules storage: user supplied rules are kept after the scan ---
initial_state([{rules, Rules} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{rules = Rules, keep_rules = true});
initial_state([{rules, Read, Write, RulesState} | Opts], S) ->
    WithFuns = S#xmerl_scanner{rules_read_fun = Read,
                               rules_write_fun = Write,
                               keep_rules = true},
    initial_state(Opts, rules_state(RulesState, WithFuns));
%% --- plain settings ---
initial_state([{user_state, UserState} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{user_state = UserState});
initial_state([{space, Space} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{space = Space});
initial_state([{line, Line} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{line = Line});
initial_state([{namespace_conformant, Flag} | Opts], S)
  when Flag =:= true; Flag =:= false ->
    initial_state(Opts, S#xmerl_scanner{namespace_conformant = Flag});
initial_state([{validation, Flag} | Opts], S)
  when Flag =:= true; Flag =:= false ->
    initial_state(Opts, S#xmerl_scanner{validation = Flag});
initial_state([{quiet, Flag} | Opts], S)
  when Flag =:= true; Flag =:= false ->
    initial_state(Opts, S#xmerl_scanner{quiet = Flag});
initial_state([{doctype_DTD, DTD} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{doctype_DTD = DTD});
initial_state([{text_decl, Bool} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{text_decl = Bool});
initial_state([{environment, Env} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{environment = Env});
initial_state([{xmlbase, Dir} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{xmlbase = Dir});
initial_state([{encoding, Enc} | Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{encoding = Enc});
%% --- end of options ---
initial_state([], S = #xmerl_scanner{rules = undefined}) ->
    RulesTab = ets:new(rules, [set, public]),
    S#xmerl_scanner{rules = RulesTab};
initial_state([], S) ->
    S.
- %%% -----------------------------------------------------
- %%% Default modifier functions
%%% Hooks:
%%% - {element, Line, Name, Attrs, Content}
%%% - {processing_instruction, Line, Data}
%%% The default hook passes the parsed entity and the scanner state
%%% through unchanged.
hook(Entity, Scanner) ->
    {Entity, Scanner}.
%%% Events:
%%%
%%% #xmerl_event{event : started | ended,
%%%              line  : integer(),
%%%              col   : integer(),
%%%              data}
%%%
%%% Data            Events
%%% document        started, ended
%%% #xmlElement     started, ended
%%% #xmlAttribute   ended
%%% #xmlPI          ended
%%% #xmlComment     ended
%%% #xmlText        ended
%%%
%%% The default event function ignores the event and returns the scanner
%%% state unchanged.
event(_Event, Scanner) ->
    Scanner.
%% The acc/3 function can return either {Acc', S'} or {Acc', Pos', S'},
%% where Pos' can be derived from X#xmlElement.pos, X#xmlText.pos, or
%% X#xmlAttribute.pos (whichever is the current object type.)
%% The acc/3 function is not allowed to redefine the type of object
%% being defined, but _is_ allowed to either ignore it or split it
%% into multiple objects (in which case {Acc',Pos',S'} should be returned.)
%% If {Acc',S'} is returned, Pos will be incremented by 1 by default.
%% Below is an example of an acceptable operation: text nodes get their
%% value flattened before being pushed onto the accumulator, every other
%% object is pushed as it is.
acc(#xmlText{value = Text} = TextNode, Acc, S) ->
    Flat = lists:flatten(Text),
    {[TextNode#xmlText{value = Flat} | Acc], S};
acc(Other, Acc, S) ->
    {[Other | Acc], S}.
%% Default fetch function: for both system and public identifiers only
%% the URI is used to locate the resource; the public id is ignored.
fetch({system, SystemURI}, Scanner) ->
    fetch_URI(SystemURI, Scanner);
fetch({public, _PublicID, SystemURI}, Scanner) ->
    fetch_URI(SystemURI, Scanner).
%%% Always assume an external resource can be found locally! Thus
%%% don't bother fetching with e.g. HTTP. Returns the path where the
%%% resource is found. The path to the external resource is given by
%%% URI directly or the option fetch_path (additional paths) or
%%% directory (base path to external resource).
%%% Returns {ok, Location, S} where Location is whatever path_locate/3
%%% yields for the fetch path, the last path component and the full name.
fetch_URI(URI, S) ->
    %% assume URI is a filename
    Parts = filename:split(URI),
    Filename =
        case Parts of
            [] -> [];
            _  -> lists:last(Parts)
        end,
    Fullname =
        case Parts of %% how about Windows systems?
            ["file:" | Name] ->
                %% absolute path, see RFC2396 sect 3
                %% file:/dtd_name
                filename:join(["/" | Name]);
            ["/" | Rest] when Rest /= [] ->
                %% absolute path name
                URI;
            ["http:" | _Rest] ->
                {http, URI};
            [] ->
                %% empty systemliteral
                [];
            _ ->
                %% anything else is taken relative to the XML base
                filename:join(S#xmerl_scanner.xmlbase, URI)
        end,
    Path = path_locate(S#xmerl_scanner.fetch_path, Filename, Fullname),
    ?dbg("fetch(~p) -> {file, ~p}.~n", [URI, Path]),
    {ok, Path, S}.
%% path_locate(Dirs, FileName, FullName)
%% An {http, _} full name and an empty full name are returned as they are.
%% Otherwise each directory in Dirs is tried in order and the first one
%% holding a regular file called FileName wins; if none does, the result is
%% {file, FullName}.
path_locate(_Dirs, _FileName, {http, _} = URI) ->
    URI;
path_locate(_Dirs, _FileName, []) ->
    [];
path_locate([Dir | MoreDirs], FileName, FullName) ->
    Candidate = filename:join(Dir, FileName),
    case file:read_file_info(Candidate) of
        {ok, #file_info{type = regular}} ->
            {file, Candidate};
        _ ->
            path_locate(MoreDirs, FileName, FullName)
    end;
path_locate([], _FileName, FullName) ->
    {file, FullName}.
%% Default continuation function: never asks for more input, it simply
%% applies the end-of-input handler to the third argument.
cont(_Continue, AtEof, State) ->
    AtEof(State).
%% Default close function: returns the scanner state unchanged.
close(Scanner) ->
    Scanner.
- %%% -----------------------------------------------------
- %%% Scanner
%%% [1] document ::= prolog element Misc*
%%% Emits a `started' document event, scans prolog, root element and
%%% trailing Misc, emits the `ended' document event and - when validation
%%% is on and the environment is `element' or `prolog' - validates the
%%% result. Returns {Result, Tail, FinalState}.
scan_document(Str0, S = #xmerl_scanner{event_fun = Event,
                                       line = L, col = C,
                                       environment = Env,
                                       encoding = Charset,
                                       validation = ValidateResult}) ->
    Started = #xmerl_event{event = started,
                           line = L,
                           col = C,
                           data = document},
    S1 = Event(Started, S),

    %% Transform to given character set.
    %% Note that if another character set is given in the encoding
    %% attribute in a XML declaration that one will be used later
    Str = case Charset of
              undefined ->
                  %% Default character set is UTF-8
                  Str0;
              _ ->
                  %% NOTE(review): list_to_atom/1 on a caller- or
                  %% document-supplied encoding name creates atoms from
                  %% external data.
                  ucs:to_unicode(Str0, list_to_atom(Charset))
          end,

    %% The prolog must leave us at the '<' of the root element.
    {"<" ++ T2, S2} = scan_prolog(Str, S1, 1),
    {Res, T3, S3} = scan_element(T2, S2, 1),
    {Tail, S4} = scan_misc(T3, S3, 1),

    Ended = #xmerl_event{event = ended,
                         line = S4#xmerl_scanner.line,
                         col = S4#xmerl_scanner.col,
                         data = document},
    S5 = #xmerl_scanner{} = Event(Ended, S4),

    S6 = case ValidateResult of
             true when Env == element; Env == prolog ->
                 check_decl2(S5),
                 Verdict =
                     case xmerl_validate:validate(S5, Res) of
                         {'EXIT', {error, Why}} -> {failed, Why};
                         {'EXIT', Why}          -> {failed, Why};
                         {error, Why}           -> {failed, Why};
                         {error, Why, _Next}    -> {failed, Why};
                         _XML                   -> passed
                     end,
                 %% The rules table is released whatever the verdict.
                 S5b = cleanup(S5),
                 case Verdict of
                     passed ->
                         S5b;
                     {failed, Reason} ->
                         ?fatal({failed_validation, Reason}, S5b)
                 end;
             true ->
                 cleanup(S5);
             false ->
                 cleanup(S5)
         end,
    {Res, Tail, S6}.
%% Scans only the prolog of Str (after emitting a `started' document
%% event) and returns a triple depending on what follows it:
%%  - markup ("<..."): {{UserState, Rest}, [], S}
%%  - nothing:         {[], [], S}
%%  - anything else:   the rest is run through scan_content/8 for its
%%                     effect on the state and {Rest, [], S} is returned.
scan_decl(Str, S = #xmerl_scanner{event_fun = Event,
                                  line = L, col = C}) ->
    Started = #xmerl_event{event = started,
                           line = L,
                           col = C,
                           data = document},
    S1 = Event(Started, S),

    case scan_prolog(Str, S1, 1) of
        {[], S2} ->
            {[], [], S2};
        {"<" ++ _ = T2, S2} ->
            {{S2#xmerl_scanner.user_state, T2}, [], S2};
        {T2, S2} ->
            Space = S2#xmerl_scanner.space,
            {_, _, S3} = scan_content(T2, S2, [], [], Space,
                                      [], [], #xmlNamespace{}),
            {T2, [], S3}
    end.
%%% [22] Prolog
%%% prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
%%%
%%% Returns {Rest, S}.
%% Text declaration may be empty
scan_prolog([], S = #xmerl_scanner{text_decl = true}, _Pos) ->
    {[], S};
scan_prolog([], S = #xmerl_scanner{continuation_fun = F}, Pos) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos) end,
      fun(S1) -> {[], S1} end,
      S);
scan_prolog("<?xml" ++ T,
            S0 = #xmerl_scanner{encoding = Charset0, col = Col, line = L},
            Pos)
  when ?whitespace(hd(T)) ->
    %% An XML (or text) declaration is only accepted at line 1, column 1.
    {Charset, T3, S3} =
        if
            Col == 1, L == 1, S0#xmerl_scanner.text_decl == true ->
                ?dbg("prolog(\"<?xml\")~n", []),
                ?bump_col(5),
                %% Continue from the bumped state S (not S0) so that the
                %% five columns consumed by "<?xml" are accounted for.
                {_, T1, S1} = mandatory_strip(T, S),
                {Decl, T2, S2} = scan_text_decl(T1, S1),
                Encoding = Decl#xmlDecl.encoding,
                {Encoding, T2, S2#xmerl_scanner{encoding = Encoding}};
            Col == 1, L == 1 ->
                ?dbg("prolog(\"<?xml\")~n", []),
                ?bump_col(5),
                %% Same here: scan from the bumped state S.
                {Decl, T2, S2} = scan_xml_decl(T, S),
                Encoding = Decl#xmlDecl.encoding,
                {Encoding, T2, S2#xmerl_scanner{encoding = Encoding}};
            true ->
                ?fatal({xml_declaration_must_be_first_in_doc, Col, L}, S0)
        end,
    %% Now transform to declared character set.
    if
        Charset == Charset0 ->
            %% Document already transformed to this charset!
            scan_prolog(T3, S3, Pos);
        Charset0 =/= undefined ->
            %% Document transformed to other bad charset!
            ?fatal({xml_declaration_must_be_first_in_doc, Col, L}, S3);
        Charset =/= undefined ->
            %% Document not previously transformed
            %% NOTE(review): list_to_atom/1 is applied to the encoding
            %% name taken from the document, i.e. to external data.
            T4 = ucs:to_unicode(T3, list_to_atom(Charset)),
            scan_prolog(T4, S3, Pos);
        true ->
            %% No encoding info given
            scan_prolog(T3, S3, Pos)
    end;
scan_prolog("<!DOCTYPE" ++ T, S0 = #xmerl_scanner{environment = prolog,
                                                  encoding = Charset}, Pos) ->
    ?dbg("prolog(\"<!DOCTYPE\")~n", []),
    ?bump_col(9),
    %% If no known character set assume it is UTF-8
    T1 = if
             Charset == undefined -> ucs:to_unicode(T, 'utf-8');
             true -> T
         end,
    {T2, S1} = scan_doctype(T1, S),
    scan_misc(T2, S1, Pos);
scan_prolog(Str, S0 = #xmerl_scanner{user_state = _US, encoding = Charset},
            Pos) ->
    ?dbg("prolog(\"<\")~n", []),

    %% Check for Comments, PI before possible DOCTYPE declaration
    ?bump_col(1),
    %% If no known character set assume it is UTF-8
    T = if
            Charset == undefined -> ucs:to_unicode(Str, 'utf-8');
            true -> Str
        end,
    {T1, S1} = scan_misc(T, S, Pos),
    scan_prolog2(T1, S1, Pos).
%% Second half of the prolog, entered once leading Misc has been consumed:
%% handles an optional DOCTYPE declaration (prolog environment only), jumps
%% straight to the external subset for other "<!" markup, and otherwise
%% skips any further Misc. Returns whatever the delegated scanner returns.
scan_prolog2([], S = #xmerl_scanner{continuation_fun = ContFun}, Pos) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos) end,
    AtEof = fun(S1) -> {[], S1} end,
    ContFun(Resume, AtEof, S);
scan_prolog2("<!DOCTYPE" ++ T, S0 = #xmerl_scanner{environment = prolog},
             Pos) ->
    ?dbg("prolog(\"<!DOCTYPE\")~n", []),
    ?bump_col(9),
    {AfterDoctype, S1} = scan_doctype(T, S),
    scan_misc(AfterDoctype, S1, Pos);
scan_prolog2("<!" ++ _ = Str, S, _Pos) ->
    ?dbg("prolog(\"<!\")~n", []),
    %% In e.g. a DTD, we jump directly to markup declarations
    scan_ext_subset(Str, S);
scan_prolog2(Str, S0 = #xmerl_scanner{}, Pos) ->
    ?dbg("prolog(\"<\")~n", []),

    %% Check for more Comments and PI after DOCTYPE declaration
    ?bump_col(1),
    scan_misc(Str, S, Pos).
%%% [27] Misc ::= Comment | PI | S
%% Note:
%% - Neither of Comment and PI are returned in the resulting parsed
%%   structure.
%% - scan_misc/3 implements Misc* as that is how the rule is always used
%% Returns {Rest, S} where Rest starts at the first character that is not
%% part of a comment, a processing instruction or whitespace.
scan_misc([], S = #xmerl_scanner{continuation_fun = ContFun}, Pos) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos) end,
    AtEof = fun(S1) -> {[], S1} end,
    ContFun(Resume, AtEof, S);
scan_misc("<!--" ++ T, S, Pos) ->
    %% Comment
    {_Comment, Rest, S1} = scan_comment(T, S, Pos, [], []),
    scan_misc(Rest, S1, Pos);
scan_misc("<?" ++ T, S0, Pos) ->
    %% PI
    ?dbg("prolog(\"<?\")~n", []),
    ?bump_col(2),
    {_PI, Rest, S1} = scan_pi(T, S, Pos),
    scan_misc(Rest, S1, Pos);
scan_misc([H | Rest], S, Pos) when ?whitespace(H) ->
    ?dbg("prolog(whitespace)~n", []),
    scan_misc(Rest, S, Pos);
scan_misc(Rest, S, _Pos) ->
    {Rest, S}.
%% Deletes the rules ets table and clears the rules field, unless the
%% state is flagged to keep its rules; in that case the state is returned
%% untouched.
cleanup(#xmerl_scanner{keep_rules = false, rules = RulesTab} = Scanner) ->
    ets:delete(RulesTab),
    Scanner#xmerl_scanner{rules = undefined};
cleanup(Scanner) ->
    Scanner.
%%% Prolog and Document Type Declaration XML 1.0 Section 2.8
%% [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
%% [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
%% Scans the mandatory version attribute and continues with the rest of
%% the declaration in scan_xml_decl/3. A missing "version" keyword crashes
%% with badmatch.
scan_xml_decl(Input, Scanner) ->
    %% VersionInfo [24] is mandatory
    {_, AfterSpace, S1} = mandatory_strip(Input, Scanner),
    "version" ++ AfterKeyword = AfterSpace,
    {AfterEq, S2} = scan_eq(AfterKeyword, S1),
    {Vsn, Rest, S3} = scan_xml_vsn(AfterEq, S2),
    VersionAttr = #xmlAttribute{name = version,
                                parents = [{xml, 1}],
                                value = Vsn},
    scan_xml_decl(Rest, S3, #xmlDecl{attributes = [VersionAttr]}).
%% Continues an XML declaration after the version attribute: either the
%% declaration ends here ("?>") or mandatory whitespace is followed by the
%% optional parts handled in scan_xml_decl2/3.
scan_xml_decl([], S = #xmerl_scanner{continuation_fun = ContFun}, Decl) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_xml_decl(MoreBytes, S1, Decl) end,
    AtEof = fun(S1) -> {[], [], S1} end,
    ContFun(Resume, AtEof, S);
scan_xml_decl("?>" ++ T, S0, Decl) ->
    ?bump_col(2),
    return_xml_decl(T, S, Decl);
scan_xml_decl(T, S = #xmerl_scanner{}, Decl) ->
    {_, AfterSpace, S1} = mandatory_strip(T, S),
    scan_xml_decl2(AfterSpace, S1, Decl).
%% Handles what may follow the version attribute of an XML declaration:
%% the end of the declaration, an encoding declaration, or a standalone
%% declaration (delegated to scan_xml_decl3/3). Any other input raises
%% function_clause.
scan_xml_decl2("?>" ++ T, S0, Decl) ->
    ?bump_col(2),
    return_xml_decl(T, S, Decl);
scan_xml_decl2("encoding" ++ T, S0 = #xmerl_scanner{event_fun = Event},
               Decl0 = #xmlDecl{attributes = Attrs}) ->
    %% [80] EncodingDecl
    ?bump_col(8),
    {T1, S1} = scan_eq(T, S),
    {EncName, T2, S2} = scan_enc_name(T1, S1),
    %% The encoding name is stored in lower case. string:to_lower/1 (stdlib)
    %% is used instead of the deprecated inets helper httpd_util:to_lower/1.
    LowEncName = string:to_lower(EncName),
    Attr = #xmlAttribute{name = encoding,
                         parents = [{xml, 1}],
                         value = LowEncName},
    Decl = Decl0#xmlDecl{encoding = LowEncName,
                         attributes = [Attr | Attrs]},
    %% The attribute event carries the position at which the "encoding"
    %% keyword started (taken from S0).
    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = S0#xmerl_scanner.line,
                                               col = S0#xmerl_scanner.col,
                                               data = Attr}, S2),
    case T2 of
        "?>" ++ _T3 ->
            scan_xml_decl3(T2, S3, Decl);
        _ ->
            %% Anything but the end of the declaration must be separated
            %% from the encoding value by whitespace.
            {_, T3, S4} = mandatory_strip(T2, S3),
            scan_xml_decl3(T3, S4, Decl)
    end;
scan_xml_decl2(T = "standalone" ++ _T, S, Decl) ->
    scan_xml_decl3(T, S, Decl).
%% Last part of an XML declaration: either its end ("?>") or a standalone
%% declaration, which must itself be followed by optional whitespace and
%% "?>" (anything else crashes with badmatch or function_clause).
scan_xml_decl3("?>" ++ T, S0, Decl) ->
    ?bump_col(2),
    return_xml_decl(T, S, Decl);
scan_xml_decl3("standalone" ++ T, S0 = #xmerl_scanner{event_fun = Event},
               Decl0 = #xmlDecl{attributes = Attrs}) ->
    %% [32] SDDecl
    ?bump_col(10),
    {T1, S1} = scan_eq(T, S),
    {StValue, T2, S2} = scan_standalone_value(T1, S1),
    Attr = #xmlAttribute{name = standalone,
                         parents = [{xml, 1}],
                         value = StValue},
    Decl = Decl0#xmlDecl{standalone = StValue,
                         attributes = [Attr | Attrs]},
    %% The attribute event carries the position at which the "standalone"
    %% keyword started (taken from S0).
    AttrEnded = #xmerl_event{event = ended,
                             line = S0#xmerl_scanner.line,
                             col = S0#xmerl_scanner.col,
                             data = Attr},
    S3 = #xmerl_scanner{} = Event(AttrEnded, S2),
    {_, T3, S4} = strip(T2, S3),
    "?>" ++ T4 = T3,
    %% Account for the two characters of "?>".
    S5 = S4#xmerl_scanner{col = S4#xmerl_scanner.col + 2},
    return_xml_decl(T4, S5, Decl).
%% Finishes an XML declaration: puts the collected attributes back into
%% document order, emits an `ended' event for the declaration, runs the
%% hook function on it and returns {HookResult, Rest, S}.
return_xml_decl(T, S = #xmerl_scanner{hook_fun = Hook,
                                      event_fun = Event},
                Decl0 = #xmlDecl{attributes = Attrs}) ->
    ?strip1,
    Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
    DeclEnded = #xmerl_event{event = ended,
                             line = S#xmerl_scanner.line,
                             col = S#xmerl_scanner.col,
                             data = Decl},
    S2 = #xmerl_scanner{} = Event(DeclEnded, S1),
    {Ret, S3} = Hook(Decl, S2),
    {Ret, T1, S3}.
-
%% Scans the quoted value of a standalone declaration and returns
%% {yes | no, Rest, S}. Only a `yes' value is recorded in the scanner
%% state's standalone field. Any other input raises function_clause.
scan_standalone_value("'yes'" ++ T, S0) ->
    ?bump_col(5),
    {yes, T, S#xmerl_scanner{standalone = yes}};
scan_standalone_value("\"yes\"" ++ T, S0) ->
    ?bump_col(5),
    {yes, T, S#xmerl_scanner{standalone = yes}};
scan_standalone_value("'no'" ++ T, S0) ->
    ?bump_col(4),
    {no, T, S};
scan_standalone_value("\"no\"" ++ T, S0) ->
    ?bump_col(4),
    {no, T, S}.
%%%
%%% Text declaration XML 1.0 section 4.3.1
%%% [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
%%% Scans the optional version and the mandatory encoding declaration and
%%% continues with scan_text_decl/3. A missing "encoding" keyword crashes
%%% with badmatch.
scan_text_decl(T, S = #xmerl_scanner{event_fun = Event}) ->
    {#xmlDecl{attributes = Attrs} = Decl0, T1, S1} = scan_optional_version(T, S),
    "encoding" ++ T2 = T1,
    %% Account for the eight characters of "encoding".
    S2 = S1#xmerl_scanner{col = S1#xmerl_scanner.col + 8},
    {T3, S3} = scan_eq(T2, S2),
    {EncName, T4, S4} = scan_enc_name(T3, S3),
    %% The encoding name is stored in lower case. string:to_lower/1 (stdlib)
    %% is used instead of the deprecated inets helper httpd_util:to_lower/1.
    LowEncName = string:to_lower(EncName),
    ?strip5,
    Attr = #xmlAttribute{name = encoding,
                         parents = [{xml, 1}],
                         value = LowEncName},
    Decl = Decl0#xmlDecl{encoding = LowEncName,
                         attributes = [Attr | Attrs]},
    S6 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = S5#xmerl_scanner.line,
                                               col = S5#xmerl_scanner.col,
                                               data = Attr}, S5),
    scan_text_decl(T5, S6, Decl).
%% Finishes a text declaration, which must end right here with "?>" (any
%% other input raises function_clause): puts the attributes back into
%% document order, emits an `ended' event positioned at the state on
%% entry, runs the hook function and returns {HookResult, Rest, S}.
scan_text_decl("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
                                              event_fun = Event},
               Decl0 = #xmlDecl{attributes = Attrs}) ->
    ?bump_col(2),
    ?strip1,
    Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
    DeclEnded = #xmerl_event{event = ended,
                             line = S0#xmerl_scanner.line,
                             col = S0#xmerl_scanner.col,
                             data = Decl},
    S2 = #xmerl_scanner{} = Event(DeclEnded, S1),
    {Ret, S3} = Hook(Decl, S2),
    {Ret, T1, S3}.
%% Scans the optional version attribute of a text declaration. Returns
%% {Decl, Rest, S} where Decl holds the version attribute if one was
%% present (it must then be followed by whitespace) and no attributes
%% otherwise.
scan_optional_version("version" ++ T, S0) ->
    ?bump_col(7),
    ?strip1,
    {AfterEq, S2} = scan_eq(T1, S1),
    {Vsn, AfterVsn, S3} = scan_xml_vsn(AfterEq, S2),
    {_, Rest, S4} = mandatory_strip(AfterVsn, S3),
    VersionAttr = #xmlAttribute{name = version,
                                parents = [{xml, 1}],
                                value = Vsn},
    {#xmlDecl{attributes = [VersionAttr]}, Rest, S4};
scan_optional_version(T, S) ->
    {#xmlDecl{attributes = []}, T, S}.
-
%%%%%%% [81] EncName
%%% Scans a quoted encoding name. The first character must be the opening
%%% quote, either " or '; the same character then terminates the name (see
%%% scan_enc_name/4). Returns {EncName, Rest, S}.
scan_enc_name([], S = #xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1) end,
      fun(S1) -> ?fatal(expected_encoding_name, S1) end,
      S);
%% Only a double or single quote may open the name. The previous guard
%% (H >= $"; H =< $') was a disjunction that holds for every character, so
%% any character was taken as the delimiter.
scan_enc_name([H | T], S0) when H == $"; H == $' ->
    ?bump_col(1),
    scan_enc_name(T, S, H, []).
%% Scans the first character of an encoding name, which must be a letter
%% (A-Z or a-z); anything else is a fatal error. The remaining characters
%% are handled by scan_enc_name2/4.
scan_enc_name([], S = #xmerl_scanner{continuation_fun = ContFun}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) ->
                     scan_enc_name(MoreBytes, S1, Delim, Acc)
             end,
    AtEof = fun(S1) -> ?fatal(expected_encoding_name, S1) end,
    ContFun(Resume, AtEof, S);
scan_enc_name([H | T], S0, Delim, Acc)
  when H >= $a, H =< $z;
       H >= $A, H =< $Z ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H | Acc]);
scan_enc_name([H | _T], S, _Delim, _Acc) ->
    ?fatal({error, {unexpected_character_in_Enc_Name, H}}, S).
%% Scans the remainder of an encoding name up to the closing delimiter and
%% returns {EncName, Rest, S}. Letters, digits, '.', '_' and '-' are
%% accepted; any other character raises function_clause. The delimiter
%% clause comes first, so the delimiter always ends the name.
scan_enc_name2([], S = #xmerl_scanner{continuation_fun = ContFun}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) ->
                     scan_enc_name2(MoreBytes, S1, Delim, Acc)
             end,
    AtEof = fun(S1) -> ?fatal(expected_encoding_name, S1) end,
    ContFun(Resume, AtEof, S);
scan_enc_name2([Delim | T], S0, Delim, Acc) ->
    ?bump_col(1),
    {lists:reverse(Acc), T, S};
scan_enc_name2([H | T], S0, Delim, Acc)
  when H >= $a, H =< $z;
       H >= $A, H =< $Z;
       H >= $0, H =< $9;
       H == $.; H == $_; H == $- ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H | Acc]).
%%%%%%% [26] VersionNum
%%% VersionNum ::= ([a-zA-Z0-9_.:] | '-')+
%%% Expects the opening quote (" or ') of a version number and lets
%%% xml_vsn/4 collect the value up to the matching quote. Any other first
%%% character raises function_clause.
scan_xml_vsn([], S = #xmerl_scanner{continuation_fun = ContFun}) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_xml_vsn(MoreBytes, S1) end,
    AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
    ContFun(Resume, AtEof, S);
scan_xml_vsn([Quote | T], S = #xmerl_scanner{col = Col})
  when Quote == $"; Quote == $' ->
    xml_vsn(T, S#xmerl_scanner{col = Col + 1}, Quote, []).
%% Collects the characters of a version number up to the closing
%% delimiter and returns {VersionNum, Rest, S}, advancing the column by
%% one per character. Letters, digits and the characters "_.:-" are
%% accepted; anything else is a fatal error.
xml_vsn([], S = #xmerl_scanner{continuation_fun = ContFun}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> xml_vsn(MoreBytes, S1, Delim, Acc) end,
    AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
    ContFun(Resume, AtEof, S);
xml_vsn([Delim | T], S = #xmerl_scanner{col = Col}, Delim, Acc) ->
    {lists:reverse(Acc), T, S#xmerl_scanner{col = Col + 1}};
xml_vsn([H | T], S = #xmerl_scanner{col = Col}, Delim, Acc)
  when H >= $a, H =< $z;
       H >= $A, H =< $Z;
       H >= $0, H =< $9;
       H =:= $_; H =:= $.; H =:= $:; H =:= $- ->
    xml_vsn(T, S#xmerl_scanner{col = Col + 1}, Delim, [H | Acc]);
xml_vsn([H | _T], S = #xmerl_scanner{}, _Delim, _Acc) ->
    ?fatal({invalid_vsn_char, H}, S).
%%%%%%% [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
%%
%% scan_pi(Str, S, Pos)
%% Entry point for a processing instruction; Str starts at the PI
%% target.  The line and column at entry are captured and passed on to
%% scan_pi/7.  Targets whose first three characters are [xX][mM][lL]
%% are routed to scan_wellknown_pi/3; every other target is read with
%% scan_name/2.
scan_pi([], S=#xmerl_scanner{continuation_fun = Cont}, Pos) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Pos) end,
    GiveUp = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, GiveUp, S);
scan_pi(Str = [C1,C2,C3 | Rest], S=#xmerl_scanner{line = Line, col = Col}, Pos)
  when C1==$x; C1==$X ->
    %% names beginning with [xX][mM][lL] are reserved for future use.
    Reserved = (C2==$m orelse C2==$M) andalso (C3==$l orelse C3==$L),
    case Reserved of
        true ->
            scan_wellknown_pi(Rest, S, Pos);
        false ->
            {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
            scan_pi(T1, S1, Target, Line, Col, Pos, [])
    end;
scan_pi(Str, S=#xmerl_scanner{line = Line, col = Col}, Pos) ->
    {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
    scan_pi(T1, S1, Target, Line, Col, Pos, []).
%%% More info on xml-stylesheet can be found at:
%%%  "Associating Style Sheets with XML documents", Version 1.0,
%%%  W3C Recommendation 29 June 1999 (http://www.w3.org/TR/xml-stylesheet/)
%%
%% scan_wellknown_pi(Str, S, Pos)
%% Str is what follows an [xX][mM][lL] target prefix.  Only
%% "-stylesheet" is accepted, which yields the target "xml-stylesheet".
%% Anything else is a fatal error that carries at most the first ten
%% characters of Str.
%% NOTE(review): the column is advanced by 16, the length of
%% "<?xml-stylesheet"; scan_markup_decl/2 already advances by 2 for
%% "<?" before calling scan_pi/3 -- confirm this is intended.
scan_wellknown_pi("-stylesheet"++Rest, S0=#xmerl_scanner{line=Line,col=Col}, Pos) ->
    ?dbg("prolog(\"<?xml-stylesheet\")~n", []),
    ?bump_col(16),
    scan_pi(Rest, S, "xml-stylesheet", Line, Col, Pos, []);
scan_wellknown_pi(Unknown, S, _Pos) ->
    Shown = lists:sublist(Unknown, 1, 10),
    ?fatal({invalid_target_name, Shown}, S).
- % scan_pi(Str="?>"++_T,S,Target, L, C, Pos) ->
- % scan_pi(Str,S,Target, L, C, Pos,[]);
- % scan_pi(Str=[],S,Target, L, C, Pos) ->
- % scan_pi(Str,S,Target, L, C, Pos,[]);
- % scan_pi(T,S,Target, L, C, Pos) ->
- % {_,T1,S1} = mandatory_strip(T,S),
- % scan_pi(T1,S1,Target, L, C, Pos,[]).
%% scan_pi(Str, S, Target, L, C, Pos, Acc) -> {Ret, Rest, S1}
%%
%% Called right after the PI target has been read.  L and C are the
%% line and column where the PI started; they are reported in the
%% 'ended' event.  Three continuations are legal here:
%%   * "?>"       - the PI has no data; the event fun and then the
%%                  hook fun are called with the #xmlPI{} record,
%%   * whitespace - it is stripped and scan_pi2/7 reads the PI data,
%%   * no input   - the continuation fun is asked for more.
%% Any other character is reported as a fatal error instead of
%% crashing with function_clause.
scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Target, L, C, Pos, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Target, L, C, Pos, Acc) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_pi("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
                                       event_fun = Event},
        Target, L, C, Pos, Acc) ->
    ?bump_col(2),
    PI = #xmlPI{name = Target,
                pos = Pos,
                value = lists:reverse(Acc)},
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = L,
                                               col = C,
                                               data = PI}, S),
    {Ret, S2} = Hook(PI, S1),
    {Ret, T, S2};
scan_pi([H|T], S, Target, L, C, Pos, Acc) when ?whitespace(H) ->
    ?strip1,
    scan_pi2(T1, S1, Target, L, C, Pos, Acc);
scan_pi([H|_T], S, _Target, _L, _C, _Pos, _Acc) ->
    %% The target must be followed by whitespace or by "?>".
    ?fatal({expected_whitespace_OR_end_of_PI,{char,H}}, S).
%% scan_pi2(Str, S, Target, L, C, Pos, Acc) -> {Ret, Rest, S1}
%%
%% Reads the data part of a processing instruction.  Each character is
%% checked with wfc_legal_char/2 and pushed onto Acc (reverse order)
%% until "?>" is seen.  At that point the #xmlPI{} record is built, the
%% event fun is called with an 'ended' event carrying the start line L
%% and column C, and the hook fun decides the returned value.
scan_pi2([], S=#xmerl_scanner{continuation_fun = Cont}, Target, L, C, Pos, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) ->
                     scan_pi2(MoreBytes, S1, Target, L, C, Pos, Acc)
             end,
    GiveUp = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, GiveUp, S);
scan_pi2("?>" ++ Rest, S0 = #xmerl_scanner{hook_fun = Hook,
                                           event_fun = Event},
         Target, L, C, Pos, Acc) ->
    ?bump_col(2),
    PI = #xmlPI{name = Target,
                pos = Pos,
                value = lists:reverse(Acc)},
    Ended = #xmerl_event{event = ended,
                         line = L,
                         col = C,
                         data = PI},
    S1 = #xmerl_scanner{} = Event(Ended, S),
    {Ret, S2} = Hook(PI, S1),
    {Ret, Rest, S2};
scan_pi2([Ch|Rest], S0, Target, L, C, Pos, Acc) ->
    ?bump_col(1),
    wfc_legal_char(Ch, S),
    scan_pi2(Rest, S, Target, L, C, Pos, [Ch|Acc]).
%% [28] doctypedecl ::=
%%   '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
%%
%% scan_doctype(Str, S)
%% Str is the input following '<!DOCTYPE'.  Mandatory whitespace and
%% the document type name are read, the name is stored in the
%% doctype_name field of the scanner state, and scan_doctype1/2
%% handles the rest of the declaration.
scan_doctype([], S=#xmerl_scanner{continuation_fun = Cont}) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_doctype(MoreBytes, S1) end,
    GiveUp = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, GiveUp, S);
scan_doctype(T, S) ->
    {_, T1, S1} = mandatory_strip(T, S),
    {DocTypeName, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    ?strip3,
    Named = S3#xmerl_scanner{doctype_name = DocTypeName},
    scan_doctype1(T3, Named).
%% [75] ExternalID ::= 'SYSTEM' S SystemLiteral
%%                   | 'PUBLIC' S PubidLiteral S SystemLiteral
%%
%% scan_doctype1(Str, S)
%% Reads the optional external ID that follows the doctype name and
%% passes it to scan_doctype2/3 as {public, PubId, SysLit},
%% {system, SysLit}, or undefined when neither keyword is present.
scan_doctype1([], S=#xmerl_scanner{continuation_fun = Cont}) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_doctype1(MoreBytes, S1) end,
    GiveUp = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, GiveUp, S);
scan_doctype1("PUBLIC" ++ T, S0) ->
    ?bump_col(6),
    {_, T1, S1} = mandatory_strip(T, S),
    {PubId, T2, S2} = scan_pubid_literal(T1, S1),
    {_, T3, S3} = mandatory_strip(T2, S2),
    {SysLit, T4, S4} = scan_system_literal(T3, S3),
    ?strip5,
    ExternalId = {public, PubId, SysLit},
    scan_doctype2(T5, S5, ExternalId);
scan_doctype1("SYSTEM" ++ T, S0) ->
    ?bump_col(6),
    {_, T1, S1} = mandatory_strip(T, S),
    {SysLit, T2, S2} = scan_system_literal(T1, S1),
    ?strip3,
    ExternalId = {system, SysLit},
    scan_doctype2(T3, S3, ExternalId);
scan_doctype1(T, S) ->
    %% No external ID given.
    scan_doctype2(T, S, undefined).
%% scan_doctype2(Str, S, DTD) -> {Rest, S1}
%%
%% Continues the doctype declaration after the optional external ID
%% (DTD is {public,_,_}, {system,_} or undefined).  "[" starts the
%% internal subset, handled by scan_doctype3/3.  ">" ends the
%% declaration: the external DTD is fetched with fetch_DTD/2 and the
%% declarations are checked with check_decl/1.  Any other character is
%% reported as a fatal error instead of crashing with function_clause.
scan_doctype2([], S=#xmerl_scanner{continuation_fun = F}, DTD) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype2(MoreBytes, S1, DTD) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_doctype2("[" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    scan_doctype3(T1, S1, DTD);
scan_doctype2(">" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    S2 = fetch_DTD(DTD, S1),
    check_decl(S2),
    {T1, S2};
scan_doctype2(_T, S, _DTD) ->
    ?fatal(expected_end_of_DOCTYPE_declaration, S).
%% [28a] DeclSep ::= PEReference | S
%% [28b] intSubset ::= (markupdecl | DeclSep)*
%%
%% scan_doctype3(Str, S, DTD) -> {Rest, S1}
%%
%% Scans the internal subset of a doctype declaration.
%%   * "%"  - a parameter entity reference.  If it expands to an
%%            external ID tuple, that DTD is fetched with fetch_DTD/2;
%%            if it expands to a string, the string is put in front of
%%            the remaining input and scanning goes on.
%%   * "]"  - end of the subset.  The external DTD named by DTD is
%%            fetched, the declarations are checked with check_decl/1,
%%            and the closing ">" is consumed.  A missing ">" is a
%%            fatal error rather than a badmatch crash.
%%   * else - a markup declaration, read by scan_markup_decl/2.
scan_doctype3([], S=#xmerl_scanner{continuation_fun = F}, DTD) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype3(MoreBytes, S1, DTD) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_doctype3("%" ++ T, S0, DTD) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ?strip2,
    case expand_pe_reference(PERefName, S2, as_PE) of
        {system, _} = Name ->
            S3 = fetch_DTD(Name, S2),
            scan_doctype3(T2, S3, DTD);
        {public, _} = Name ->
            S3 = fetch_DTD(Name, S2),
            scan_doctype3(T2, S3, DTD);
        {public, _, _} = Name ->
            S3 = fetch_DTD(Name, S2),
            scan_doctype3(T2, S3, DTD);
        ExpRef when is_list(ExpRef) -> % Space added, see Section 4.4.8
            {_, T3, S3} = strip(ExpRef ++ T2, S2),
            scan_doctype3(T3, S3, DTD)
    end;
scan_doctype3("]" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    S2 = fetch_DTD(DTD, S1),
    check_decl(S2),
    case T1 of
        ">" ++ T2 ->
            {T2, S2};
        _ ->
            ?fatal(expected_doctype_end_tag, S2)
    end;
scan_doctype3(T, S, DTD) ->
    {_, T1, S1} = scan_markup_decl(T, S),
    scan_doctype3(T1, S1, DTD).
%% fetch_DTD(DTDSpec, S) -> S1
%%
%% Fetches and parses the external DTD named by DTDSpec.
%%   * undefined with a list in the doctype_DTD field: that value is
%%     used as a system identifier, which allows a DTD to be named
%%     when the XML stream itself does not mention one.
%%   * undefined otherwise: nothing to fetch, S is returned unchanged.
%%   * anything else: handed to fetch_and_parse/3.  If that returns a
%%     scanner record it becomes the new state; if it returns a
%%     three-tuple the old state S is kept.
fetch_DTD(undefined, S=#xmerl_scanner{doctype_DTD=URI}) when is_list(URI) ->
    fetch_DTD({system,URI}, S);
fetch_DTD(undefined, S) ->
    S;
fetch_DTD(DTDSpec, S) ->
    case fetch_and_parse(DTDSpec, S, [{text_decl,true},
                                      {environment,{external,subset}}]) of
        #xmerl_scanner{} = NewS ->
            NewS;
        {_Res,_Tail,_Sx} -> % Continue with old scanner data, result in Rules
            S
    end.
%% fetch_and_parse(ExtSpec, S, Options0)
%%
%% Calls the fetch fun of S for the external entity ExtSpec and, when
%% data was fetched, parses it with a scanner configured from the
%% state the fetch fun returned (plus Options0 and the rules table of
%% the calling scanner).
%%
%% Result of the fetch fun          Action
%%   {ok, NewS}                     NewS is the result (kept for
%%                                  backward compatibility)
%%   {ok, not_fetched, NewS}        NewS is the result
%%   {ok, {file, F}, NewS}          int_file_decl/3 parses the file
%%   {ok, {string, Str}, NewS}      int_string_decl/3 parses the string
%%   {ok, Other, NewS}              {Other, [], NewS} is the result
%%   anything else                  fatal error
%%
%% If the result is a scanner record, its text_decl field is reset to
%% false and its environment is restored from S before it is returned.
fetch_and_parse(ExtSpec, S=#xmerl_scanner{fetch_fun=Fetch,
                                          rules=Rules,
                                          xmlbase = XMLBase},
                Options0) ->
    RetS =
        case Fetch(ExtSpec, S) of
            {ok, NewS} ->
                %% For backward compatibility only. This will be removed later!!
                NewS;
            {ok, not_fetched, NewS} ->
                NewS;
            {ok, DataRet, #xmerl_scanner{} = NewS} ->
                EvS = event_state(NewS),
                HoS = hook_state(NewS),
                FeS = fetch_state(NewS),
                CoS = cont_state(NewS),
                Charset = NewS#xmerl_scanner.encoding,
                Inherited =
                    [{user_state, NewS#xmerl_scanner.user_state},
                     {rules, Rules},
                     {event_fun, NewS#xmerl_scanner.event_fun, EvS},
                     {hook_fun, NewS#xmerl_scanner.hook_fun, HoS},
                     {fetch_fun, NewS#xmerl_scanner.fetch_fun, FeS},
                     {close_fun, NewS#xmerl_scanner.close_fun},
                     {continuation_fun, NewS#xmerl_scanner.continuation_fun, CoS},
                     {rules,
                      NewS#xmerl_scanner.rules_read_fun,
                      NewS#xmerl_scanner.rules_write_fun,
                      ""},
                     {acc_fun, NewS#xmerl_scanner.acc_fun},
                     {validation, NewS#xmerl_scanner.validation},
                     {quiet, NewS#xmerl_scanner.quiet},
                     {encoding, Charset}],
                Options = Options0 ++ Inherited,
                case DataRet of
                    {file, F} ->
                        int_file_decl(F, Options, Charset);
                    {string, String} ->
                        int_string_decl(String, Options, XMLBase);
                    _ ->
                        %% other scheme
                        {DataRet, [], NewS}
                end;
            Error ->
                ?fatal({error_fetching_DTD, {ExtSpec, Error}}, S)
        end,
    case RetS of
        #xmerl_scanner{} ->
            RetS#xmerl_scanner{text_decl=false,
                               environment=S#xmerl_scanner.environment};
        _ ->
            RetS
    end.
%% fetch_not_parse(ExtSpec, S) -> {Data, NewS}
%%
%% Calls the fetch fun of S for ExtSpec and returns the fetched data
%% without parsing it.  {file, F} is read with get_file/2, the payload
%% of {string, Str} is converted with binary_to_list/1, and any other
%% data term is returned as is.  A not_fetched answer or an unexpected
%% reply from the fetch fun is a fatal error.
fetch_not_parse(ExtSpec, S=#xmerl_scanner{fetch_fun=Fetch}) ->
    case Fetch(ExtSpec, S) of
        {ok, not_fetched, _NewS} ->
            ?fatal({error_fetching_external_source,ExtSpec}, S);
        {ok, {file, Path}, NewS} ->
            {get_file(Path, S), NewS};
        {ok, {string, Bin}, NewS} ->
            {binary_to_list(Bin), NewS};
        {ok, Other, NewS} ->
            {Other, NewS};
        _ ->
            ?fatal({error_fetching_external_resource,ExtSpec}, S)
    end.
%% get_file(F, S) -> string()
%%
%% Reads the whole file F and returns its bytes as a list.  Any result
%% other than {ok, Binary} from file:read_file/1 is reported as a
%% fatal error together with the file name.
get_file(Path, S) ->
    ReadResult = file:read_file(Path),
    case ReadResult of
        {ok, Contents} ->
            binary_to_list(Contents);
        _ ->
            ?fatal({error_reading_file,Path,ReadResult}, S)
    end.
%% check_decl/1
%% Some XML types may legally be referenced before they are declared,
%% so once the DTD has been read it must be checked that everything
%% that was referenced has also been declared.  Nothing is checked
%% when validation is false; otherwise notations, elements (including
%% their attribute definitions) and entities in the rules table are
%% checked, in that order.
check_decl(S = #xmerl_scanner{validation = Validation}) ->
    case Validation of
        false ->
            ok;
        _ ->
            Tab = S#xmerl_scanner.rules,
            check_notations(Tab, S),
            check_elements(Tab, S), %% check also attribute defs for element
            check_entities(Tab, S)
    end.
-
%% check_notations(Tab, S)
%%
%% Looks in the rules table Tab for notations still marked as
%% undeclared.  Returns ok when there are none.  With exactly one match
%% the notation name is reported in the fatal error; any other match
%% result is reported as is.
check_notations(Tab, S) ->
    case ets:match(Tab, {{notation,'$1'},undeclared}) of
        [[]] -> ok;
        [] -> ok;
        [[Name | _]] ->
            ?fatal({error_missing_declaration_in_DTD,Name}, S);
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err}, S)
    end.
%% check_elements(Tab, S)
%%
%% Walks all elem_def entries of the rules table Tab in chunks of ten
%% (ets:match/3 with a limit, followed by ets:match/1 on the returned
%% continuation) and runs check_elements2/2 on every match.
check_elements(Tab, S) ->
    case ets:match(Tab, {{elem_def,'_'},'$2'}, 10) of
        {_, _} = FirstChunk ->
            check_elements_chunks(FirstChunk, S);
        '$end_of_table' ->
            ok;
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err}, S)
    end.

%% Checks one chunk of matches and then follows the continuation until
%% the end of the table is reached.
check_elements_chunks('$end_of_table', _S) ->
    ok;
check_elements_chunks({Matches, Continuation}, S) ->
    lists:foreach(fun(X) -> check_elements2(X, S) end, Matches),
    case Continuation of
        '$end_of_table' ->
            ok;
        _ ->
            check_elements_chunks(ets:match(Continuation), S)
    end.
%% check_elements2(Match, S)
%%
%% Match is one result from the elem_def lookup in check_elements/2.
%% When it holds an #xmlElement{} record its attribute definitions are
%% checked; anything else is accepted, because declaring attributes
%% for an element that is not itself declared is not an error.
check_elements2(Match, S) ->
    case Match of
        [#xmlElement{attributes = Attrs}] ->
            check_attributes(Attrs, S);
        _ ->
            ok
    end.
%% check_attributes(AttrDefs, S) -> ok
%%
%% Runs the per-type checks on each attribute definition of an
%% element, in list order.
check_attributes([], _S) ->
    ok;
check_attributes([Attr | Rest], S) ->
    check_attribute_def(Attr, Rest, S),
    check_attributes(Rest, S).

%% Checks a single attribute definition.  Rest holds the definitions
%% that follow it in the same element.
%%   * 'ID'                  - a second 'ID' definition in Rest is a
%%                             fatal error; then
%%                             vc_ID_Attribute_Default/2 is applied
%%   * {enumeration, _}      - vc_Enumeration/2
%%   * 'ENTITY' / 'ENTITIES' - vc_Entity_Name/2
%%   * anything else         - no check
check_attribute_def({Name,'ID',_,_,_} = Attr, Rest, S) ->
    case lists:keysearch('ID', 2, Rest) of
        {value, Other} ->
            ?fatal({error_more_than_one_ID_def,Name,element(1,Other)}, S);
        _ ->
            ok
    end,
    vc_ID_Attribute_Default(Attr, S);
check_attribute_def({_,{enumeration,_},_,_,_} = Attr, _Rest, S) ->
    vc_Enumeration(Attr, S);
check_attribute_def({_,Type,_,_,_} = Attr, _Rest, S)
  when Type=='ENTITY'; Type=='ENTITIES' ->
    vc_Entity_Name(Attr, S);
check_attribute_def(_Attr, _Rest, _S) ->
    ok.
%% check_entities(Tab, S)
%%
%% Only active when validation is true.  Looks in the rules table Tab
%% for entities still marked as undeclared.  Returns ok when there are
%% none.  With exactly one match the entity name is reported in the
%% fatal error; any other match result is reported as is.
check_entities(Tab, S=#xmerl_scanner{validation=true}) ->
    case ets:match(Tab, {{entity,'$1'},undeclared}) of
        [[]] -> ok;
        [] -> ok;
        [[Name | _]] ->
            ?fatal({error_missing_declaration_in_DTD,Name}, S);
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err}, S)
    end;
check_entities(_, _) ->
    ok.
%% check_decl2/1
%% Verifies, through check_referenced_ids/2, that no ID recorded in
%% the rules table of the scanner is still marked as undeclared.
check_decl2(#xmerl_scanner{} = S) ->
    check_referenced_ids(S#xmerl_scanner.rules, S).
%% check_referenced_ids(Tab, S)
%%
%% Looks in the rules table Tab for IDs still marked as undeclared.
%% Returns ok when there are none.  With exactly one match the ID is
%% reported in the fatal error; any other match result is reported as
%% is.
check_referenced_ids(Tab, S) ->
    case ets:match(Tab, {{id,'$1'},undeclared}) of
        [[]] -> ok;
        [] -> ok;
        [[Id | _]] ->
            ?fatal({error_missing_declaration_in_DTD,Id}, S);
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err}, S)
    end.
%%%%%%% [30] extSubSet ::= TextDecl? extSubsetDecl
%%
%% scan_ext_subset(Str, S) -> {[], S1}
%% Scans an external subset until the input, including whatever the
%% continuation fun can supply, is exhausted.  Parameter entity
%% references, conditional sections, whitespace and markup
%% declarations are each handed to their own scanner, after which
%% scanning resumes here.
scan_ext_subset([], S=#xmerl_scanner{continuation_fun = Cont}) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_ext_subset(MoreBytes, S1) end,
    Done = fun(S1) -> {[], S1} end,
    Cont(Resume, Done, S);
scan_ext_subset("%" ++ T, S0) ->
    %% DeclSep [28a]: WFC: PE Between Declarations.
    %% The replacement text of a parameter entity reference in a
    %% DeclSep must match the production extSubsetDecl.
    ?bump_col(1),
    {_, AfterRef, S1} = scan_decl_sep(T, S),
    scan_ext_subset(AfterRef, S1);
scan_ext_subset("<![" ++ T, S0) ->
    ?bump_col(3),
    ?strip1,
    {_, AfterSect, S2} = scan_conditional_sect(T1, S1),
    scan_ext_subset(AfterSect, S2);
scan_ext_subset([H|_] = T, S) when ?whitespace(H) ->
    {_, Stripped, S1} = strip(T, S),
    scan_ext_subset(Stripped, S1);
scan_ext_subset(T, S) ->
    {_, AfterDecl, S1} = scan_markup_decl(T, S),
    scan_ext_subset(AfterDecl, S1).
%%%%%%% [28a] DeclSep ::= PEReference | S
%%
%% scan_decl_sep(Str, S) -> result of strip/2 on the remaining input
%%
%% Str is the input right after the "%" of a parameter entity
%% reference between declarations.  The reference is expanded and its
%% replacement text is scanned with scan_ext_subset/2 before scanning
%% of the original input continues.
%%
%% If the reference expands to a tuple (an external ID), the external
%% source is fetched with fetch_not_parse/2 and scanned as an entity
%% value.  The stored parameter entity is then replaced by that value
%% through the rules delete/write funs and read back, and the text is
%% padded with one space on each side.
scan_decl_sep(T, S=#xmerl_scanner{rules_read_fun=Read,
                                  rules_write_fun=Write,
                                  rules_delete_fun=Delete}) ->
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    {ExpandedRef, S2} =
        case expand_pe_reference(PERefName, S1, as_PE) of
            ExtId when is_tuple(ExtId) ->
                %% {system,URI} or {public,URI}
                {ExpRef, _Sx} = fetch_not_parse(ExtId, S1),
                {EntV, _, SEnt} = scan_entity_value(ExpRef, S1, no_delim,
                                                    PERefName, parameter),
                %% should do an update Write(parameter_entity) so next
                %% expand_pe_reference is faster
                Delete(parameter_entity, PERefName, SEnt),
                SWritten = Write(parameter_entity, PERefName, EntV, SEnt),
                EntV2 = Read(parameter_entity, PERefName, SWritten),
                {" " ++ EntV2 ++ " ", SWritten};
            ExpRef ->
                {ExpRef, S1}
        end,
    {_, T3, S3} = strip(ExpandedRef, S2),
    {_T4, S4} = scan_ext_subset(T3, S3),
    strip(T1, S4).
%%%%%%% [61] ConditionalSect ::= includeSect | ignoreSect
%%
%% scan_conditional_sect(Str, S) -> {[], Rest, S1}
%%
%% Str is the input after '<![' and optional whitespace.
%%   * "IGNORE"  - after optional whitespace and "[", the section is
%%                 skipped by scan_ignore/2.
%%   * "INCLUDE" - after optional whitespace and "[", the section is
%%                 scanned by scan_include/2.
%%   * "%"       - a parameter entity reference that is expanded and
%%                 put in front of the remaining input, after which
%%                 the keyword is looked for again.
%% A missing "[" after the keyword is reported as a fatal error
%% instead of a badmatch crash.
scan_conditional_sect([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_conditional_sect(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_conditional_sect("IGNORE" ++ T, S0) ->
    ?bump_col(6),
    ?strip1,
    case T1 of
        "[" ++ T2 ->
            {_, T3, S3} = strip(T2, S1),
            scan_ignore(T3, S3);
        _ ->
            ?fatal(expected_IGNORE_bracket, S1)
    end;
scan_conditional_sect("INCLUDE" ++ T, S0) ->
    ?bump_col(7),
    ?strip1,
    case T1 of
        "[" ++ T2 ->
            {_, T3, S3} = strip(T2, S1),
            scan_include(T3, S3);
        _ ->
            ?fatal(expected_INCLUDE_bracket, S1)
    end;
scan_conditional_sect("%" ++ T, S0) ->
    %% Only one character ("%") is consumed here, so the column is
    %% advanced once.
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1, as_PE),
    {_, T2, S2} = strip(ExpRef ++ T1, S1),
    scan_conditional_sect(T2, S2).
%%%% [63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>'
%%%% [64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)*
%%%% [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)
%%
%% scan_ignore(Str, S) -> {[], Rest, S1}
%% Skips the contents of an IGNORE section, up to and including the
%% "]]>" that closes it.
scan_ignore(Str, S) ->
    scan_ignore(Str, S, 0).

%% scan_ignore(Str, S, Level)
%% Level counts the nested '<![' sections that are currently open
%% inside the ignored one.  A "]]>" seen at level 0 ends the skipping;
%% at any other level it closes one nested section.
scan_ignore([], S=#xmerl_scanner{continuation_fun = Cont}, Level) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_ignore(MoreBytes, S1, Level) end,
    GiveUp = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, GiveUp, S);
scan_ignore("<![" ++ Rest, S0, Level) ->
    %% nested conditional section. Topmost condition is ignore, though
    ?bump_col(3),
    scan_ignore(Rest, S, Level+1);
scan_ignore("]]>" ++ Rest, S0, Level) ->
    ?bump_col(3),
    case Level of
        0 ->
            {[], Rest, S};
        _ ->
            scan_ignore(Rest, S, Level-1)
    end;
scan_ignore([_Skipped|Rest], S0, Level) ->
    ?bump_col(1),
    scan_ignore(Rest, S, Level).
%%%%%%% [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'
%%
%% scan_include(Str, S) -> {[], Rest, S1}
%% Scans the contents of an INCLUDE section until its closing "]]>".
%% Parameter entity references are expanded and put in front of the
%% remaining input, nested conditional sections go to
%% scan_conditional_sect/2, and everything else is read as a markup
%% declaration.
scan_include([], S=#xmerl_scanner{continuation_fun = Cont}) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_include(MoreBytes, S1) end,
    GiveUp = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, GiveUp, S);
scan_include("]]>" ++ Rest, S0) ->
    ?bump_col(3),
    {[], Rest, S};
scan_include("%" ++ T, S0) ->
    ?bump_col(1),
    {PERefName, AfterRef, S1} = scan_pe_reference(T, S),
    Expansion = expand_pe_reference(PERefName, S1, as_PE),
    {_, Stripped, S2} = strip(Expansion ++ AfterRef, S1),
    scan_include(Stripped, S2);
scan_include("<![" ++ T, S0) ->
    ?bump_col(3),
    ?strip1,
    {_, T2, S2} = scan_conditional_sect(T1, S1),
    ?strip3,
    scan_include(T3, S3);
scan_include(T, S) ->
    {_, AfterDecl, S1} = scan_markup_decl(T, S),
    scan_include(AfterDecl, S1).
- %%%%%%% [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl |
- %%%%%%% NotationDecl | PI |Comment
- %%%%%%% [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
- %% Validity constraint: Unique Type Declaration: No element type may be
- %% declared more than once.
- %%
- scan_markup_decl([], S=#xmerl_scanner{continuation_fun = F}) ->
- ?dbg("cont()...~n", []),
- F(fun(MoreBytes, S1) -> scan_markup_decl(MoreBytes, S1) end,
- fun(S1) -> {[], [], S1} end,
- S);
- scan_markup_decl("<!--" ++ T, S0) ->
- ?bump_col(4),
- {_, T1, S1} = scan_comment(T, S),
- ?strip2;
- scan_markup_decl("<?" ++ T, S0) ->
- ?bump_col(2),
- {_PI, T1, S1} = scan_pi(T, S,_Pos=markup),
- ?strip2;
- scan_markup_decl("<!ELEMENT" ++ T,
- #xmerl_scanner{rules_read_fun = Read,
- rules_write_fun = Write,
- rules_delete_fun = Delete} = S0) ->
- ?bump_col(9),
- {_,T1,S1} = mandatory_strip(T,S),
- {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
- Element =
- case Read(elem_def, Ename, S2) of
- El = #xmlElement{elementdef=Decl} when Decl /= undeclared ->
- case S2#xmerl_scanner.validation of
- true ->
- ?fatal({already_defined, Ename}, S2);
- _ ->
- Delete(elem_def,Ename,S2),
- El
- end;
- El = #xmlElement{} ->
- Delete(elem_def,Ename,S2),
- El;
- undefined ->
- #xmlElement{}
- …
Large files files are truncated, but you can click here to view the full file