PageRenderTime 54ms CodeModel.GetById 7ms RepoModel.GetById 1ms app.codeStats 1ms

/lib/xmerl/src/xmerl_scan.erl

https://github.com/bmizerany/jungerl
Erlang | 3778 lines | 2842 code | 378 blank | 558 comment | 32 complexity | 573894cf37ee0fe26e2519ee216f684f MD5 | raw file
Possible License(s): LGPL-2.1, BSD-3-Clause, AGPL-1.0

Large files files are truncated, but you can click here to view the full file

  1. %%% The contents of this file are subject to the Erlang Public License,
  2. %%% Version 1.0, (the "License"); you may not use this file except in
  3. %%% compliance with the License. You may obtain a copy of the License at
  4. %%% http://www.erlang.org/license/EPL1_0.txt
  5. %%%
  6. %%% Software distributed under the License is distributed on an "AS IS"
  7. %%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
  8. %%% the License for the specific language governing rights and limitations
  9. %%% under the License.
  10. %%%
  11. %%% The Original Code is xmerl-0.15
  12. %%%
  13. %%% The Initial Developer of the Original Code is Ericsson Telecom
  14. %%% AB. Portions created by Ericsson are Copyright (C), 1998, Ericsson
  15. %%% Telecom AB. All Rights Reserved.
  16. %%%
  17. %%% Contributor(s):
  18. %%% Mickael Remond <mickael.remond@IDEALX.com>:
  19. %%% Johan Blom <johan.blom@mobilearts.se>
  20. %%% Richard Carlsson
  21. %%% Fredrik Linder
  22. %%%
  23. %%%----------------------------------------------------------------------
  24. %%% #0. BASIC INFORMATION
  25. %%%----------------------------------------------------------------------
  26. %%% File: xmerl_scan.erl
  27. %%% Author : Ulf Wiger <ulf.wiger@ericsson.com>
  28. %%% Description : Single-pass XML scanner. See xmerl.hrl for data defs.
  29. %%%
  30. %%% Modules used : ets, file, filename, io, lists, ucs, uri
  31. %%%
  32. %%%----------------------------------------------------------------------
  33. %% @doc
  34. %% The XML parser is activated through
  35. %% <tt>xmerl_scan:string/[1,2]</tt> or
  36. %% <tt>xmerl_scan:file/[1,2]</tt>.
  37. %% It returns records of the type defined in xmerl.hrl.
  38. %% See also <a href="xmerl_examples.html">tutorial</a> on customization
  39. %% functions.
  40. %% @type global_state(). <p>
  41. %% The global state of the scanner, represented by the #xmerl_scanner{} record.
  42. %% </p>
  43. %% @type option_list(). <p>Options allow customizing the behaviour of the
  44. %% scanner.
  45. %% See also <a href="xmerl_examples.html">tutorial</a> on customization
  46. %% functions.
  47. %% </p>
  48. %% Possible options are:
  49. %% <dl>
  50. %% <dt><code>{acc_fun, Fun}</code></dt>
  51. %% <dd>Call back function to accumulate contents of entity.</dd>
  52. %% <dt><code>{continuation_fun, Fun} |
  53. %% {continuation_fun, Fun, ContinuationState}</code></dt>
  54. %% <dd>Call back function to decide what to do if the scanner runs into eof
  55. %% before the document is complete.</dd>
  56. %% <dt><code>{event_fun, Fun} |
  57. %% {event_fun, Fun, EventState}</code></dt>
  58. %% <dd>Call back function to handle scanner events.</dd>
  59. %% <dt><code>{fetch_fun, Fun} |
  60. %% {fetch_fun, Fun, FetchState}</code></dt>
  61. %% <dd>Call back function to fetch an external resource.</dd>
  62. %% <dt><code>{hook_fun, Fun} |
  63. %% {hook_fun, Fun, HookState}</code></dt>
  64. %% <dd>Call back function to process the document entities once
  65. %% identified.</dd>
  66. %% <dt><code>{close_fun, Fun}</code></dt>
  67. %% <dd>Called when document has been completely parsed.</dd>
  68. %% <dt><code>{rules, ReadFun, WriteFun, RulesState} |
  69. %% {rules, Rules}</code></dt>
  70. %% <dd>Handles storing of scanner information when parsing.</dd>
  71. %% <dt><code>{user_state, UserState}</code></dt>
  72. %% <dd>Global state variable accessible from all customization functions</dd>
  73. %%
  74. %% <dt><code>{fetch_path, PathList}</code></dt>
  75. %% <dd>PathList is a list of
  76. %% directories to search when fetching files. If the file in question
  77. %% is not in the fetch_path, the URI will be used as a file
  78. %% name.</dd>
  79. %% <dt><code>{space, Flag}</code></dt>
  80. %% <dd>'preserve' (default) to preserve spaces, 'normalize' to
  81. %% accumulate consecutive whitespace and replace it with one space.</dd>
  82. %% <dt><code>{line, Line}</code></dt>
  83. %% <dd>To specify starting line for scanning in document which contains
  84. %% fragments of XML.</dd>
  85. %% <dt><code>{namespace_conformant, Flag}</code></dt>
  86. %% <dd>Controls whether to behave as a namespace conformant XML parser,
  87. %% 'false' (default) to not otherwise 'true'.</dd>
  88. %% <dt><code>{validation, Flag}</code></dt>
  89. %% <dd>Controls whether to process as a validating XML parser,
  90. %% 'false' (default) to not otherwise 'true'.</dd>
  91. %% <dt><code>{quiet, Flag}</code></dt>
  92. %% <dd>Set to 'true' if xmerl should behave quietly and not output any info
  93. %% to standard output (default 'false').</dd>
  94. %% <dt><code>{doctype_DTD, DTD}</code></dt>
  95. %% <dd>Allows to specify DTD name when it isn't available in the XML
  96. %% document.</dd>
  97. %% <dt><code>{xmlbase, Dir}</code></dt>
  98. %% <dd>XML Base directory. If using string/1 default is current directory.
  99. %% If using file/1 default is directory of given file.</dd>
  100. %% <dt><code>{encoding, Enc}</code></dt>
  101. %% <dd>Set default character set used (default UTF-8).
  102. %% This character set is used only if not explicitly given by the XML
  103. %% declaration. </dd>
  104. %% </dl>
  105. %% @end
  106. %% Only used internally are:
  107. %% <dt><code>{environment,Env}</code></dt>
  108. %% <dd>What is this?</dd>
  109. %% <dt><code>{text_decl,Bool}</code></dt>
  110. %% <dd>What is this?</dd>
  111. -module(xmerl_scan).
  112. -vsn('0.19').
  113. -date('03-09-16').
  114. %% main API
  115. -export([string/1, string/2,
  116. file/1, file/2]).
  117. %% access functions for various states
  118. -export([user_state/1, user_state/2,
  119. event_state/1, event_state/2,
  120. hook_state/1, hook_state/2,
  121. rules_state/1, rules_state/2,
  122. fetch_state/1, fetch_state/2,
  123. cont_state/1, cont_state/2]).
  124. %% helper functions. To xmerl_lib ??
  125. -export([accumulate_whitespace/4]).
  126. %-define(debug, 1).
  127. -include("xmerl.hrl"). % record def, macros
  128. -include_lib("kernel/include/file.hrl").
  129. -define(fatal(Reason, S),
  130. if
  131. S#xmerl_scanner.quiet ->
  132. ok;
  133. true ->
  134. ok=io:format("~p- fatal: ~p~n", [?LINE, Reason])
  135. end,
  136. fatal(Reason, S)).
  137. -define(ustate(U, S), S#xmerl_scanner{user_state = U}).
  138. %% Functions to access the various states
  139. %%% @spec user_state(S::global_state()) -> global_state()
  140. %%% @equiv user_state(UserState,S)
  141. user_state(#xmerl_scanner{user_state = S}) -> S.
  142. %%% @spec event_state(S::global_state()) -> global_state()
  143. %%% @equiv event_state(EventState,S)
  144. event_state(#xmerl_scanner{fun_states = #xmerl_fun_states{event = S}}) -> S.
  145. %%% @spec hook_state(S::global_state()) -> global_state()
  146. %%% @equiv hook_state(HookState,S)
  147. hook_state(#xmerl_scanner{fun_states = #xmerl_fun_states{hook = S}}) -> S.
  148. %%% @spec rules_state(S::global_state()) -> global_state()
  149. %%% @equiv rules_state(RulesState,S)
  150. rules_state(#xmerl_scanner{fun_states = #xmerl_fun_states{rules = S}}) -> S.
  151. %%% @spec fetch_state(S::global_state()) -> global_state()
  152. %%% @equiv fetch_state(FetchState,S)
  153. fetch_state(#xmerl_scanner{fun_states = #xmerl_fun_states{fetch = S}}) -> S.
  154. %%% @spec cont_state(S::global_state()) -> global_state()
  155. %%% @equiv cont_state(ContinuationState,S)
  156. cont_state(#xmerl_scanner{fun_states = #xmerl_fun_states{cont = S}}) -> S.
  157. %%%% Functions to modify the various states
  158. %%% @spec user_state(UserState, S::global_state()) -> global_state()
  159. %%% @doc For controlling the UserState, to be used in a user function.
  160. %%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
  161. user_state(X, S) ->
  162. S#xmerl_scanner{user_state = X}.
  163. %%% @spec event_state(EventState, S::global_state()) -> global_state()
  164. %%% @doc For controlling the EventState, to be used in an event
  165. %%% function, and called at the beginning and at the end of a parsed entity.
  166. %%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
  167. event_state(X, S=#xmerl_scanner{fun_states = FS}) ->
  168. FS1 = FS#xmerl_fun_states{event = X},
  169. S#xmerl_scanner{fun_states = FS1}.
  170. %%% @spec hook_state(HookState, S::global_state()) -> global_state()
  171. %%% @doc For controlling the HookState, to be used in a hook
  172. %%% function, and called when the parser has parsed a complete entity.
  173. %%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
  174. hook_state(X, S=#xmerl_scanner{fun_states = FS}) ->
  175. FS1 = FS#xmerl_fun_states{hook = X},
  176. S#xmerl_scanner{fun_states = FS1}.
  177. %%% @spec rules_state(RulesState, S::global_state()) -> global_state()
  178. %%% @doc For controlling the RulesState, to be used in a rules
  179. %%% function, and called when the parser store scanner information in a rules
  180. %%% database.
  181. %%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
  182. rules_state(X, S=#xmerl_scanner{fun_states = FS}) ->
  183. FS1 = FS#xmerl_fun_states{rules = X},
  184. S#xmerl_scanner{fun_states = FS1}.
  185. %%% @spec fetch_state(FetchState, S::global_state()) -> global_state()
  186. %%% @doc For controlling the FetchState, to be used in a fetch
  187. %%% function, and called when the parser fetch an external resource (eg. a DTD).
  188. %%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
  189. fetch_state(X, S=#xmerl_scanner{fun_states = FS}) ->
  190. FS1 = FS#xmerl_fun_states{fetch = X},
  191. S#xmerl_scanner{fun_states = FS1}.
  192. %%% @spec cont_state(ContinuationState, S::global_state()) -> global_state()
  193. %%% @doc For controlling the ContinuationState, to be used in a continuation
  194. %%% function, and called when the parser encounters the end of the byte stream.
  195. %%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
  196. cont_state(X, S=#xmerl_scanner{fun_states = FS}) ->
  197. FS1 = FS#xmerl_fun_states{cont = X},
  198. S#xmerl_scanner{fun_states = FS1}.
  199. %% @spec file(Filename::string()) -> {xmlElement(),Rest}
  200. %% Rest = list()
  201. %% @equiv file(Filename, [])
  202. file(F) ->
  203. file(F, []).
  204. %% @spec file(Filename::string(), Options::option_list()) -> {xmlElement(),Rest}
  205. %% Rest = list()
  206. %%% @doc Parse file containing an XML document
  207. file(F, Options) ->
  208. ExtCharset=case lists:keysearch(encoding,1,Options) of
  209. {value,{_,Val}} -> Val;
  210. false -> undefined
  211. end,
  212. case int_file(F,Options,ExtCharset) of
  213. {Res, Tail,S=#xmerl_scanner{close_fun=Close}} ->
  214. Close(S), % for side effects only - final state is dropped
  215. {Res,Tail};
  216. {error, Reason} ->
  217. {error, Reason};
  218. Other ->
  219. {error, Other}
  220. end.
  221. int_file(F, Options,_ExtCharset) ->
  222. % io:format("int_file F=~p~n",[F]),
  223. case file:read_file(F) of
  224. {ok, Bin} ->
  225. int_string(binary_to_list(Bin), Options, filename:dirname(F));
  226. Error ->
  227. Error
  228. end.
  229. int_file_decl(F, Options,_ExtCharset) ->
  230. % io:format("int_file_decl F=~p~n",[F]),
  231. case file:read_file(F) of
  232. {ok, Bin} ->
  233. int_string_decl(binary_to_list(Bin), Options, filename:dirname(F));
  234. Error ->
  235. Error
  236. end.
  237. %% @spec string(Text::list()) -> {xmlElement(),Rest}
  238. %% Rest = list()
  239. %% @equiv string(Test, [])
  240. string(Str) ->
  241. string(Str, []).
  242. %% @spec string(Text::list(),Options::option_list()) -> {xmlElement(),Rest}
  243. %% Rest = list()
  244. %%% @doc Parse string containing an XML document
  245. string(Str, Options) ->
  246. case int_string(Str, Options) of
  247. {Res, Tail, S=#xmerl_scanner{close_fun = Close}} ->
  248. Close(S), % for side effects only - final state is dropped
  249. {Res,Tail};
  250. {error, Reason} ->
  251. {error, Reason}; % (This can't happen, currently)
  252. Other ->
  253. {error, Other}
  254. end.
  255. int_string(Str, Options) ->
  256. {ok, XMLBase} = file:get_cwd(),
  257. int_string(Str, Options, XMLBase).
  258. int_string(Str, Options, XMLBase) ->
  259. S=initial_state0(Options,XMLBase),
  260. case xmerl_lib:detect_charset(S#xmerl_scanner.encoding,Str) of
  261. {auto,'iso-10646-utf-1',Str2} ->
  262. scan_document(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"});
  263. {external,'iso-10646-utf-1',Str2} ->
  264. scan_document(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"});
  265. {undefined,undefined,Str2} ->
  266. scan_document(Str2, S);
  267. {external,ExtCharset,Str2} ->
  268. scan_document(Str2, S#xmerl_scanner{encoding=atom_to_list(ExtCharset)})
  269. end.
  270. int_string_decl(Str, Options, XMLBase) ->
  271. S=initial_state0(Options,XMLBase),
  272. case xmerl_lib:detect_charset(S#xmerl_scanner.encoding,Str) of
  273. {auto,'iso-10646-utf-1',Str2} ->
  274. scan_decl(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"});
  275. {external,'iso-10646-utf-1',Str2} ->
  276. scan_decl(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"});
  277. {undefined,undefined,Str2} ->
  278. scan_decl(Str2, S);
  279. {external,ExtCharset,Str2} ->
  280. scan_decl(Str2, S#xmerl_scanner{encoding=atom_to_list(ExtCharset)})
  281. end.
  282. initial_state0(Options,XMLBase) ->
  283. initial_state(Options, #xmerl_scanner{
  284. event_fun = fun event/2,
  285. hook_fun = fun hook/2,
  286. acc_fun = fun acc/3,
  287. fetch_fun = fun fetch/2,
  288. close_fun = fun close/1,
  289. continuation_fun = fun cont/3,
  290. rules_read_fun = fun rules_read/3,
  291. rules_write_fun = fun rules_write/4,
  292. rules_delete_fun= fun rules_delete/3,
  293. xmlbase = XMLBase
  294. }).
  295. initial_state([{event_fun, F}|T], S) ->
  296. initial_state(T, S#xmerl_scanner{event_fun = F});
  297. initial_state([{event_fun, F, ES}|T], S) ->
  298. S1 = event_state(ES, S#xmerl_scanner{event_fun = F}),
  299. initial_state(T, S1);
  300. initial_state([{acc_fun, F}|T], S) ->
  301. initial_state(T, S#xmerl_scanner{acc_fun = F});
  302. initial_state([{hook_fun, F}|T], S) ->
  303. initial_state(T, S#xmerl_scanner{hook_fun = F});
  304. initial_state([{hook_fun, F, HS}|T], S) ->
  305. S1 = hook_state(HS, S#xmerl_scanner{hook_fun = F}),
  306. initial_state(T, S1);
  307. initial_state([{close_fun, F}|T], S) ->
  308. initial_state(T, S#xmerl_scanner{close_fun = F});
  309. initial_state([{fetch_fun, F}|T], S) ->
  310. initial_state(T, S#xmerl_scanner{fetch_fun = F});
  311. initial_state([{fetch_fun, F, FS}|T], S) ->
  312. S1 = fetch_state(FS, S#xmerl_scanner{fetch_fun = F}),
  313. initial_state(T, S1);
  314. initial_state([{fetch_path, P}|T], S) ->
  315. initial_state(T, S#xmerl_scanner{fetch_path = P});
  316. initial_state([{continuation_fun, F}|T], S) ->
  317. initial_state(T, S#xmerl_scanner{continuation_fun = F});
  318. initial_state([{continuation_fun, F, CS}|T], S) ->
  319. S1 = cont_state(CS, S#xmerl_scanner{continuation_fun = F}),
  320. initial_state(T, S1);
  321. initial_state([{rules, R}|T], S) ->
  322. initial_state(T, S#xmerl_scanner{rules = R,
  323. keep_rules = true});
  324. initial_state([{rules, Read, Write, RS}|T], S) ->
  325. S1 = rules_state(RS, S#xmerl_scanner{rules_read_fun = Read,
  326. rules_write_fun = Write,
  327. keep_rules = true}),
  328. initial_state(T, S1);
  329. initial_state([{user_state, F}|T], S) ->
  330. initial_state(T, S#xmerl_scanner{user_state = F});
  331. initial_state([{space, L}|T], S) ->
  332. initial_state(T, S#xmerl_scanner{space = L});
  333. initial_state([{line, L}|T], S) ->
  334. initial_state(T, S#xmerl_scanner{line = L});
  335. initial_state([{namespace_conformant, F}|T], S) when F==true; F==false ->
  336. initial_state(T, S#xmerl_scanner{namespace_conformant = F});
  337. initial_state([{validation, F}|T], S) when F==true; F==false ->
  338. initial_state(T, S#xmerl_scanner{validation = F});
  339. initial_state([{quiet, F}|T], S) when F==true; F==false ->
  340. initial_state(T, S#xmerl_scanner{quiet = F});
  341. initial_state([{doctype_DTD,DTD}|T], S) ->
  342. initial_state(T,S#xmerl_scanner{doctype_DTD = DTD});
  343. initial_state([{text_decl,Bool}|T], S) ->
  344. initial_state(T,S#xmerl_scanner{text_decl=Bool});
  345. initial_state([{environment,Env}|T], S) ->
  346. initial_state(T,S#xmerl_scanner{environment=Env});
  347. initial_state([{xmlbase, D}|T], S) ->
  348. initial_state(T, S#xmerl_scanner{xmlbase = D});
  349. initial_state([{encoding, Enc}|T], S) ->
  350. initial_state(T, S#xmerl_scanner{encoding = Enc});
  351. initial_state([], S=#xmerl_scanner{rules = undefined}) ->
  352. Tab = ets:new(rules, [set, public]),
  353. S#xmerl_scanner{rules = Tab};
  354. initial_state([], S) ->
  355. S.
  356. %%% -----------------------------------------------------
  357. %%% Default modifier functions
  358. %%% Hooks:
  359. %%% - {element, Line, Name, Attrs, Content}
  360. %%% - {processing_instruction, Line, Data}
  361. hook(X, State) ->
  362. {X, State}.
  363. %%% Events:
  364. %%%
  365. %%% #xmerl_event{event : started | ended,
  366. %%% line : integer(),
  367. %%% col : integer(),
  368. %%% data}
  369. %%%
  370. %%% Data Events
  371. %%% document started, ended
  372. %%% #xmlElement started, ended
  373. %%% #xmlAttribute ended
  374. %%% #xmlPI ended
  375. %%% #xmlComment ended
  376. %%% #xmlText ended
  377. event(_X, S) ->
  378. S.
  379. %% The acc/3 function can return either {Acc´, S'} or {Acc', Pos', S'},
  380. %% where Pos' can be derived from X#xmlElement.pos, X#xmlText.pos, or
  381. %% X#xmlAttribute.pos (whichever is the current object type.)
  382. %% The acc/3 function is not allowed to redefine the type of object
  383. %% being defined, but _is_ allowed to either ignore it or split it
  384. %% into multiple objects (in which case {Acc',Pos',S'} should be returned.)
  385. %% If {Acc',S'} is returned, Pos will be incremented by 1 by default.
  386. %% Below is an example of an acceptable operation
  387. acc(X = #xmlText{value = Text}, Acc, S) ->
  388. {[X#xmlText{value = lists:flatten(Text)}|Acc], S};
  389. acc(X, Acc, S) ->
  390. {[X|Acc], S}.
  391. fetch({system, URI}, S) ->
  392. fetch_URI(URI, S);
  393. fetch({public, _PublicID, URI}, S) ->
  394. fetch_URI(URI, S).
  395. %%% Always assume an external resource can be found locally! Thus
  396. %%% don't bother fetching with e.g. HTTP. Returns the path where the
  397. %%% resource is found. The path to the external resource is given by
  398. %%% URI directly or the option fetch_path (additional paths) or
  399. %%% directory (base path to external resource)
  400. fetch_URI(URI, S) ->
  401. %% assume URI is a filename
  402. Split = filename:split(URI),
  403. Filename = fun([])->[];(X)->lists:last(X) end (Split),
  404. Fullname =
  405. case Split of %% how about Windows systems?
  406. ["file:"|Name]-> %% absolute path, see RFC2396 sect 3
  407. %% file:/dtd_name
  408. filename:join(["/"|Name]);
  409. ["/"|Rest] when Rest /= [] ->
  410. %% absolute path name
  411. URI;
  412. ["http:"|_Rest] ->
  413. {http,URI};
  414. [] -> %% empty systemliteral
  415. [];
  416. _ ->
  417. filename:join(S#xmerl_scanner.xmlbase, URI)
  418. end,
  419. Path = path_locate(S#xmerl_scanner.fetch_path, Filename, Fullname),
  420. ?dbg("fetch(~p) -> {file, ~p}.~n", [URI, Path]),
  421. {ok, Path, S}.
  422. path_locate(_, _, {http,_}=URI) ->
  423. URI;
  424. path_locate(_, _, []) ->
  425. [];
  426. path_locate([Dir|Dirs], FN, FullName) ->
  427. F = filename:join(Dir, FN),
  428. case file:read_file_info(F) of
  429. {ok, #file_info{type = regular}} ->
  430. {file,F};
  431. _ ->
  432. path_locate(Dirs, FN, FullName)
  433. end;
  434. path_locate([], _FN, FullName) ->
  435. {file,FullName}.
  436. cont(_F, Exception, US) ->
  437. Exception(US).
  438. close(S) ->
  439. S.
  440. %%% -----------------------------------------------------
  441. %%% Scanner
  442. %%% [1] document ::= prolog element Misc*
  443. scan_document(Str0, S=#xmerl_scanner{event_fun = Event,
  444. line = L, col = C,
  445. environment=Env,
  446. encoding=Charset,
  447. validation=ValidateResult}) ->
  448. S1 = Event(#xmerl_event{event = started,
  449. line = L,
  450. col = C,
  451. data = document}, S),
  452. %% Transform to given character set.
  453. %% Note that if another character set is given in the encoding
  454. %% attribute in a XML declaration that one will be used later
  455. Str=if
  456. Charset=/=undefined -> % Default character set is UTF-8
  457. ucs:to_unicode(Str0,list_to_atom(Charset));
  458. true ->
  459. Str0
  460. end,
  461. {"<"++T2, S2} = scan_prolog(Str, S1, _StartPos = 1),
  462. {Res, T3, S3} =scan_element(T2,S2,_StartPos = 1),
  463. {Tail, S4}=scan_misc(T3, S3, _StartPos = 1),
  464. S5 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
  465. line = S4#xmerl_scanner.line,
  466. col = S4#xmerl_scanner.col,
  467. data = document}, S4),
  468. S6 = case ValidateResult of
  469. false ->
  470. cleanup(S5);
  471. true when Env == element; Env == prolog ->
  472. check_decl2(S5),
  473. case xmerl_validate:validate(S5,Res) of
  474. {'EXIT',{error,Reason}} ->
  475. S5b=cleanup(S5),
  476. ?fatal({failed_validation,Reason}, S5b);
  477. {'EXIT',Reason} ->
  478. S5b=cleanup(S5),
  479. ?fatal({failed_validation,Reason}, S5b);
  480. {error,Reason} ->
  481. S5b=cleanup(S5),
  482. ?fatal({failed_validation,Reason}, S5b);
  483. {error,Reason,_Next} ->
  484. S5b=cleanup(S5),
  485. ?fatal({failed_validation,Reason}, S5b);
  486. _XML ->
  487. cleanup(S5)
  488. end;
  489. true ->
  490. cleanup(S5)
  491. end,
  492. {Res, Tail, S6}.
  493. scan_decl(Str, S=#xmerl_scanner{event_fun = Event,
  494. line = L, col = C,
  495. environment=_Env,
  496. encoding=_Charset,
  497. validation=_ValidateResult}) ->
  498. S1 = Event(#xmerl_event{event = started,
  499. line = L,
  500. col = C,
  501. data = document}, S),
  502. case scan_prolog(Str, S1, _StartPos = 1) of
  503. {T2="<"++_, S2} ->
  504. {{S2#xmerl_scanner.user_state,T2},[],S2};
  505. {[], S2}->
  506. {[],[],S2};
  507. {T2, S2} ->
  508. {_,_,S3} = scan_content(T2,S2,[],_Attrs=[],S2#xmerl_scanner.space,
  509. _Lang=[],_Parents=[],#xmlNamespace{}),
  510. {T2,[],S3}
  511. end.
  512. %%% [22] Prolog
  513. %%% prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
  514. %%%
  515. %% Text declaration may be empty
  516. scan_prolog([], S=#xmerl_scanner{text_decl=true},_Pos) ->
  517. {[],S};
  518. scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
  519. ?dbg("cont()...~n", []),
  520. F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos) end,
  521. fun(S1) -> {[], S1} end,
  522. S);
  523. scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos)
  524. when ?whitespace(hd(T)) ->
  525. {Charset,T3, S3}=
  526. if
  527. Col==1,L==1,S0#xmerl_scanner.text_decl==true ->
  528. ?dbg("prolog(\"<?xml\")~n", []),
  529. ?bump_col(5),
  530. {_,T1,S1} = mandatory_strip(T,S0),
  531. {Decl,T2, S2}=scan_text_decl(T1,S1),
  532. Encoding=Decl#xmlDecl.encoding,
  533. {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}};
  534. Col==1,L==1 ->
  535. ?dbg("prolog(\"<?xml\")~n", []),
  536. ?bump_col(5),
  537. {Decl,T2, S2}=scan_xml_decl(T, S0),
  538. Encoding=Decl#xmlDecl.encoding,
  539. {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}};
  540. true ->
  541. ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S0)
  542. end,
  543. %% Now transform to declared character set.
  544. if
  545. Charset==Charset0 -> % Document already transformed to this charset!
  546. scan_prolog(T3, S3, Pos);
  547. Charset0=/=undefined -> % Document transformed to other bad charset!
  548. ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S3);
  549. Charset=/=undefined -> % Document not previously transformed
  550. T4=ucs:to_unicode(T3,list_to_atom(Charset)),
  551. scan_prolog(T4, S3, Pos);
  552. true -> % No encoding info given
  553. scan_prolog(T3, S3, Pos)
  554. end;
  555. scan_prolog("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog,
  556. encoding=Charset}, Pos) ->
  557. ?dbg("prolog(\"<!DOCTYPE\")~n", []),
  558. ?bump_col(9),
  559. %% If no known character set assume it is UTF-8
  560. T1=if
  561. Charset==undefined -> ucs:to_unicode(T,'utf-8');
  562. true -> T
  563. end,
  564. {T2, S1} = scan_doctype(T1, S),
  565. scan_misc(T2, S1, Pos);
  566. scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=Charset},Pos) ->
  567. ?dbg("prolog(\"<\")~n", []),
  568. %% Check for Comments, PI before possible DOCTYPE declaration
  569. ?bump_col(1),
  570. %% If no known character set assume it is UTF-8
  571. T=if
  572. Charset==undefined -> ucs:to_unicode(Str,'utf-8');
  573. true -> Str
  574. end,
  575. {T1, S1}=scan_misc(T, S, Pos),
  576. scan_prolog2(T1,S1,Pos).
  577. scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
  578. ?dbg("cont()...~n", []),
  579. F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos) end,
  580. fun(S1) -> {[], S1} end,
  581. S);
  582. scan_prolog2("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog}, Pos) ->
  583. ?dbg("prolog(\"<!DOCTYPE\")~n", []),
  584. ?bump_col(9),
  585. {T1, S1} = scan_doctype(T, S),
  586. scan_misc(T1, S1, Pos);
  587. scan_prolog2(Str = "<!" ++ _, S, _Pos) ->
  588. ?dbg("prolog(\"<!\")~n", []),
  589. %% In e.g. a DTD, we jump directly to markup declarations
  590. scan_ext_subset(Str, S);
  591. scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos) ->
  592. ?dbg("prolog(\"<\")~n", []),
  593. %% Check for more Comments and PI after DOCTYPE declaration
  594. ?bump_col(1),
  595. scan_misc(Str, S, Pos).
  596. %%% [27] Misc ::= Comment | PI | S
  597. %% Note:
  598. %% - Neither of Comment and PI are returned in the resulting parsed
  599. %% structure.
  600. %% - scan_misc/3 implements Misc* as that is how the rule is always used
  601. scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
  602. ?dbg("cont()...~n", []),
  603. F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos) end,
  604. fun(S1) -> {[], S1} end,
  605. S);
  606. scan_misc("<!--" ++ T, S, Pos) -> % Comment
  607. {_, T1, S1} = scan_comment(T, S, Pos, _Parents = [], _Lang = []),
  608. scan_misc(T1,S1,Pos);
  609. scan_misc("<?" ++ T, S0, Pos) -> % PI
  610. ?dbg("prolog(\"<?\")~n", []),
  611. ?bump_col(2),
  612. {_PI, T1, S1} = scan_pi(T, S, Pos),
  613. scan_misc(T1,S1,Pos);
  614. scan_misc([H|T], S, Pos) when ?whitespace(H) ->
  615. ?dbg("prolog(whitespace)~n", []),
  616. scan_misc(T,S,Pos);
  617. scan_misc(T,S,_Pos) ->
  618. {T,S}.
  619. cleanup(S=#xmerl_scanner{keep_rules = false,
  620. rules = Rules}) ->
  621. ets:delete(Rules),
  622. S#xmerl_scanner{rules = undefined};
  623. cleanup(S) ->
  624. S.
  625. %%% Prolog and Document Type Declaration XML 1.0 Section 2.8
  626. %% [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
  627. %% [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
  628. scan_xml_decl(T, S) ->
  629. %% VersionInfo [24] is mandatory
  630. {_,T1,S2} = mandatory_strip(T,S),
  631. "version" ++ T2 = T1,
  632. {T3, S3} = scan_eq(T2, S2),
  633. {Vsn, T4, S4} = scan_xml_vsn(T3, S3),
  634. Attr = #xmlAttribute{name = version,
  635. parents = [{xml, _XMLPos = 1}],
  636. value = Vsn},
  637. scan_xml_decl(T4, S4, #xmlDecl{attributes = [Attr]}).
  638. scan_xml_decl([], S=#xmerl_scanner{continuation_fun = F}, Decl) ->
  639. ?dbg("cont()...~n", []),
  640. F(fun(MoreBytes, S1) -> scan_xml_decl(MoreBytes, S1, Decl) end,
  641. fun(S1) -> {[], [], S1} end,
  642. S);
  643. scan_xml_decl("?>" ++ T, S0, Decl) ->
  644. ?bump_col(2),
  645. return_xml_decl(T,S,Decl);
  646. scan_xml_decl(T,S=#xmerl_scanner{event_fun = _Event},Decl) ->
  647. {_,T1,S1}=mandatory_strip(T,S),
  648. scan_xml_decl2(T1,S1,Decl).
  649. scan_xml_decl2("?>" ++ T, S0,Decl) ->
  650. ?bump_col(2),
  651. return_xml_decl(T,S,Decl);
  652. scan_xml_decl2("encoding" ++ T, S0 = #xmerl_scanner{event_fun = Event},
  653. Decl0 = #xmlDecl{attributes = Attrs}) ->
  654. %% [80] EncodingDecl
  655. ?bump_col(8),
  656. {T1, S1} = scan_eq(T, S),
  657. {EncName, T2, S2} = scan_enc_name(T1, S1),
  658. LowEncName=httpd_util:to_lower(EncName),
  659. Attr = #xmlAttribute{name = encoding,
  660. parents = [{xml, _XMLPos = 1}],
  661. value = LowEncName},
  662. Decl = Decl0#xmlDecl{encoding = LowEncName,
  663. attributes = [Attr|Attrs]},
  664. S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
  665. line = S0#xmerl_scanner.line,
  666. col = S0#xmerl_scanner.col,
  667. data = Attr}, S2),
  668. case T2 of
  669. "?>" ++ _T3 ->
  670. scan_xml_decl3(T2,S3,Decl);
  671. _ ->
  672. {_,T3,S4} = mandatory_strip(T2,S3),
  673. scan_xml_decl3(T3, S4, Decl)
  674. end;
  675. scan_xml_decl2(T="standalone" ++ _T,S,Decl) ->
  676. scan_xml_decl3(T,S,Decl).
  677. scan_xml_decl3("?>" ++ T, S0,Decl) ->
  678. ?bump_col(2),
  679. return_xml_decl(T,S,Decl);
  680. scan_xml_decl3("standalone" ++ T,S0 = #xmerl_scanner{event_fun = Event},
  681. Decl0 = #xmlDecl{attributes = Attrs}) ->
  682. %% [32] SDDecl
  683. ?bump_col(10),
  684. {T1, S1} = scan_eq(T, S),
  685. {StValue,T2,S2}=scan_standalone_value(T1,S1),
  686. Attr = #xmlAttribute{name = standalone,
  687. parents = [{xml, _XMLPos = 1}],
  688. value = StValue},
  689. Decl = Decl0#xmlDecl{standalone = StValue,
  690. attributes = [Attr|Attrs]},
  691. S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
  692. line = S0#xmerl_scanner.line,
  693. col = S0#xmerl_scanner.col,
  694. data = Attr}, S2),
  695. {_,T3,S4} = strip(T2,S3),
  696. "?>" ++ T4 = T3,
  697. return_xml_decl(T4, S4#xmerl_scanner{col=S4#xmerl_scanner.col+2}, Decl).
  698. return_xml_decl(T,S=#xmerl_scanner{hook_fun = Hook,
  699. event_fun = Event},
  700. Decl0 = #xmlDecl{attributes = Attrs}) ->
  701. ?strip1,
  702. Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
  703. S2 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
  704. line = S#xmerl_scanner.line,
  705. col = S#xmerl_scanner.col,
  706. data = Decl}, S1),
  707. {Ret, S3} = Hook(Decl, S2),
  708. {Ret, T1, S3}.
  709. scan_standalone_value("'yes'" ++T,S0)->
  710. ?bump_col(5),
  711. {'yes',T,S#xmerl_scanner{standalone=yes}};
  712. scan_standalone_value("\"yes\"" ++T,S0)->
  713. ?bump_col(5),
  714. {'yes',T,S#xmerl_scanner{standalone=yes}};
  715. scan_standalone_value("'no'" ++T,S0) ->
  716. ?bump_col(4),
  717. {'no',T,S};
  718. scan_standalone_value("\"no\"" ++T,S0) ->
  719. ?bump_col(4),
  720. {'no',T,S}.
  721. %%%
  722. %%% Text declaration XML 1.0 section 4.3.1
  723. %%% [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
  724. scan_text_decl(T,S=#xmerl_scanner{event_fun = Event}) ->
  725. {#xmlDecl{attributes=Attrs}=Decl0,T1,S1} = scan_optional_version(T,S),
  726. "encoding" ++ T2 = T1,
  727. S2 = S1#xmerl_scanner{col = S1#xmerl_scanner.col + 8},
  728. {T3, S3} = scan_eq(T2, S2),
  729. {EncName, T4, S4} = scan_enc_name(T3, S3),
  730. LowEncName=httpd_util:to_lower(EncName),
  731. ?strip5,
  732. Attr = #xmlAttribute{name = encoding,
  733. parents = [{xml,1}],
  734. value = LowEncName},
  735. Decl = Decl0#xmlDecl{encoding = LowEncName,
  736. attributes = [Attr|Attrs]},
  737. S6=#xmerl_scanner{} = Event(#xmerl_event{event = ended,
  738. line = S5#xmerl_scanner.line,
  739. col = S5#xmerl_scanner.col,
  740. data = Attr}, S5),
  741. scan_text_decl(T5,S6,Decl).
  742. scan_text_decl("?>"++T,S0 = #xmerl_scanner{hook_fun = Hook,
  743. event_fun = Event},
  744. Decl0 = #xmlDecl{attributes = Attrs}) ->
  745. ?bump_col(2),
  746. ?strip1,
  747. Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
  748. S2 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
  749. line = S0#xmerl_scanner.line,
  750. col = S0#xmerl_scanner.col,
  751. data = Decl}, S1),
  752. {Ret, S3} = Hook(Decl, S2),
  753. {Ret, T1, S3}.
  754. scan_optional_version("version"++T,S0) ->
  755. ?bump_col(7),
  756. ?strip1,
  757. {T2, S2} = scan_eq(T1, S1),
  758. {Vsn, T3, S3} = scan_xml_vsn(T2, S2),
  759. {_,T4,S4} = mandatory_strip(T3,S3),
  760. Attr = #xmlAttribute{name = version,parents = [{xml,1}],value = Vsn},
  761. {#xmlDecl{attributes=[Attr]},T4,S4};
  762. scan_optional_version(T,S) ->
  763. {#xmlDecl{attributes=[]},T,S}.
  764. %%%%%%% [81] EncName
  765. scan_enc_name([], S=#xmerl_scanner{continuation_fun = F}) ->
  766. ?dbg("cont()...~n", []),
  767. F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1) end,
  768. fun(S1) -> ?fatal(expected_encoding_name, S1) end,
  769. S);
  770. scan_enc_name([H|T], S0) when H >= $"; H =< $' ->
  771. ?bump_col(1),
  772. scan_enc_name(T, S, H, []).
  773. scan_enc_name([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
  774. ?dbg("cont()...~n", []),
  775. F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1, Delim, Acc) end,
  776. fun(S1) -> ?fatal(expected_encoding_name, S1) end,
  777. S);
  778. scan_enc_name([H|T], S0, Delim, Acc) when H >= $a, H =< $z ->
  779. ?bump_col(1),
  780. scan_enc_name2(T, S, Delim, [H|Acc]);
  781. scan_enc_name([H|T], S0, Delim, Acc) when H >= $A, H =< $Z ->
  782. ?bump_col(1),
  783. scan_enc_name2(T, S, Delim, [H|Acc]);
  784. scan_enc_name([H|_T],S,_Delim,_Acc) ->
  785. ?fatal({error,{unexpected_character_in_Enc_Name,H}},S).
  786. scan_enc_name2([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
  787. ?dbg("cont()...~n", []),
  788. F(fun(MoreBytes, S1) -> scan_enc_name2(MoreBytes, S1, Delim, Acc) end,
  789. fun(S1) -> ?fatal(expected_encoding_name, S1) end,
  790. S);
  791. scan_enc_name2([H|T], S0, H, Acc) ->
  792. ?bump_col(1),
  793. {lists:reverse(Acc), T, S};
  794. scan_enc_name2([H|T], S0, Delim, Acc) when H >= $a, H =< $z ->
  795. ?bump_col(1),
  796. scan_enc_name2(T, S, Delim, [H|Acc]);
  797. scan_enc_name2([H|T], S0, Delim, Acc) when H >= $A, H =< $Z ->
  798. ?bump_col(1),
  799. scan_enc_name2(T, S, Delim, [H|Acc]);
  800. scan_enc_name2([H|T], S0, Delim, Acc) when H >= $0, H =< $9 ->
  801. ?bump_col(1),
  802. scan_enc_name2(T, S, Delim, [H|Acc]);
  803. scan_enc_name2([H|T], S0, Delim, Acc) when H == $.; H == $_; H == $- ->
  804. ?bump_col(1),
  805. scan_enc_name2(T, S, Delim, [H|Acc]).
  806. %%%%%%% [26] VersionNum
  807. %%% VersionNum ::= ([a-zA-Z0-9_.:] | '-')+
  808. scan_xml_vsn([], S=#xmerl_scanner{continuation_fun = F}) ->
  809. ?dbg("cont()...~n", []),
  810. F(fun(MoreBytes, S1) -> scan_xml_vsn(MoreBytes, S1) end,
  811. fun(S1) -> ?fatal(unexpected_end, S1) end,
  812. S);
  813. scan_xml_vsn([H|T], S) when H==$"; H==$'->
  814. xml_vsn(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}, H, []).
  815. xml_vsn([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
  816. ?dbg("cont()...~n", []),
  817. F(fun(MoreBytes, S1) -> xml_vsn(MoreBytes, S1, Delim, Acc) end,
  818. fun(S1) -> ?fatal(unexpected_end, S1) end,
  819. S);
  820. xml_vsn([H|T], S=#xmerl_scanner{col = C}, H, Acc) ->
  821. {lists:reverse(Acc), T, S#xmerl_scanner{col = C+1}};
  822. xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $a, H =< $z ->
  823. xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
  824. xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $A, H =< $Z ->
  825. xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
  826. xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $0, H =< $9 ->
  827. xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
  828. xml_vsn([H|T], S=#xmerl_scanner{col = C}, Delim, Acc) ->
  829. case lists:member(H, "_.:-") of
  830. true ->
  831. xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
  832. false ->
  833. ?fatal({invalid_vsn_char, H}, S)
  834. end.
  835. %%%%%%% [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
  836. scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
  837. ?dbg("cont()...~n", []),
  838. F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Pos) end,
  839. fun(S1) -> ?fatal(unexpected_end, S1) end,
  840. S);
  841. scan_pi(Str = [H1,H2,H3 | T],S=#xmerl_scanner{line = L, col = C}, Pos)
  842. when H1==$x;H1==$X ->
  843. %% names beginning with [xX][mM][lL] are reserved for future use.
  844. if
  845. ((H2==$m) or (H2==$M)) and
  846. ((H3==$l) or (H3==$L)) ->
  847. scan_wellknown_pi(T,S,Pos);
  848. true ->
  849. {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
  850. scan_pi(T1, S1, Target, L, C, Pos, [])
  851. end;
  852. scan_pi(Str, S=#xmerl_scanner{line = L, col = C}, Pos) ->
  853. {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
  854. scan_pi(T1, S1, Target, L, C, Pos,[]).
  855. %%% More info on xml-stylesheet can be found at:
  856. %%% "Associating Style Sheets with XML documents", Version 1.0,
  857. %%% W3C Recommendation 29 June 1999 (http://www.w3.org/TR/xml-stylesheet/)
  858. scan_wellknown_pi("-stylesheet"++T, S0=#xmerl_scanner{line=L,col=C},Pos) ->
  859. ?dbg("prolog(\"<?xml-stylesheet\")~n", []),
  860. ?bump_col(16),
  861. scan_pi(T, S, "xml-stylesheet",L,C,Pos,[]);
  862. scan_wellknown_pi(Str,S,_Pos) ->
  863. ?fatal({invalid_target_name, lists:sublist(Str, 1, 10)}, S).
  864. % scan_pi(Str="?>"++_T,S,Target, L, C, Pos) ->
  865. % scan_pi(Str,S,Target, L, C, Pos,[]);
  866. % scan_pi(Str=[],S,Target, L, C, Pos) ->
  867. % scan_pi(Str,S,Target, L, C, Pos,[]);
  868. % scan_pi(T,S,Target, L, C, Pos) ->
  869. % {_,T1,S1} = mandatory_strip(T,S),
  870. % scan_pi(T1,S1,Target, L, C, Pos,[]).
  871. scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Target,L, C, Pos, Acc) ->
  872. ?dbg("cont()...~n", []),
  873. F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Target, L, C, Pos, Acc) end,
  874. fun(S1) -> ?fatal(unexpected_end, S1) end,
  875. S);
  876. scan_pi("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
  877. event_fun = Event},
  878. Target, L, C, Pos, Acc) ->
  879. ?bump_col(2),
  880. PI = #xmlPI{name = Target,
  881. pos = Pos,
  882. value = lists:reverse(Acc)},
  883. S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
  884. line = L,
  885. col = C,
  886. data = PI}, S),
  887. {Ret, S2} = Hook(PI, S1),
  888. {Ret, T, S2};
  889. scan_pi([H|T], S, Target, L, C, Pos, Acc) when ?whitespace(H) ->
  890. ?strip1,
  891. scan_pi2(T1, S1, Target, L, C, Pos, Acc).
  892. scan_pi2([], S=#xmerl_scanner{continuation_fun = F}, Target,L, C, Pos, Acc) ->
  893. ?dbg("cont()...~n", []),
  894. F(fun(MoreBytes, S1) -> scan_pi2(MoreBytes, S1, Target, L, C, Pos, Acc) end,
  895. fun(S1) -> ?fatal(unexpected_end, S1) end,
  896. S);
  897. scan_pi2("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
  898. event_fun = Event},
  899. Target, L, C, Pos, Acc) ->
  900. ?bump_col(2),
  901. PI = #xmlPI{name = Target,
  902. pos = Pos,
  903. value = lists:reverse(Acc)},
  904. S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
  905. line = L,
  906. col = C,
  907. data = PI}, S),
  908. {Ret, S2} = Hook(PI, S1),
  909. {Ret, T, S2};
  910. scan_pi2([H|T], S0, Target, L, C, Pos, Acc) ->
  911. ?bump_col(1),
  912. wfc_legal_char(H,S),
  913. scan_pi2(T, S, Target, L, C, Pos, [H|Acc]).
  914. %% [28] doctypedecl ::=
  915. %% '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
  916. scan_doctype([], S=#xmerl_scanner{continuation_fun = F}) ->
  917. ?dbg("cont()...~n", []),
  918. F(fun(MoreBytes, S1) -> scan_doctype(MoreBytes, S1) end,
  919. fun(S1) -> ?fatal(unexpected_end, S1) end,
  920. S);
  921. scan_doctype(T, S) ->
  922. {_,T1,S1} = mandatory_strip(T,S),
  923. {DTName, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
  924. ?strip3,
  925. scan_doctype1(T3, S3#xmerl_scanner{doctype_name = DTName}).
  926. %% [75] ExternalID ::= 'SYSTEM' S SystemLiteral
  927. %% | 'PUBLIC' S PubidLiteral S SystemLiteral
  928. scan_doctype1([], S=#xmerl_scanner{continuation_fun = F}) ->
  929. ?dbg("cont()...~n", []),
  930. F(fun(MoreBytes, S1) -> scan_doctype1(MoreBytes, S1) end,
  931. fun(S1) -> ?fatal(unexpected_end, S1) end,
  932. S);
  933. scan_doctype1("PUBLIC" ++ T, S0) ->
  934. ?bump_col(6),
  935. {_,T1,S1} = mandatory_strip(T,S),
  936. {PIDL, T2, S2} = scan_pubid_literal(T1, S1),
  937. {_,T3,S3} = mandatory_strip(T2,S2),
  938. {SL, T4, S4} = scan_system_literal(T3, S3),
  939. ?strip5,
  940. scan_doctype2(T5, S5, {public, PIDL, SL});
  941. scan_doctype1("SYSTEM" ++ T, S0) ->
  942. ?bump_col(6),
  943. {_,T1,S1} = mandatory_strip(T,S),
  944. {SL, T2, S2} = scan_system_literal(T1, S1),
  945. ?strip3,
  946. scan_doctype2(T3, S3, {system, SL});
  947. scan_doctype1(T, S) ->
  948. scan_doctype2(T, S, undefined).
  949. scan_doctype2([], S=#xmerl_scanner{continuation_fun = F},DTD) ->
  950. ?dbg("cont()...~n", []),
  951. F(fun(MoreBytes, S1) -> scan_doctype2(MoreBytes, S1, DTD) end,
  952. fun(S1) -> ?fatal(unexpected_end, S1) end,
  953. S);
  954. scan_doctype2("[" ++ T, S0, DTD) ->
  955. ?bump_col(1),
  956. ?strip1,
  957. scan_doctype3(T1, S1, DTD);
  958. scan_doctype2(">" ++ T, S0, DTD) ->
  959. ?bump_col(1),
  960. ?strip1,
  961. S2 = fetch_DTD(DTD, S1),
  962. check_decl(S2),
  963. {T1, S2}.
  964. %% [28a] DeclSep ::= PEReference | S
  965. %% [28b] intSubset ::= (markupdecl | DeclSep)*
  966. scan_doctype3([], S=#xmerl_scanner{continuation_fun = F},DTD) ->
  967. ?dbg("cont()...~n", []),
  968. F(fun(MoreBytes, S1) -> scan_doctype3(MoreBytes, S1,DTD) end,
  969. fun(S1) -> ?fatal(unexpected_end, S1) end,
  970. S);
  971. scan_doctype3("%" ++ T, S0, DTD) ->
  972. ?bump_col(1),
  973. {PERefName, T1, S1} = scan_pe_reference(T, S),
  974. ?strip2,
  975. case expand_pe_reference(PERefName, S2,as_PE) of
  976. {system, _} = Name ->
  977. S3 = fetch_DTD(Name, S2),
  978. scan_doctype3(T2, S3, DTD);
  979. {public, _} = Name ->
  980. S3 = fetch_DTD(Name, S2),
  981. scan_doctype3(T2, S3, DTD);
  982. {public, _, _} = Name ->
  983. S3 = fetch_DTD(Name, S2),
  984. scan_doctype3(T2, S3, DTD);
  985. ExpRef when list(ExpRef) -> % Space added, see Section 4.4.8
  986. {_,T3,S3} = strip(ExpRef++T2,S2),
  987. scan_doctype3(T3,S3,DTD)
  988. end;
  989. scan_doctype3("]" ++ T, S0, DTD) ->
  990. ?bump_col(1),
  991. ?strip1,
  992. S2 = fetch_DTD(DTD, S1),
  993. check_decl(S2),
  994. ">" ++ T2 = T1,
  995. {T2, S2};
  996. scan_doctype3(T, S, DTD) ->
  997. {_, T1, S1} = scan_markup_decl(T, S),
  998. scan_doctype3(T1, S1, DTD).
  999. fetch_DTD(undefined, S=#xmerl_scanner{doctype_DTD=URI}) when list(URI)->
  1000. %% allow to specify DTD name when it isn't available in xml stream
  1001. fetch_DTD({system,URI},S);
  1002. fetch_DTD(undefined, S) ->
  1003. S;
  1004. fetch_DTD(DTDSpec, S)->
  1005. case fetch_and_parse(DTDSpec,S,[{text_decl,true},
  1006. {environment,{external,subset}}]) of
  1007. NewS when record(NewS,xmerl_scanner) ->
  1008. NewS;
  1009. {_Res,_Tail,_Sx} -> % Continue with old scanner data, result in Rules
  1010. S
  1011. end.
  1012. fetch_and_parse(ExtSpec,S=#xmerl_scanner{fetch_fun=Fetch,
  1013. rules=Rules,
  1014. xmlbase = XMLBase},
  1015. Options0) ->
  1016. RetS =
  1017. case Fetch(ExtSpec, S) of
  1018. {ok, NewS} ->
  1019. %% For backward compatibility only. This will be removed later!!
  1020. NewS;
  1021. {ok, not_fetched,NewS} ->
  1022. NewS;
  1023. {ok, DataRet, NewS = #xmerl_scanner{user_state = UState,
  1024. event_fun = Event,
  1025. hook_fun = Hook,
  1026. fetch_fun = Fetch1,
  1027. close_fun = Close1,
  1028. continuation_fun = Cont,
  1029. acc_fun = Acc,
  1030. rules_read_fun = Read,
  1031. rules_write_fun = Write,
  1032. validation = Valid,
  1033. quiet = Quiet,
  1034. encoding = Charset
  1035. }} ->
  1036. EvS = event_state(NewS),
  1037. HoS = hook_state(NewS),
  1038. FeS = fetch_state(NewS),
  1039. CoS = cont_state(NewS),
  1040. Options = Options0++[{user_state, UState},
  1041. {rules, Rules},
  1042. {event_fun, Event, EvS},
  1043. {hook_fun, Hook, HoS},
  1044. {fetch_fun, Fetch1, FeS},
  1045. {close_fun, Close1},
  1046. {continuation_fun, Cont, CoS},
  1047. {rules, Read, Write, ""},
  1048. {acc_fun, Acc},
  1049. {validation,Valid},
  1050. {quiet,Quiet},
  1051. {encoding,Charset}],
  1052. case DataRet of
  1053. {file, F} ->
  1054. int_file_decl(F, Options,Charset);
  1055. {string, String} ->
  1056. int_string_decl(String, Options,XMLBase);
  1057. _ ->
  1058. %% other scheme
  1059. {DataRet,[],NewS}
  1060. end;
  1061. Error ->
  1062. ?fatal({error_fetching_DTD, {ExtSpec, Error}}, S)
  1063. end,
  1064. case RetS of
  1065. #xmerl_scanner{} ->
  1066. RetS#xmerl_scanner{text_decl=false,
  1067. environment=S#xmerl_scanner.environment};
  1068. _ -> RetS
  1069. end.
  1070. fetch_not_parse(ExtSpec,S=#xmerl_scanner{fetch_fun=Fetch}) ->
  1071. case Fetch(ExtSpec,S) of
  1072. {ok, not_fetched,_NewS} ->
  1073. ?fatal({error_fetching_external_source,ExtSpec},S);
  1074. {ok, DataRet, NewS} ->
  1075. String =
  1076. case DataRet of
  1077. {file,F} ->
  1078. get_file(F,S);
  1079. {string,Str} ->
  1080. binary_to_list(Str);
  1081. _ -> DataRet
  1082. end,
  1083. {String, NewS};
  1084. _ ->
  1085. ?fatal({error_fetching_external_resource,ExtSpec},S)
  1086. end.
  1087. get_file(F,S) ->
  1088. % io:format("get_file F=~p~n",[F]),
  1089. case file:read_file(F) of
  1090. {ok,Bin} ->
  1091. binary_to_list(Bin);
  1092. Err ->
  1093. ?fatal({error_reading_file,F,Err},S)
  1094. end.
  1095. %% check_decl/1
  1096. %% Now it is necessary to check that all referenced types is declared,
  1097. %% since it is legal to reference some xml types before they are
  1098. %% declared.
  1099. check_decl(#xmerl_scanner{validation=false}) ->
  1100. ok;
  1101. check_decl(#xmerl_scanner{rules=Tab} = S) ->
  1102. check_notations(Tab,S),
  1103. check_elements(Tab,S), %% check also attribute defs for element
  1104. check_entities(Tab,S).
  1105. check_notations(Tab,S) ->
  1106. case ets:match(Tab,{{notation,'$1'},undeclared}) of
  1107. [[]] -> ok;
  1108. [] -> ok;
  1109. [L] when list(L) ->
  1110. ?fatal({error_missing_declaration_in_DTD,hd(L)},S);
  1111. Err ->
  1112. ?fatal({error_missing_declaration_in_DTD,Err},S)
  1113. end.
  1114. check_elements(Tab,S) ->
  1115. case ets:match(Tab,{{elem_def,'_'},'$2'},10) of
  1116. {_,_}=M ->
  1117. Fun = fun({Match,'$end_of_table'},_F) ->
  1118. lists:foreach(fun(X)->check_elements2(X,S) end,
  1119. Match),
  1120. ok;
  1121. ('$end_of_table',_) ->
  1122. ok;
  1123. ({Match,Cont},F) ->
  1124. lists:foreach(fun(X)->check_elements2(X,S) end,
  1125. Match),
  1126. F(ets:match(Cont),F)
  1127. end,
  1128. Fun(M,Fun);
  1129. '$end_of_table' -> ok;
  1130. Err -> ?fatal({error_missing_declaration_in_DTD,Err},S)
  1131. end.
  1132. % it is not an error to declare attributes for an element that is not
  1133. % declared.
  1134. check_elements2([#xmlElement{attributes=Attrs}],S) ->
  1135. check_attributes(Attrs,S);
  1136. check_elements2(_,_) ->
  1137. ok.
  1138. check_attributes([{N1,'ID',_,_,_}=Attr|Rest],S) ->
  1139. case lists:keysearch('ID',2,Rest) of
  1140. {value,Att2} ->
  1141. ?fatal({error_more_than_one_ID_def,N1,element(1,Att2)},S);
  1142. _ ->
  1143. ok
  1144. end,
  1145. vc_ID_Attribute_Default(Attr,S),
  1146. check_attributes(Rest,S);
  1147. check_attributes([{_,{enumeration,_},_,_,_}=Attr|T],S) ->
  1148. vc_Enumeration(Attr,S),
  1149. check_attributes(T,S);
  1150. check_attributes([{_,Ent,_,_,_}=Attr|T],S)
  1151. when Ent=='ENTITY';Ent=='ENTITIES' ->
  1152. vc_Entity_Name(Attr,S),
  1153. check_attributes(T,S);
  1154. check_attributes([_|T],S) ->
  1155. check_attributes(T,S);
  1156. check_attributes([],_S) ->
  1157. ok.
  1158. check_entities(Tab,S=#xmerl_scanner{validation=true}) ->
  1159. case ets:match(Tab,{{entity,'$1'},undeclared}) of
  1160. [[]] -> ok;
  1161. [] -> ok;
  1162. [L] when list(L) ->
  1163. ?fatal({error_missing_declaration_in_DTD,hd(L)},S);
  1164. Err ->
  1165. ?fatal({error_missing_declaration_in_DTD,Err},S)
  1166. end;
  1167. check_entities(_,_) ->
  1168. ok.
  1169. %% check_decl2/1: checks that all referenced ID attributes are declared
  1170. check_decl2(S=#xmerl_scanner{rules=Tab}) ->
  1171. check_referenced_ids(Tab,S).
  1172. check_referenced_ids(Tab,S) ->
  1173. case ets:match(Tab,{{id,'$1'},undeclared}) of
  1174. [[]] -> ok;
  1175. [] -> ok;
  1176. [L] when list(L) ->
  1177. ?fatal({error_missing_declaration_in_DTD,hd(L)},S);
  1178. Err ->
  1179. ?fatal({error_missing_declaration_in_DTD,Err},S)
  1180. end.
  1181. %%%%%%% [30] extSubSet ::= TextDecl? extSubsetDecl
  1182. scan_ext_subset([], S=#xmerl_scanner{continuation_fun = F}) ->
  1183. ?dbg("cont()...~n", []),
  1184. F(fun(MoreBytes, S1) -> scan_ext_subset(MoreBytes, S1) end,
  1185. fun(S1) -> {[], S1} end,
  1186. S);
  1187. scan_ext_subset("%" ++ T, S0) ->
  1188. %% DeclSep [28a]: WFC: PE Between Declarations.
  1189. %% The replacement text of a parameter entity reference in a
  1190. %% DeclSep must match the production extSubsetDecl.
  1191. ?bump_col(1),
  1192. {_,T1,S1} = scan_decl_sep(T,S),
  1193. scan_ext_subset(T1, S1);
  1194. scan_ext_subset("<![" ++ T, S0) ->
  1195. ?bump_col(3),
  1196. ?strip1,
  1197. {_, T2, S2} = scan_conditional_sect(T1, S1),
  1198. scan_ext_subset(T2,S2);
  1199. scan_ext_subset(T, S) when ?whitespace(hd(T)) ->
  1200. {_,T1,S1} = strip(T,S),
  1201. scan_ext_subset(T1, S1);
  1202. scan_ext_subset(T, S) ->
  1203. {_, T1, S1} = scan_markup_decl(T, S),
  1204. scan_ext_subset(T1, S1).
  1205. %%%%%%% [28a] DeclSep ::= PEReference | S
  1206. scan_decl_sep(T,S=#xmerl_scanner{rules_read_fun=Read,
  1207. rules_write_fun=Write,
  1208. rules_delete_fun=Delete}) ->
  1209. {PERefName, T1, S1} = scan_pe_reference(T, S),
  1210. {ExpandedRef,S2} =
  1211. case expand_pe_reference(PERefName,S1,as_PE) of
  1212. Tuple when tuple(Tuple) ->
  1213. %% {system,URI} or {public,URI}
  1214. {ExpRef,_Sx}=fetch_not_parse(Tuple,S1),
  1215. {EntV,_,_S2} = scan_entity_value(ExpRef, S1, no_delim,
  1216. PERefName,parameter),
  1217. %% should do an update Write(parameter_entity) so next
  1218. %% expand_pe_reference is faster
  1219. Delete(parameter_entity,PERefName,_S2),
  1220. _S3 = Write(parameter_entity,PERefName,EntV,_S2),
  1221. EntV2 = Read(parameter_entity,PERefName,_S3),
  1222. {" " ++ EntV2 ++ " ",_S3};
  1223. ExpRef ->
  1224. {ExpRef,S1}
  1225. end,
  1226. {_, T3, S3} = strip(ExpandedRef,S2),
  1227. {_T4,S4} = scan_ext_subset(T3,S3),
  1228. strip(T1,S4).
  1229. %%%%%%% [61] ConditionalSect ::= includeSect | ignoreSect
  1230. scan_conditional_sect([], S=#xmerl_scanner{continuation_fun = F}) ->
  1231. ?dbg("cont()...~n", []),
  1232. F(fun(MoreBytes, S1) -> scan_conditional_sect(MoreBytes, S1) end,
  1233. fun(S1) -> ?fatal(unexpected_end, S1) end,
  1234. S);
  1235. scan_conditional_sect("IGNORE" ++ T, S0) ->
  1236. ?bump_col(6),
  1237. ?strip1,
  1238. "[" ++ T2 = T1,
  1239. {_,T3,S3} = strip(T2,S1),
  1240. scan_ignore(T3,S3);
  1241. scan_conditional_sect("INCLUDE" ++ T, S0) ->
  1242. ?bump_col(7),
  1243. ?strip1,
  1244. "[" ++ T2 = T1,
  1245. {_,T3,S3} = strip(T2,S1),
  1246. scan_include(T3, S3);
  1247. scan_conditional_sect("%"++T,S0) ->
  1248. ?bump_col(1),
  1249. ?bump_col(1),
  1250. {PERefName, T1, S1} = scan_pe_reference(T, S),
  1251. ExpRef = expand_pe_reference(PERefName, S1,as_PE),
  1252. {_,T2,S2} = strip(ExpRef ++ T1,S1),
  1253. scan_conditional_sect(T2,S2).
  1254. %%%% [63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>'
  1255. %%%% [64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)*
  1256. %%%% [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)
  1257. scan_ignore(Str,S) ->
  1258. scan_ignore(Str,S,0).
  1259. scan_ignore([], S=#xmerl_scanner{continuation_fun = F},Level) ->
  1260. ?dbg("cont()...~n", []),
  1261. F(fun(MoreBytes, S1) -> scan_ignore(MoreBytes, S1,Level) end,
  1262. fun(S1) -> ?fatal(unexpected_end, S1) end,
  1263. S);
  1264. scan_ignore("<![" ++ T, S0,Level) ->
  1265. %% nested conditional section. Topmost condition is ignore, though
  1266. ?bump_col(3),
  1267. scan_ignore(T, S,Level+1);
  1268. scan_ignore("]]>" ++ T, S0,0) ->
  1269. ?bump_col(3),
  1270. {[], T, S};
  1271. scan_ignore("]]>" ++ T, S0,Level) ->
  1272. ?bump_col(3),
  1273. scan_ignore(T, S,Level-1);
  1274. scan_ignore([_H|T],S0,Level) ->
  1275. ?bump_col(1),
  1276. scan_ignore(T,S,Level).
  1277. %%%%%%% [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'
  1278. scan_include([], S=#xmerl_scanner{continuation_fun = F}) ->
  1279. ?dbg("cont()...~n", []),
  1280. F(fun(MoreBytes, S1) -> scan_include(MoreBytes, S1) end,
  1281. fun(S1) -> ?fatal(unexpected_end, S1) end,
  1282. S);
  1283. scan_include("]]>" ++ T, S0) ->
  1284. ?bump_col(3),
  1285. {[], T, S};
  1286. scan_include("%" ++ T, S0) ->
  1287. ?bump_col(1),
  1288. {PERefName, T1, S1} = scan_pe_reference(T, S),
  1289. ExpRef = expand_pe_reference(PERefName, S1,as_PE),
  1290. {_,T2,S2} = strip(ExpRef ++ T1,S1),
  1291. scan_include(T2, S2);
  1292. scan_include("<![" ++ T, S0) ->
  1293. ?bump_col(3),
  1294. ?strip1,
  1295. {_, T2, S2} = scan_conditional_sect(T1, S1),
  1296. ?strip3,
  1297. scan_include(T3,S3);
  1298. scan_include(T, S) ->
  1299. {_, T1, S1} = scan_markup_decl(T, S),
  1300. scan_include(T1, S1).
  1301. %%%%%%% [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl |
  1302. %%%%%%% NotationDecl | PI |Comment
  1303. %%%%%%% [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
  1304. %% Validity constraint: Unique Type Declaration: No element type may be
  1305. %% declared more than once.
  1306. %%
  1307. scan_markup_decl([], S=#xmerl_scanner{continuation_fun = F}) ->
  1308. ?dbg("cont()...~n", []),
  1309. F(fun(MoreBytes, S1) -> scan_markup_decl(MoreBytes, S1) end,
  1310. fun(S1) -> {[], [], S1} end,
  1311. S);
  1312. scan_markup_decl("<!--" ++ T, S0) ->
  1313. ?bump_col(4),
  1314. {_, T1, S1} = scan_comment(T, S),
  1315. ?strip2;
  1316. scan_markup_decl("<?" ++ T, S0) ->
  1317. ?bump_col(2),
  1318. {_PI, T1, S1} = scan_pi(T, S,_Pos=markup),
  1319. ?strip2;
  1320. scan_markup_decl("<!ELEMENT" ++ T,
  1321. #xmerl_scanner{rules_read_fun = Read,
  1322. rules_write_fun = Write,
  1323. rules_delete_fun = Delete} = S0) ->
  1324. ?bump_col(9),
  1325. {_,T1,S1} = mandatory_strip(T,S),
  1326. {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
  1327. Element =
  1328. case Read(elem_def, Ename, S2) of
  1329. El = #xmlElement{elementdef=Decl} when Decl /= undeclared ->
  1330. case S2#xmerl_scanner.validation of
  1331. true ->
  1332. ?fatal({already_defined, Ename}, S2);
  1333. _ ->
  1334. Delete(elem_def,Ename,S2),
  1335. El
  1336. end;
  1337. El = #xmlElement{} ->
  1338. Delete(elem_def,Ename,S2),
  1339. El;
  1340. undefined ->
  1341. #xmlElement{}
  1342. …

Large files are truncated, but you can click here to view the full file