PageRenderTime 88ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 1ms

/lib/xmerl/src/xmerl_scan.erl

http://github.com/gebi/jungerl
Erlang | 3778 lines | 2842 code | 378 blank | 558 comment | 32 complexity | 573894cf37ee0fe26e2519ee216f684f MD5 | raw file
Possible License(s): AGPL-1.0, JSON, LGPL-2.1, BSD-3-Clause
  1. %%% The contents of this file are subject to the Erlang Public License,
  2. %%% Version 1.0, (the "License"); you may not use this file except in
  3. %%% compliance with the License. You may obtain a copy of the License at
  4. %%% http://www.erlang.org/license/EPL1_0.txt
  5. %%%
  6. %%% Software distributed under the License is distributed on an "AS IS"
  7. %%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
  8. %%% the License for the specific language governing rights and limitations
  9. %%% under the License.
  10. %%%
  11. %%% The Original Code is xmerl-0.15
  12. %%%
  13. %%% The Initial Developer of the Original Code is Ericsson Telecom
  14. %%% AB. Portions created by Ericsson are Copyright (C), 1998, Ericsson
  15. %%% Telecom AB. All Rights Reserved.
  16. %%%
  17. %%% Contributor(s):
  18. %%% Mickael Remond <mickael.remond@IDEALX.com>:
  19. %%% Johan Blom <johan.blom@mobilearts.se>
  20. %%% Richard Carlsson
  21. %%% Fredrik Linder
  22. %%%
  23. %%%----------------------------------------------------------------------
  24. %%% #0. BASIC INFORMATION
  25. %%%----------------------------------------------------------------------
  26. %%% File: xmerl_scan.erl
  27. %%% Author : Ulf Wiger <ulf.wiger@ericsson.com>
  28. %%% Description : Single-pass XML scanner. See xmerl.hrl for data defs.
  29. %%%
  30. %%% Modules used : ets, file, filename, io, lists, ucs, uri
  31. %%%
  32. %%%----------------------------------------------------------------------
  33. %% @doc
  34. %% The XML parser is activated through
  35. %% <tt>xmerl_scan:string/[1,2]</tt> or
  36. %% <tt>xmerl_scan:file/[1,2]</tt>.
  37. %% It returns records of the type defined in xmerl.hrl.
  38. %% See also <a href="xmerl_examples.html">tutorial</a> on customization
  39. %% functions.
  40. %% @type global_state(). <p>
  41. %% The global state of the scanner, represented by the #xmerl_scanner{} record.
  42. %% </p>
  43. %% @type option_list(). <p>Options allow customizing the behaviour of the
  44. %% scanner.
  45. %% See also <a href="xmerl_examples.html">tutorial</a> on customization
  46. %% functions.
  47. %% </p>
  48. %% Possible options are:
  49. %% <dl>
  50. %% <dt><code>{acc_fun, Fun}</code></dt>
  51. %% <dd>Call back function to accumulate contents of entity.</dd>
  52. %% <dt><code>{continuation_fun, Fun} |
  53. %% {continuation_fun, Fun, ContinuationState}</code></dt>
  54. %% <dd>Call back function to decide what to do if the scanner runs into eof
  55. %% before the document is complete.</dd>
  56. %% <dt><code>{event_fun, Fun} |
  57. %% {event_fun, Fun, EventState}</code></dt>
  58. %% <dd>Call back function to handle scanner events.</dd>
  59. %% <dt><code>{fetch_fun, Fun} |
  60. %% {fetch_fun, Fun, FetchState}</code></dt>
  61. %% <dd>Call back function to fetch an external resource.</dd>
  62. %% <dt><code>{hook_fun, Fun} |
  63. %% {hook_fun, Fun, HookState}</code></dt>
  64. %% <dd>Call back function to process the document entities once
  65. %% identified.</dd>
  66. %% <dt><code>{close_fun, Fun}</code></dt>
  67. %% <dd>Called when document has been completely parsed.</dd>
  68. %% <dt><code>{rules, ReadFun, WriteFun, RulesState} |
  69. %% {rules, Rules}</code></dt>
  70. %% <dd>Handles storing of scanner information when parsing.</dd>
  71. %% <dt><code>{user_state, UserState}</code></dt>
  72. %% <dd>Global state variable accessible from all customization functions</dd>
  73. %%
  74. %% <dt><code>{fetch_path, PathList}</code></dt>
  75. %% <dd>PathList is a list of
  76. %% directories to search when fetching files. If the file in question
  77. %% is not in the fetch_path, the URI will be used as a file
  78. %% name.</dd>
  79. %% <dt><code>{space, Flag}</code></dt>
  80. %% <dd>'preserve' (default) to preserve spaces, 'normalize' to
  81. %% accumulate consecutive whitespace and replace it with one space.</dd>
  82. %% <dt><code>{line, Line}</code></dt>
  83. %% <dd>To specify starting line for scanning in document which contains
  84. %% fragments of XML.</dd>
  85. %% <dt><code>{namespace_conformant, Flag}</code></dt>
  86. %% <dd>Controls whether to behave as a namespace conformant XML parser,
  87. %% 'false' (default) to not otherwise 'true'.</dd>
  88. %% <dt><code>{validation, Flag}</code></dt>
  89. %% <dd>Controls whether to process as a validating XML parser,
  90. %% 'false' (default) to not otherwise 'true'.</dd>
  91. %% <dt><code>{quiet, Flag}</code></dt>
  92. %% <dd>Set to 'true' if xmerl should behave quietly and not output any info
  93. %% to standard output (default 'false').</dd>
  94. %% <dt><code>{doctype_DTD, DTD}</code></dt>
  95. %% <dd>Allows to specify DTD name when it isn't available in the XML
  96. %% document.</dd>
  97. %% <dt><code>{xmlbase, Dir}</code></dt>
  98. %% <dd>XML Base directory. If using string/1 default is current directory.
  99. %% If using file/1 default is directory of given file.</dd>
  100. %% <dt><code>{encoding, Enc}</code></dt>
  101. %% <dd>Set default character set used (default UTF-8).
  102. %% This character set is used only if not explicitly given by the XML
  103. %% declaration. </dd>
  104. %% </dl>
  105. %% @end
  106. %% Only used internally are:
  107. %% <dt><code>{environment,Env}</code></dt>
  108. %% <dd>What is this?</dd>
  109. %% <dt><code>{text_decl,Bool}</code></dt>
  110. %% <dd>What is this?</dd>
  111. -module(xmerl_scan).
  112. -vsn('0.19').
  113. -date('03-09-16').
  114. %% main API
  115. -export([string/1, string/2,
  116. file/1, file/2]).
  117. %% access functions for various states
  118. -export([user_state/1, user_state/2,
  119. event_state/1, event_state/2,
  120. hook_state/1, hook_state/2,
  121. rules_state/1, rules_state/2,
  122. fetch_state/1, fetch_state/2,
  123. cont_state/1, cont_state/2]).
  124. %% helper functions. To xmerl_lib ??
  125. -export([accumulate_whitespace/4]).
  126. %-define(debug, 1).
  127. -include("xmerl.hrl"). % record def, macros
  128. -include_lib("kernel/include/file.hrl").
  129. -define(fatal(Reason, S),
  130. if
  131. S#xmerl_scanner.quiet ->
  132. ok;
  133. true ->
  134. ok=io:format("~p- fatal: ~p~n", [?LINE, Reason])
  135. end,
  136. fatal(Reason, S)).
  137. -define(ustate(U, S), S#xmerl_scanner{user_state = U}).
  138. %% Functions to access the various states
  %%% @spec user_state(S::global_state()) -> term()
  %%% @doc Returns the value held in the <code>user_state</code> field of the
  %%% scanner state <code>S</code>.
  user_state(#xmerl_scanner{} = Scanner) ->
      Scanner#xmerl_scanner.user_state.
  %%% @spec event_state(S::global_state()) -> term()
  %%% @doc Returns the <code>event</code> entry of the function states kept in
  %%% the scanner state <code>S</code>.
  event_state(#xmerl_scanner{fun_states = #xmerl_fun_states{} = FunStates}) ->
      FunStates#xmerl_fun_states.event.
  %%% @spec hook_state(S::global_state()) -> term()
  %%% @doc Returns the <code>hook</code> entry of the function states kept in
  %%% the scanner state <code>S</code>.
  hook_state(#xmerl_scanner{fun_states = #xmerl_fun_states{} = FunStates}) ->
      FunStates#xmerl_fun_states.hook.
  %%% @spec rules_state(S::global_state()) -> term()
  %%% @doc Returns the <code>rules</code> entry of the function states kept in
  %%% the scanner state <code>S</code>.
  rules_state(#xmerl_scanner{fun_states = #xmerl_fun_states{} = FunStates}) ->
      FunStates#xmerl_fun_states.rules.
  %%% @spec fetch_state(S::global_state()) -> term()
  %%% @doc Returns the <code>fetch</code> entry of the function states kept in
  %%% the scanner state <code>S</code>.
  fetch_state(#xmerl_scanner{fun_states = #xmerl_fun_states{} = FunStates}) ->
      FunStates#xmerl_fun_states.fetch.
  %%% @spec cont_state(S::global_state()) -> term()
  %%% @doc Returns the <code>cont</code> entry of the function states kept in
  %%% the scanner state <code>S</code>.
  cont_state(#xmerl_scanner{fun_states = #xmerl_fun_states{} = FunStates}) ->
      FunStates#xmerl_fun_states.cont.
  157. %%%% Functions to modify the various states
  %%% @spec user_state(UserState, S::global_state()) -> global_state()
  %%% @doc For controlling the UserState, to be used in a user function.
  %%% Returns the scanner state with its <code>user_state</code> field replaced.
  %%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
  user_state(NewUserState, Scanner) ->
      Scanner#xmerl_scanner{user_state = NewUserState}.
  %%% @spec event_state(EventState, S::global_state()) -> global_state()
  %%% @doc For controlling the EventState, to be used in an event
  %%% function, and called at the beginning and at the end of a parsed entity.
  %%% Returns the scanner state with the <code>event</code> function state
  %%% replaced.
  %%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
  event_state(EventState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
      Scanner#xmerl_scanner{
        fun_states = FunStates#xmerl_fun_states{event = EventState}}.
  %%% @spec hook_state(HookState, S::global_state()) -> global_state()
  %%% @doc For controlling the HookState, to be used in a hook
  %%% function, and called when the parser has parsed a complete entity.
  %%% Returns the scanner state with the <code>hook</code> function state
  %%% replaced.
  %%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
  hook_state(HookState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
      Scanner#xmerl_scanner{
        fun_states = FunStates#xmerl_fun_states{hook = HookState}}.
  %%% @spec rules_state(RulesState, S::global_state()) -> global_state()
  %%% @doc For controlling the RulesState, to be used in a rules
  %%% function, and called when the parser stores scanner information in a
  %%% rules database.
  %%% Returns the scanner state with the <code>rules</code> function state
  %%% replaced.
  %%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
  rules_state(RulesState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
      Scanner#xmerl_scanner{
        fun_states = FunStates#xmerl_fun_states{rules = RulesState}}.
  %%% @spec fetch_state(FetchState, S::global_state()) -> global_state()
  %%% @doc For controlling the FetchState, to be used in a fetch
  %%% function, and called when the parser fetches an external resource
  %%% (eg. a DTD).
  %%% Returns the scanner state with the <code>fetch</code> function state
  %%% replaced.
  %%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
  fetch_state(FetchState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
      Scanner#xmerl_scanner{
        fun_states = FunStates#xmerl_fun_states{fetch = FetchState}}.
  %%% @spec cont_state(ContinuationState, S::global_state()) -> global_state()
  %%% @doc For controlling the ContinuationState, to be used in a continuation
  %%% function, and called when the parser encounters the end of the byte stream.
  %%% Returns the scanner state with the <code>cont</code> function state
  %%% replaced.
  %%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
  cont_state(ContState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
      Scanner#xmerl_scanner{
        fun_states = FunStates#xmerl_fun_states{cont = ContState}}.
  %% @spec file(Filename::string()) -> {xmlElement(),Rest}
  %%   Rest = list()
  %% @equiv file(Filename, [])
  file(Filename) ->
      NoOptions = [],
      file(Filename, NoOptions).
  %% @spec file(Filename::string(), Options::option_list()) -> {xmlElement(),Rest}
  %%   Rest = list()
  %% @doc Parse file containing an XML document.
  %% On success the configured close function is called with the final
  %% scanner state and {Result, Rest} is returned. An {error, Reason} result
  %% from reading or scanning is passed through unchanged; any other result
  %% is wrapped as {error, Other}.
  file(F, Options) ->
      %% The value of an {encoding, Enc} option, or undefined when absent.
      %% It is handed to int_file/3, which currently ignores it.
      ExtCharset =
          case lists:keysearch(encoding, 1, Options) of
              false ->
                  undefined;
              {value, {_, Enc}} ->
                  Enc
          end,
      case int_file(F, Options, ExtCharset) of
          {error, _} = Failure ->
              Failure;
          {Res, Tail, #xmerl_scanner{close_fun = Close} = FinalState} ->
              %% for side effects only - final state is dropped
              Close(FinalState),
              {Res, Tail};
          Unexpected ->
              {error, Unexpected}
      end.
  %% Reads the whole file F and scans its contents as a document, using the
  %% directory of F as XML base. Any non-{ok, _} result of file:read_file/1
  %% is returned as is. The third argument is not used.
  int_file(F, Options, _ExtCharset) ->
      case file:read_file(F) of
          {ok, Contents} ->
              Text = binary_to_list(Contents),
              BaseDir = filename:dirname(F),
              int_string(Text, Options, BaseDir);
          ReadFailure ->
              ReadFailure
      end.
  %% Reads the whole file F and passes its contents to int_string_decl/3,
  %% using the directory of F as XML base. Any non-{ok, _} result of
  %% file:read_file/1 is returned as is. The third argument is not used.
  int_file_decl(F, Options, _ExtCharset) ->
      case file:read_file(F) of
          {ok, Contents} ->
              Text = binary_to_list(Contents),
              BaseDir = filename:dirname(F),
              int_string_decl(Text, Options, BaseDir);
          ReadFailure ->
              ReadFailure
      end.
  %% @spec string(Text::list()) -> {xmlElement(),Rest}
  %%   Rest = list()
  %% @equiv string(Text, [])
  string(Text) ->
      NoOptions = [],
      string(Text, NoOptions).
  %% @spec string(Text::list(),Options::option_list()) -> {xmlElement(),Rest}
  %%   Rest = list()
  %% @doc Parse string containing an XML document.
  %% On success the configured close function is called with the final
  %% scanner state and {Result, Rest} is returned. Any result of another
  %% shape is wrapped as {error, Other}.
  string(Str, Options) ->
      case int_string(Str, Options) of
          {error, _} = Failure ->
              %% (This can't happen, currently)
              Failure;
          {Res, Tail, #xmerl_scanner{close_fun = Close} = FinalState} ->
              %% for side effects only - final state is dropped
              Close(FinalState),
              {Res, Tail};
          Unexpected ->
              {error, Unexpected}
      end.
  %% Scans Str as a document with the current working directory as XML base.
  %% Crashes with badmatch if the working directory cannot be determined.
  int_string(Str, Options) ->
      {ok, Cwd} = file:get_cwd(),
      int_string(Str, Options, Cwd).
  %% Builds the initial scanner state, lets xmerl_lib:detect_charset/2 inspect
  %% the input, records the reported character set (as a string) in the
  %% scanner state and scans the document.
  int_string(Str, Options, XMLBase) ->
      S = initial_state0(Options, XMLBase),
      case xmerl_lib:detect_charset(S#xmerl_scanner.encoding, Str) of
          {undefined, undefined, Str2} ->
              %% nothing reported: scan with the state unchanged
              scan_document(Str2, S);
          {auto, 'iso-10646-utf-1', Str2} ->
              scan_document(Str2, S#xmerl_scanner{encoding = "iso-10646-utf-1"});
          {external, ExtCharset, Str2} ->
              %% also covers an external 'iso-10646-utf-1', whose name
              %% converts to the same string
              Encoding = atom_to_list(ExtCharset),
              scan_document(Str2, S#xmerl_scanner{encoding = Encoding})
      end.
  %% Same set-up as int_string/3, but hands the input to scan_decl/2 instead
  %% of scan_document/2.
  int_string_decl(Str, Options, XMLBase) ->
      S = initial_state0(Options, XMLBase),
      case xmerl_lib:detect_charset(S#xmerl_scanner.encoding, Str) of
          {undefined, undefined, Str2} ->
              %% nothing reported: scan with the state unchanged
              scan_decl(Str2, S);
          {auto, 'iso-10646-utf-1', Str2} ->
              scan_decl(Str2, S#xmerl_scanner{encoding = "iso-10646-utf-1"});
          {external, ExtCharset, Str2} ->
              %% also covers an external 'iso-10646-utf-1', whose name
              %% converts to the same string
              Encoding = atom_to_list(ExtCharset),
              scan_decl(Str2, S#xmerl_scanner{encoding = Encoding})
      end.
  %% Creates a scanner state populated with this module's default callback
  %% functions and the given XML base, then applies the user options to it.
  initial_state0(Options, XMLBase) ->
      Defaults = #xmerl_scanner{
                    event_fun = fun event/2,
                    hook_fun = fun hook/2,
                    acc_fun = fun acc/3,
                    fetch_fun = fun fetch/2,
                    close_fun = fun close/1,
                    continuation_fun = fun cont/3,
                    rules_read_fun = fun rules_read/3,
                    rules_write_fun = fun rules_write/4,
                    rules_delete_fun = fun rules_delete/3,
                    xmlbase = XMLBase
                   },
      initial_state(Options, Defaults).
  %% Applies the options, in list order, to the scanner state S. Once the
  %% list is exhausted, a new public ETS set is created for the rules if no
  %% rules storage has been configured. An option that is not recognised
  %% crashes with function_clause.
  initial_state([Option | Rest], S) ->
      initial_state(Rest, initial_state_option(Option, S));
  initial_state([], S = #xmerl_scanner{rules = undefined}) ->
      Tab = ets:new(rules, [set, public]),
      S#xmerl_scanner{rules = Tab};
  initial_state([], S) ->
      S.

  %% Applies one option tuple to the scanner state.
  initial_state_option({event_fun, F}, S) ->
      S#xmerl_scanner{event_fun = F};
  initial_state_option({event_fun, F, ES}, S) ->
      event_state(ES, S#xmerl_scanner{event_fun = F});
  initial_state_option({acc_fun, F}, S) ->
      S#xmerl_scanner{acc_fun = F};
  initial_state_option({hook_fun, F}, S) ->
      S#xmerl_scanner{hook_fun = F};
  initial_state_option({hook_fun, F, HS}, S) ->
      hook_state(HS, S#xmerl_scanner{hook_fun = F});
  initial_state_option({close_fun, F}, S) ->
      S#xmerl_scanner{close_fun = F};
  initial_state_option({fetch_fun, F}, S) ->
      S#xmerl_scanner{fetch_fun = F};
  initial_state_option({fetch_fun, F, FS}, S) ->
      fetch_state(FS, S#xmerl_scanner{fetch_fun = F});
  initial_state_option({fetch_path, P}, S) ->
      S#xmerl_scanner{fetch_path = P};
  initial_state_option({continuation_fun, F}, S) ->
      S#xmerl_scanner{continuation_fun = F};
  initial_state_option({continuation_fun, F, CS}, S) ->
      cont_state(CS, S#xmerl_scanner{continuation_fun = F});
  initial_state_option({rules, R}, S) ->
      %% user-supplied rules storage is kept after scanning
      S#xmerl_scanner{rules = R,
                      keep_rules = true};
  initial_state_option({rules, Read, Write, RS}, S) ->
      rules_state(RS, S#xmerl_scanner{rules_read_fun = Read,
                                      rules_write_fun = Write,
                                      keep_rules = true});
  initial_state_option({user_state, F}, S) ->
      S#xmerl_scanner{user_state = F};
  initial_state_option({space, L}, S) ->
      S#xmerl_scanner{space = L};
  initial_state_option({line, L}, S) ->
      S#xmerl_scanner{line = L};
  initial_state_option({namespace_conformant, F}, S) when F==true; F==false ->
      S#xmerl_scanner{namespace_conformant = F};
  initial_state_option({validation, F}, S) when F==true; F==false ->
      S#xmerl_scanner{validation = F};
  initial_state_option({quiet, F}, S) when F==true; F==false ->
      S#xmerl_scanner{quiet = F};
  initial_state_option({doctype_DTD, DTD}, S) ->
      S#xmerl_scanner{doctype_DTD = DTD};
  initial_state_option({text_decl, Bool}, S) ->
      S#xmerl_scanner{text_decl = Bool};
  initial_state_option({environment, Env}, S) ->
      S#xmerl_scanner{environment = Env};
  initial_state_option({xmlbase, D}, S) ->
      S#xmerl_scanner{xmlbase = D};
  initial_state_option({encoding, Enc}, S) ->
      S#xmerl_scanner{encoding = Enc}.
  356. %%% -----------------------------------------------------
  357. %%% Default modifier functions
  %%% Hooks:
  %%% - {element, Line, Name, Attrs, Content}
  %%% - {processing_instruction, Line, Data}
  %% Default hook function: hands the entity back untouched together with
  %% the unchanged state.
  hook(Entity, GlobalState) ->
      {Entity, GlobalState}.
  %%% Events:
  %%%
  %%% #xmerl_event{event : started | ended,
  %%%              line : integer(),
  %%%              col : integer(),
  %%%              data}
  %%%
  %%% Data           Events
  %%% document       started, ended
  %%% #xmlElement    started, ended
  %%% #xmlAttribute  ended
  %%% #xmlPI         ended
  %%% #xmlComment    ended
  %%% #xmlText       ended
  %% Default event function: ignores the event and returns the state as is.
  event(_Event, GlobalState) ->
      GlobalState.
  %% The acc/3 function can return either {Acc', S'} or {Acc', Pos', S'},
  %% where Pos' can be derived from X#xmlElement.pos, X#xmlText.pos, or
  %% X#xmlAttribute.pos (whichever is the current object type.)
  %% The acc/3 function is not allowed to redefine the type of object
  %% being defined, but _is_ allowed to either ignore it or split it
  %% into multiple objects (in which case {Acc',Pos',S'} should be returned.)
  %% If {Acc',S'} is returned, Pos will be incremented by 1 by default.
  %% Below is an example of an acceptable operation: the value of a text
  %% object is flattened before the object is pushed onto the accumulator;
  %% every other object is pushed unchanged.
  acc(Object, Acc, S) ->
      Item = case Object of
                 #xmlText{value = Text} ->
                     Object#xmlText{value = lists:flatten(Text)};
                 _ ->
                     Object
             end,
      {[Item | Acc], S}.
  %% Default fetch function. Both system and public identifiers are resolved
  %% through their URI; the public id itself is not used.
  fetch({system, SystemURI}, Scanner) ->
      fetch_URI(SystemURI, Scanner);
  fetch({public, _PublicID, SystemURI}, Scanner) ->
      fetch_URI(SystemURI, Scanner).
  %%% Always assume an external resource can be found locally! Thus
  %%% don't bother fetching with e.g. HTTP. Returns the path where the
  %%% resource is found. The path to the external resource is given by
  %%% URI directly or the option fetch_path (additional paths) or
  %%% directory (base path to external resource)
  %% Returns {ok, Path, S}, where Path is whatever path_locate/3 yields.
  fetch_URI(URI, S) ->
      %% assume URI is a filename
      Split = filename:split(URI),
      %% last path component, or [] when the URI has no components
      Filename = case Split of
                     [] -> [];
                     _ -> lists:last(Split)
                 end,
      Fullname =
          case Split of %% how about Windows systems?
              [] ->
                  %% empty systemliteral
                  [];
              ["file:" | Name] ->
                  %% absolute path, see RFC2396 sect 3
                  %% file:/dtd_name
                  filename:join(["/" | Name]);
              ["/" | Rest] when Rest /= [] ->
                  %% absolute path name
                  URI;
              ["http:" | _Rest] ->
                  {http, URI};
              _ ->
                  %% anything else is taken relative to the XML base
                  filename:join(S#xmerl_scanner.xmlbase, URI)
          end,
      Path = path_locate(S#xmerl_scanner.fetch_path, Filename, Fullname),
      ?dbg("fetch(~p) -> {file, ~p}.~n", [URI, Path]),
      {ok, Path, S}.
  %% Chooses the location of an external resource.
  %% An {http, _} full name and an empty full name are returned unchanged.
  %% Otherwise each directory is tried in order; the first one holding a
  %% regular file called FN gives {file, Dir/FN}. When no directory matches,
  %% {file, FullName} is returned.
  path_locate(_Dirs, _FN, {http, _} = HttpURI) ->
      HttpURI;
  path_locate(_Dirs, _FN, []) ->
      [];
  path_locate([], _FN, FullName) ->
      {file, FullName};
  path_locate([Dir | Dirs], FN, FullName) ->
      Candidate = filename:join(Dir, FN),
      case file:read_file_info(Candidate) of
          {ok, #file_info{type = regular}} ->
              {file, Candidate};
          _ ->
              path_locate(Dirs, FN, FullName)
      end.
  %% Default continuation function: never asks for more input; it simply
  %% invokes the second (end-of-input) fun on the given state.
  cont(_Continue, AtEof, State) ->
      AtEof(State).
  %% Default close function: returns the state unchanged.
  close(FinalState) ->
      FinalState.
  440. %%% -----------------------------------------------------
  441. %%% Scanner
  %%% [1] document ::= prolog element Misc*
  %% Emits the document started event, scans prolog, root element and
  %% trailing Misc, emits the document ended event and returns
  %% {RootElement, Tail, FinalState}. When validation is on and the
  %% environment is element or prolog, the result is also run through
  %% xmerl_validate:validate/2, and a failure is reported via ?fatal.
  scan_document(Str0, S=#xmerl_scanner{event_fun = Event,
                                       line = L, col = C,
                                       environment=Env,
                                       encoding=Charset,
                                       validation=ValidateResult}) ->
      Started = #xmerl_event{event = started,
                             line = L,
                             col = C,
                             data = document},
      S1 = Event(Started, S),
      %% Transform to given character set.
      %% Note that if another character set is given in the encoding
      %% attribute in a XML declaration that one will be used later
      Str = case Charset of
                undefined ->
                    %% Default character set is UTF-8
                    Str0;
                _ ->
                    ucs:to_unicode(Str0, list_to_atom(Charset))
            end,
      StartPos = 1,
      %% the prolog must leave the input at the "<" of the root element
      {"<" ++ T2, S2} = scan_prolog(Str, S1, StartPos),
      {Res, T3, S3} = scan_element(T2, S2, StartPos),
      {Tail, S4} = scan_misc(T3, S3, StartPos),
      Ended = #xmerl_event{event = ended,
                           line = S4#xmerl_scanner.line,
                           col = S4#xmerl_scanner.col,
                           data = document},
      S5 = #xmerl_scanner{} = Event(Ended, S4),
      S6 = case ValidateResult of
               false ->
                   cleanup(S5);
               true when Env == element; Env == prolog ->
                   check_decl2(S5),
                   %% every kind of validation failure is handled alike:
                   %% clean up first, then report
                   Fail = fun(Why) ->
                                  S5b = cleanup(S5),
                                  ?fatal({failed_validation,Why}, S5b)
                          end,
                   case xmerl_validate:validate(S5, Res) of
                       {'EXIT', {error, Reason}} ->
                           Fail(Reason);
                       {'EXIT', Reason} ->
                           Fail(Reason);
                       {error, Reason} ->
                           Fail(Reason);
                       {error, Reason, _Next} ->
                           Fail(Reason);
                       _XML ->
                           cleanup(S5)
                   end;
               true ->
                   cleanup(S5)
           end,
      {Res, Tail, S6}.
  %% Emits the document started event and scans only the prolog of Str.
  %% Returns a triple whose first element depends on what follows the prolog:
  %%  - input starting with "<": {UserState, RemainingInput}
  %%  - nothing left: []
  %%  - anything else: that input, after it has been run through
  %%    scan_content/8 for its effect on the scanner state.
  scan_decl(Str, S=#xmerl_scanner{event_fun = Event,
                                  line = L, col = C}) ->
      Started = #xmerl_event{event = started,
                             line = L,
                             col = C,
                             data = document},
      S1 = Event(Started, S),
      case scan_prolog(Str, S1, 1) of
          {[], S2} ->
              {[], [], S2};
          {"<" ++ _ = T2, S2} ->
              {{S2#xmerl_scanner.user_state, T2}, [], S2};
          {T2, S2} ->
              Space = S2#xmerl_scanner.space,
              %% no attributes, language or parents at this level
              {_, _, S3} = scan_content(T2, S2, [], [], Space,
                                        [], [], #xmlNamespace{}),
              {T2, [], S3}
      end.
  %%% [22] Prolog
  %%% prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
  %%%
  %% scan_prolog(Input, ScannerState, Pos) -> {RemainingInput, ScannerState}
  %% Handles an optional XML (or text) declaration, converting the remaining
  %% input to the declared character set when needed, then a DOCTYPE
  %% declaration and/or Misc items. On empty input the continuation function
  %% is asked for more data.
  %% Text declaration may be empty
  scan_prolog([], S=#xmerl_scanner{text_decl=true},_Pos) ->
      {[],S};
  scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
      ?dbg("cont()...~n", []),
      F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos) end,
        fun(S1) -> {[], S1} end,
        S);
  scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos)
    when ?whitespace(hd(T)) ->
      {Charset,T3, S3}=
          if
              Col==1,L==1,S0#xmerl_scanner.text_decl==true ->
                  ?dbg("prolog(\"<?xml\")~n", []),
                  ?bump_col(5),
                  %% Continue from S (the state advanced past "<?xml"),
                  %% not S0, so the column bump is not thrown away.
                  {_,T1,S1} = mandatory_strip(T,S),
                  {Decl,T2, S2}=scan_text_decl(T1,S1),
                  Encoding=Decl#xmlDecl.encoding,
                  {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}};
              Col==1,L==1 ->
                  ?dbg("prolog(\"<?xml\")~n", []),
                  ?bump_col(5),
                  %% Same here: scan the declaration from the bumped state.
                  {Decl,T2, S2}=scan_xml_decl(T, S),
                  Encoding=Decl#xmlDecl.encoding,
                  {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}};
              true ->
                  %% the declaration is only allowed at line 1, column 1
                  ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S0)
          end,
      %% Now transform to declared character set.
      if
          Charset==Charset0 -> % Document already transformed to this charset!
              scan_prolog(T3, S3, Pos);
          Charset0=/=undefined -> % Document transformed to other bad charset!
              ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S3);
          Charset=/=undefined -> % Document not previously transformed
              T4=ucs:to_unicode(T3,list_to_atom(Charset)),
              scan_prolog(T4, S3, Pos);
          true -> % No encoding info given
              scan_prolog(T3, S3, Pos)
      end;
  scan_prolog("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog,
                                                  encoding=Charset}, Pos) ->
      ?dbg("prolog(\"<!DOCTYPE\")~n", []),
      ?bump_col(9),
      %% If no known character set assume it is UTF-8
      T1=if
             Charset==undefined -> ucs:to_unicode(T,'utf-8');
             true -> T
         end,
      {T2, S1} = scan_doctype(T1, S),
      scan_misc(T2, S1, Pos);
  scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=Charset},Pos) ->
      ?dbg("prolog(\"<\")~n", []),
      %% Check for Comments, PI before possible DOCTYPE declaration
      ?bump_col(1),
      %% If no known character set assume it is UTF-8
      T=if
            Charset==undefined -> ucs:to_unicode(Str,'utf-8');
            true -> Str
        end,
      {T1, S1}=scan_misc(T, S, Pos),
      scan_prolog2(T1,S1,Pos).
  %% Second stage of prolog scanning, entered after the leading Misc items
  %% have been consumed: handles a DOCTYPE declaration, jumps to the external
  %% subset scanner on other "<!" markup, and otherwise scans further Misc.
  scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos) end,
      AtEof = fun(S1) -> {[], S1} end,
      F(Resume, AtEof, S);
  scan_prolog2("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog}, Pos) ->
      ?dbg("prolog(\"<!DOCTYPE\")~n", []),
      ?bump_col(9),
      {AfterDoctype, DoctypeState} = scan_doctype(T, S),
      scan_misc(AfterDoctype, DoctypeState, Pos);
  scan_prolog2("<!" ++ _ = Str, S, _Pos) ->
      ?dbg("prolog(\"<!\")~n", []),
      %% In e.g. a DTD, we jump directly to markup declarations
      scan_ext_subset(Str, S);
  scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US}, Pos) ->
      ?dbg("prolog(\"<\")~n", []),
      %% Check for more Comments and PI after DOCTYPE declaration
      ?bump_col(1),
      scan_misc(Str, S, Pos).
  %%% [27] Misc ::= Comment | PI | S
  %% Note:
  %% - Neither of Comment and PI are returned in the resulting parsed
  %%   structure.
  %% - scan_misc/3 implements Misc* as that is how the rule is always used
  %% Returns {RemainingInput, ScannerState} at the first input that is not a
  %% comment, a processing instruction or whitespace.
  scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos) end,
      AtEof = fun(S1) -> {[], S1} end,
      F(Resume, AtEof, S);
  scan_misc("<!--" ++ T, S, Pos) -> % Comment
      %% no parents and no language at this level
      {_Comment, Rest, AfterComment} = scan_comment(T, S, Pos, [], []),
      scan_misc(Rest, AfterComment, Pos);
  scan_misc("<?" ++ T, S0, Pos) -> % PI
      ?dbg("prolog(\"<?\")~n", []),
      ?bump_col(2),
      {_PI, Rest, AfterPI} = scan_pi(T, S, Pos),
      scan_misc(Rest, AfterPI, Pos);
  scan_misc([H|Rest], S, Pos) when ?whitespace(H) ->
      ?dbg("prolog(whitespace)~n", []),
      scan_misc(Rest, S, Pos);
  scan_misc(Input, S, _Pos) ->
      {Input, S}.
  %% Deletes the rules ETS table and clears the rules field, but only when
  %% keep_rules is false; any other state is returned untouched.
  cleanup(#xmerl_scanner{keep_rules = false, rules = RulesTab} = Scanner) ->
      ets:delete(RulesTab),
      Scanner#xmerl_scanner{rules = undefined};
  cleanup(Scanner) ->
      Scanner.
  %%% Prolog and Document Type Declaration XML 1.0 Section 2.8
  %% [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
  %% [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
  %% Scans the mandatory version information and continues with the optional
  %% parts in scan_xml_decl/3. Crashes with badmatch when the "version"
  %% keyword is missing.
  scan_xml_decl(T, S) ->
      %% VersionInfo [24] is mandatory
      {_, AfterSpace, SpaceState} = mandatory_strip(T, S),
      "version" ++ AfterKeyword = AfterSpace,
      {AfterEq, EqState} = scan_eq(AfterKeyword, SpaceState),
      {Vsn, AfterVsn, VsnState} = scan_xml_vsn(AfterEq, EqState),
      VersionAttr = #xmlAttribute{name = version,
                                  parents = [{xml, 1}],
                                  value = Vsn},
      scan_xml_decl(AfterVsn, VsnState, #xmlDecl{attributes = [VersionAttr]}).
  %% Continues after the version information: either the declaration ends
  %% here with "?>", or mandatory whitespace precedes the remaining parts,
  %% which are handled by scan_xml_decl2/3.
  scan_xml_decl([], S=#xmerl_scanner{continuation_fun = F}, Decl) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_xml_decl(MoreBytes, S1, Decl) end,
      AtEof = fun(S1) -> {[], [], S1} end,
      F(Resume, AtEof, S);
  scan_xml_decl("?>" ++ T, S0, Decl) ->
      ?bump_col(2),
      return_xml_decl(T, S, Decl);
  scan_xml_decl(T, S=#xmerl_scanner{}, Decl) ->
      {_, AfterSpace, SpaceState} = mandatory_strip(T, S),
      scan_xml_decl2(AfterSpace, SpaceState, Decl).
  %% Scans what may follow the version information: the end of the
  %% declaration, an encoding declaration, or (handed on to scan_xml_decl3/3)
  %% a standalone declaration.
  scan_xml_decl2("?>" ++ T, S0, Decl) ->
      ?bump_col(2),
      return_xml_decl(T, S, Decl);
  scan_xml_decl2("encoding" ++ T, S0 = #xmerl_scanner{event_fun = Event},
                 Decl0 = #xmlDecl{attributes = Attrs}) ->
      %% [80] EncodingDecl
      ?bump_col(8),
      {T1, S1} = scan_eq(T, S),
      {EncName, T2, S2} = scan_enc_name(T1, S1),
      %% the encoding name is stored in lower case
      LowEncName = httpd_util:to_lower(EncName),
      Attr = #xmlAttribute{name = encoding,
                           parents = [{xml, 1}],
                           value = LowEncName},
      Decl = Decl0#xmlDecl{encoding = LowEncName,
                           attributes = [Attr | Attrs]},
      %% the event carries the position at which the clause was entered
      Ended = #xmerl_event{event = ended,
                           line = S0#xmerl_scanner.line,
                           col = S0#xmerl_scanner.col,
                           data = Attr},
      S3 = #xmerl_scanner{} = Event(Ended, S2),
      {Next, NextState} =
          case T2 of
              "?>" ++ _ ->
                  {T2, S3};
              _ ->
                  %% anything but the end must be preceded by whitespace
                  {_, Stripped, StrippedState} = mandatory_strip(T2, S3),
                  {Stripped, StrippedState}
          end,
      scan_xml_decl3(Next, NextState, Decl);
  scan_xml_decl2("standalone" ++ _ = T, S, Decl) ->
      scan_xml_decl3(T, S, Decl).
  %% Scans the tail of the XML declaration: either its end, or a standalone
  %% declaration that must be followed (after optional whitespace) by "?>".
  scan_xml_decl3("?>" ++ T, S0, Decl) ->
      ?bump_col(2),
      return_xml_decl(T, S, Decl);
  scan_xml_decl3("standalone" ++ T, S0 = #xmerl_scanner{event_fun = Event},
                 Decl0 = #xmlDecl{attributes = Attrs}) ->
      %% [32] SDDecl
      ?bump_col(10),
      {T1, S1} = scan_eq(T, S),
      {StValue, T2, S2} = scan_standalone_value(T1, S1),
      Attr = #xmlAttribute{name = standalone,
                           parents = [{xml, 1}],
                           value = StValue},
      Decl = Decl0#xmlDecl{standalone = StValue,
                           attributes = [Attr | Attrs]},
      %% the event carries the position at which the clause was entered
      Ended = #xmerl_event{event = ended,
                           line = S0#xmerl_scanner.line,
                           col = S0#xmerl_scanner.col,
                           data = Attr},
      S3 = #xmerl_scanner{} = Event(Ended, S2),
      {_, T3, S4} = strip(T2, S3),
      %% the declaration has to end here; anything else is a badmatch
      "?>" ++ T4 = T3,
      ClosedCol = S4#xmerl_scanner.col + 2,
      return_xml_decl(T4, S4#xmerl_scanner{col = ClosedCol}, Decl).
  %% Finishes an XML declaration: puts the collected attributes back into
  %% document order, emits the ended event for the declaration and passes it
  %% through the hook function. Returns {HookResult, RemainingInput, State}.
  return_xml_decl(T, S=#xmerl_scanner{hook_fun = Hook,
                                      event_fun = Event},
                  Decl0 = #xmlDecl{attributes = Attrs}) ->
      %% NOTE(review): ?strip1 comes from xmerl.hrl; it evidently binds T1 and
      %% S1 from T and S (presumably by stripping whitespace) - confirm there.
      ?strip1,
      Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
      Ended = #xmerl_event{event = ended,
                           line = S#xmerl_scanner.line,
                           col = S#xmerl_scanner.col,
                           data = Decl},
      S2 = #xmerl_scanner{} = Event(Ended, S1),
      {HookResult, S3} = Hook(Decl, S2),
      {HookResult, T1, S3}.
  %% Scans the quoted value of a standalone declaration. Accepts yes or no in
  %% single or double quotes and returns {Value, RemainingInput, State}; a
  %% yes value is also recorded in the scanner state. Any other input crashes
  %% with function_clause.
  scan_standalone_value([$', $y, $e, $s, $' | T], S0) ->
      ?bump_col(5),
      {yes, T, S#xmerl_scanner{standalone = yes}};
  scan_standalone_value([$", $y, $e, $s, $" | T], S0) ->
      ?bump_col(5),
      {yes, T, S#xmerl_scanner{standalone = yes}};
  scan_standalone_value([$', $n, $o, $' | T], S0) ->
      ?bump_col(4),
      {no, T, S};
  scan_standalone_value([$", $n, $o, $" | T], S0) ->
      ?bump_col(4),
      {no, T, S}.
  %%%
  %%% Text declaration XML 1.0 section 4.3.1
  %%% [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
  %% Scans the optional version information and the mandatory encoding
  %% declaration, then finishes in scan_text_decl/3. Crashes with badmatch
  %% when the "encoding" keyword is missing.
  scan_text_decl(T, S=#xmerl_scanner{event_fun = Event}) ->
      {#xmlDecl{attributes = Attrs} = Decl0, T1, S1} = scan_optional_version(T, S),
      "encoding" ++ T2 = T1,
      %% step over the eight characters of the keyword
      KeywordCol = S1#xmerl_scanner.col + 8,
      S2 = S1#xmerl_scanner{col = KeywordCol},
      {T3, S3} = scan_eq(T2, S2),
      {EncName, T4, S4} = scan_enc_name(T3, S3),
      %% the encoding name is stored in lower case
      LowEncName = httpd_util:to_lower(EncName),
      %% NOTE(review): ?strip5 comes from xmerl.hrl; it evidently binds T5 and
      %% S5 from T4 and S4 (presumably by stripping whitespace) - confirm there.
      ?strip5,
      Attr = #xmlAttribute{name = encoding,
                           parents = [{xml, 1}],
                           value = LowEncName},
      Decl = Decl0#xmlDecl{encoding = LowEncName,
                           attributes = [Attr | Attrs]},
      Ended = #xmerl_event{event = ended,
                           line = S5#xmerl_scanner.line,
                           col = S5#xmerl_scanner.col,
                           data = Attr},
      S6 = #xmerl_scanner{} = Event(Ended, S5),
      scan_text_decl(T5, S6, Decl).
  %% Finishes a text declaration: the input must start with "?>".
  %%
  %% Reverses the accumulated attribute list, passes an `ended' event (stamped
  %% with the line/column of the scanner as it was on entry) to the event
  %% fun, then calls the hook fun with the declaration.
  %% Returns {HookResult, RemainingInput, Scanner}.
  %%
  %% NOTE: ?bump_col/1 and ?strip1 are macros defined outside this view; they
  %% are relied on to bind S, and T1/S1, respectively.
  scan_text_decl([$?, $> | T],
                 #xmerl_scanner{hook_fun = HookFun,
                                event_fun = EventFun,
                                line = Line,
                                col = Col} = S0,
                 #xmlDecl{attributes = RevAttrs} = Decl0) ->
      ?bump_col(2),
      ?strip1,
      Decl = Decl0#xmlDecl{attributes = lists:reverse(RevAttrs)},
      EndEvent = #xmerl_event{event = ended,
                              line = Line,
                              col = Col,
                              data = Decl},
      #xmerl_scanner{} = S2 = EventFun(EndEvent, S1),
      {Ret, S3} = HookFun(Decl, S2),
      {Ret, T1, S3}.
  %% Scans an optional `version' pseudo-attribute.
  %%
  %% When the input starts with "version", the value is scanned, mandatory
  %% whitespace after it is consumed, and an #xmlDecl{} holding a single
  %% version attribute is returned. Otherwise an #xmlDecl{} with no
  %% attributes is returned and the input and scanner are left unchanged.
  %% Returns {Decl, RemainingInput, Scanner}.
  %%
  %% NOTE: ?bump_col/1 and ?strip1 are macros defined outside this view; they
  %% are relied on to bind S, and T1/S1, respectively.
  scan_optional_version("version" ++ T, S0) ->
      ?bump_col(7),
      ?strip1,
      {T2, S2} = scan_eq(T1, S1),
      {Vsn, T3, S3} = scan_xml_vsn(T2, S2),
      {_, T4, S4} = mandatory_strip(T3, S3),
      VsnAttr = #xmlAttribute{name = version,
                              parents = [{xml, 1}],
                              value = Vsn},
      {#xmlDecl{attributes = [VsnAttr]}, T4, S4};
  scan_optional_version(T, S) ->
      {#xmlDecl{attributes = []}, T, S}.
  764. %%%%%%% [81] EncName
  %% Scans a quoted encoding name (XML 1.0 production [81] EncName).
  %%
  %% The first character must be the opening quote (" or '); it becomes the
  %% delimiter that scan_enc_name/4 and scan_enc_name2/4 look for. On empty
  %% input the continuation fun is asked for more bytes.
  %%
  %% The original guard was `H >= $"; H =< $''. Because `;' separates
  %% alternative guards, that is true for every character, so any character
  %% was accepted as the opening quote. The guard now tests for the two quote
  %% characters, and anything else is reported with the same fatal reason
  %% that scan_enc_name/4 uses for an illegal character.
  %%
  %% NOTE: ?bump_col/1 is a macro defined outside this view; it is relied on
  %% to bind S from S0.
  scan_enc_name([], #xmerl_scanner{continuation_fun = ContFun} = S) ->
      ?dbg("cont()...~n", []),
      ContFun(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1) end,
              fun(S1) -> ?fatal(expected_encoding_name, S1) end,
              S);
  scan_enc_name([H | T], S0) when H == $"; H == $' ->
      ?bump_col(1),
      scan_enc_name(T, S, H, []);
  scan_enc_name([H | _T], S) ->
      ?fatal({error, {unexpected_character_in_Enc_Name, H}}, S).
  %% Scans the first character of an encoding name, which must be an ASCII
  %% letter, then hands over to scan_enc_name2/4 for the rest.
  %%
  %% Delim is the quote character that opened the name and Acc the reversed
  %% characters collected so far. Any non-letter is a fatal error; on empty
  %% input the continuation fun is asked for more bytes.
  %%
  %% NOTE: ?bump_col/1 is a macro defined outside this view; it is relied on
  %% to bind S from S0.
  scan_enc_name([], #xmerl_scanner{continuation_fun = ContFun} = S, Delim, Acc) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1, Delim, Acc) end,
      AtEof = fun(S1) -> ?fatal(expected_encoding_name, S1) end,
      ContFun(Resume, AtEof, S);
  scan_enc_name([H | T], S0, Delim, Acc)
    when H >= $a, H =< $z;
         H >= $A, H =< $Z ->
      ?bump_col(1),
      scan_enc_name2(T, S, Delim, [H | Acc]);
  scan_enc_name([H | _T], S, _Delim, _Acc) ->
      ?fatal({error, {unexpected_character_in_Enc_Name, H}}, S).
  %% Scans the remainder of an encoding name up to the closing delimiter.
  %%
  %% Accepted characters are ASCII letters, digits, `.', `_' and `-'. When
  %% the delimiter that opened the name is met, the collected characters are
  %% returned in input order as {EncName, RemainingInput, Scanner}.
  %%
  %% Previously an illegal character had no matching clause and crashed with
  %% function_clause. It is now reported through ?fatal with the same reason
  %% that scan_enc_name/4 uses for an illegal first character.
  %%
  %% NOTE: ?bump_col/1 is a macro defined outside this view; it is relied on
  %% to bind S from S0.
  scan_enc_name2([], #xmerl_scanner{continuation_fun = ContFun} = S, Delim, Acc) ->
      ?dbg("cont()...~n", []),
      ContFun(fun(MoreBytes, S1) -> scan_enc_name2(MoreBytes, S1, Delim, Acc) end,
              fun(S1) -> ?fatal(expected_encoding_name, S1) end,
              S);
  scan_enc_name2([Delim | T], S0, Delim, Acc) ->
      %% Closing quote: the name is complete.
      ?bump_col(1),
      {lists:reverse(Acc), T, S};
  scan_enc_name2([H | T], S0, Delim, Acc)
    when H >= $a, H =< $z;
         H >= $A, H =< $Z;
         H >= $0, H =< $9;
         H == $.; H == $_; H == $- ->
      ?bump_col(1),
      scan_enc_name2(T, S, Delim, [H | Acc]);
  scan_enc_name2([H | _T], S, _Delim, _Acc) ->
      ?fatal({error, {unexpected_character_in_Enc_Name, H}}, S).
  806. %%%%%%% [26] VersionNum
  807. %%% VersionNum ::= ([a-zA-Z0-9_.:] | '-')+
  %% Scans a quoted version number (XML 1.0 production [26] VersionNum).
  %%
  %% The first character must be a quote (" or '); it is consumed, the column
  %% is advanced by one, and xml_vsn/4 collects the value up to the matching
  %% quote. On empty input the continuation fun is asked for more bytes.
  scan_xml_vsn([], #xmerl_scanner{continuation_fun = ContFun} = S) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_xml_vsn(MoreBytes, S1) end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  scan_xml_vsn([Quote | T], S) when Quote == $"; Quote == $' ->
      NextCol = S#xmerl_scanner.col + 1,
      xml_vsn(T, S#xmerl_scanner{col = NextCol}, Quote, []).
  %% Collects the characters of a version number up to the closing delimiter.
  %%
  %% Accepted characters are ASCII letters, digits and `_', `.', `:', `-'.
  %% Every consumed character, including the closing delimiter, advances the
  %% column by one. Any other character is a fatal error.
  %% Returns {Version, RemainingInput, Scanner}.
  xml_vsn([], #xmerl_scanner{continuation_fun = ContFun} = S, Delim, Acc) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> xml_vsn(MoreBytes, S1, Delim, Acc) end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  xml_vsn([Delim | T], #xmerl_scanner{col = Col} = S, Delim, Acc) ->
      {lists:reverse(Acc), T, S#xmerl_scanner{col = Col + 1}};
  xml_vsn([H | T], #xmerl_scanner{col = Col} = S, Delim, Acc)
    when H >= $a, H =< $z;
         H >= $A, H =< $Z;
         H >= $0, H =< $9;
         H =:= $_; H =:= $.; H =:= $:; H =:= $- ->
      xml_vsn(T, S#xmerl_scanner{col = Col + 1}, Delim, [H | Acc]);
  xml_vsn([H | _T], #xmerl_scanner{} = S, _Delim, _Acc) ->
      ?fatal({invalid_vsn_char, H}, S).
  835. %%%%%%% [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
  %% Scans the target of a processing instruction (production [16] PI).
  %%
  %% Targets whose first three characters spell "xml" in any letter case are
  %% reserved and are routed to scan_wellknown_pi/3. Every other target is
  %% read with scan_name/2 and the rest of the PI is handled by scan_pi/7,
  %% which receives the line and column at which the target started.
  scan_pi([], #xmerl_scanner{continuation_fun = ContFun} = S, Pos) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Pos) end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  scan_pi([H1, H2, H3 | T] = Str, #xmerl_scanner{line = L, col = C} = S, Pos)
    when H1 == $x; H1 == $X ->
      %% names beginning with [xX][mM][lL] are reserved for future use.
      IsReserved = ((H2 == $m) orelse (H2 == $M))
                   andalso ((H3 == $l) orelse (H3 == $L)),
      case IsReserved of
          true ->
              scan_wellknown_pi(T, S, Pos);
          false ->
              {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
              scan_pi(T1, S1, Target, L, C, Pos, [])
      end;
  scan_pi(Str, #xmerl_scanner{line = L, col = C} = S, Pos) ->
      {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
      scan_pi(T1, S1, Target, L, C, Pos, []).
  855. %%% More info on xml-stylesheet can be found at:
  856. %%% "Associating Style Sheets with XML documents", Version 1.0,
  857. %%% W3C Recommendation 29 June 1999 (http://www.w3.org/TR/xml-stylesheet/)
  %% Handles PI targets in the reserved "xml" name space.
  %%
  %% The input is what follows the three-letter "xml" prefix. Only
  %% "-stylesheet" is recognised; it continues in scan_pi/7 with the target
  %% "xml-stylesheet". Anything else is a fatal error that quotes up to the
  %% first ten characters of the offending input.
  %%
  %% NOTE: ?bump_col/1 is a macro defined outside this view; it is relied on
  %% to bind S from S0.
  scan_wellknown_pi("-stylesheet" ++ T,
                    #xmerl_scanner{line = L, col = C} = S0,
                    Pos) ->
      ?dbg("prolog(\"<?xml-stylesheet\")~n", []),
      ?bump_col(16),
      scan_pi(T, S, "xml-stylesheet", L, C, Pos, []);
  scan_wellknown_pi(Str, S, _Pos) ->
      ?fatal({invalid_target_name, lists:sublist(Str, 1, 10)}, S).
  864. % scan_pi(Str="?>"++_T,S,Target, L, C, Pos) ->
  865. % scan_pi(Str,S,Target, L, C, Pos,[]);
  866. % scan_pi(Str=[],S,Target, L, C, Pos) ->
  867. % scan_pi(Str,S,Target, L, C, Pos,[]);
  868. % scan_pi(T,S,Target, L, C, Pos) ->
  869. % {_,T1,S1} = mandatory_strip(T,S),
  870. % scan_pi(T1,S1,Target, L, C, Pos,[]).
  %% Continues a processing instruction right after its target.
  %%
  %% Either the PI ends immediately with "?>", or whitespace follows and the
  %% PI data is collected by scan_pi2/7. On "?>" an #xmlPI{} is built, an
  %% `ended' event stamped with L and C is passed to the event fun, and the
  %% hook fun is called. Returns {HookResult, RemainingInput, Scanner}.
  %%
  %% NOTE: ?bump_col/1 and ?strip1 are macros defined outside this view; they
  %% are relied on to bind S, and T1/S1, respectively.
  scan_pi([], #xmerl_scanner{continuation_fun = ContFun} = S,
          Target, L, C, Pos, Acc) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) ->
                       scan_pi(MoreBytes, S1, Target, L, C, Pos, Acc)
               end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  scan_pi([$?, $> | T],
          #xmerl_scanner{hook_fun = HookFun, event_fun = EventFun} = S0,
          Target, L, C, Pos, Acc) ->
      ?bump_col(2),
      PI = #xmlPI{name = Target,
                  pos = Pos,
                  value = lists:reverse(Acc)},
      EndEvent = #xmerl_event{event = ended,
                              line = L,
                              col = C,
                              data = PI},
      #xmerl_scanner{} = S1 = EventFun(EndEvent, S),
      {Ret, S2} = HookFun(PI, S1),
      {Ret, T, S2};
  scan_pi([H | T], S, Target, L, C, Pos, Acc) when ?whitespace(H) ->
      ?strip1,
      scan_pi2(T1, S1, Target, L, C, Pos, Acc).
  %% Collects the data part of a processing instruction until "?>".
  %%
  %% Each character is passed to wfc_legal_char/2 and prepended to Acc. On
  %% "?>" an #xmlPI{} carrying the collected data in input order is built, an
  %% `ended' event stamped with L and C is passed to the event fun, and the
  %% hook fun is called. Returns {HookResult, RemainingInput, Scanner}.
  %%
  %% NOTE: ?bump_col/1 is a macro defined outside this view; it is relied on
  %% to bind S from S0.
  scan_pi2([], #xmerl_scanner{continuation_fun = ContFun} = S,
           Target, L, C, Pos, Acc) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) ->
                       scan_pi2(MoreBytes, S1, Target, L, C, Pos, Acc)
               end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  scan_pi2([$?, $> | T],
           #xmerl_scanner{hook_fun = HookFun, event_fun = EventFun} = S0,
           Target, L, C, Pos, Acc) ->
      ?bump_col(2),
      PI = #xmlPI{name = Target,
                  pos = Pos,
                  value = lists:reverse(Acc)},
      EndEvent = #xmerl_event{event = ended,
                              line = L,
                              col = C,
                              data = PI},
      #xmerl_scanner{} = S1 = EventFun(EndEvent, S),
      {Ret, S2} = HookFun(PI, S1),
      {Ret, T, S2};
  scan_pi2([H | T], S0, Target, L, C, Pos, Acc) ->
      ?bump_col(1),
      %% Called for its checking effect only; the result is not used.
      wfc_legal_char(H, S),
      scan_pi2(T, S, Target, L, C, Pos, [H | Acc]).
  914. %% [28] doctypedecl ::=
  915. %% '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
  %% Scans the start of a document type declaration (production [28]).
  %%
  %% Mandatory whitespace is consumed and the document type name is read and
  %% stored in the scanner's `doctype_name' field. The optional external
  %% identifier is then handled by scan_doctype1/2.
  %%
  %% NOTE: ?strip3 is a macro defined outside this view; it is relied on to
  %% bind T3 and S3 (presumably from T2 and S2).
  scan_doctype([], #xmerl_scanner{continuation_fun = ContFun} = S) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_doctype(MoreBytes, S1) end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  scan_doctype(T, S) ->
      {_, T1, S1} = mandatory_strip(T, S),
      {DTName, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
      ?strip3,
      WithName = S3#xmerl_scanner{doctype_name = DTName},
      scan_doctype1(T3, WithName).
  926. %% [75] ExternalID ::= 'SYSTEM' S SystemLiteral
  927. %% | 'PUBLIC' S PubidLiteral S SystemLiteral
  %% Scans the optional ExternalID of a DOCTYPE declaration
  %% (production [75]).
  %%
  %% The DTD specification passed on to scan_doctype2/3 is one of
  %%   {public, PubidLiteral, SystemLiteral}  for a PUBLIC identifier,
  %%   {system, SystemLiteral}                for a SYSTEM identifier,
  %%   undefined                              when no external id is present.
  %%
  %% NOTE: ?bump_col/1, ?strip3 and ?strip5 are macros defined outside this
  %% view; they are relied on to bind S, T3/S3 and T5/S5 respectively.
  scan_doctype1([], #xmerl_scanner{continuation_fun = ContFun} = S) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_doctype1(MoreBytes, S1) end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  scan_doctype1("PUBLIC" ++ T, S0) ->
      ?bump_col(6),
      {_, T1, S1} = mandatory_strip(T, S),
      {PubId, T2, S2} = scan_pubid_literal(T1, S1),
      {_, T3, S3} = mandatory_strip(T2, S2),
      {SysId, T4, S4} = scan_system_literal(T3, S3),
      ?strip5,
      scan_doctype2(T5, S5, {public, PubId, SysId});
  scan_doctype1("SYSTEM" ++ T, S0) ->
      ?bump_col(6),
      {_, T1, S1} = mandatory_strip(T, S),
      {SysId, T2, S2} = scan_system_literal(T1, S1),
      ?strip3,
      scan_doctype2(T3, S3, {system, SysId});
  scan_doctype1(T, S) ->
      scan_doctype2(T, S, undefined).
  %% Continues a DOCTYPE declaration after the optional external identifier.
  %%
  %% "[" opens the internal subset, which scan_doctype3/3 handles. ">" ends
  %% the declaration: the external DTD (if any) is fetched with fetch_DTD/2,
  %% check_decl/1 is run on the resulting scanner, and
  %% {RemainingInput, Scanner} is returned.
  %%
  %% NOTE: ?bump_col/1 and ?strip1 are macros defined outside this view; they
  %% are relied on to bind S, and T1/S1, respectively.
  scan_doctype2([], #xmerl_scanner{continuation_fun = ContFun} = S, DTD) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_doctype2(MoreBytes, S1, DTD) end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  scan_doctype2([$[ | T], S0, DTD) ->
      ?bump_col(1),
      ?strip1,
      scan_doctype3(T1, S1, DTD);
  scan_doctype2([$> | T], S0, DTD) ->
      ?bump_col(1),
      ?strip1,
      S2 = fetch_DTD(DTD, S1),
      check_decl(S2),
      {T1, S2}.
  964. %% [28a] DeclSep ::= PEReference | S
  965. %% [28b] intSubset ::= (markupdecl | DeclSep)*
  %% Scans the internal DTD subset (productions [28a] DeclSep and
  %% [28b] intSubset).
  %%
  %% - "%" starts a parameter-entity reference. If it expands to an external
  %%   identifier tuple, that DTD is fetched with fetch_DTD/2; if it expands
  %%   to a list, the list is pushed back in front of the remaining input.
  %% - "]" closes the subset: the external DTD given in the DOCTYPE is
  %%   fetched, check_decl/1 is run, and the closing ">" is required.
  %%   Returns {RemainingInput, Scanner}.
  %% - Anything else is treated as a markup declaration.
  %%
  %% NOTE: ?bump_col/1, ?strip1 and ?strip2 are macros defined outside this
  %% view; they are relied on to bind S, T1/S1 and T2/S2 respectively.
  scan_doctype3([], #xmerl_scanner{continuation_fun = ContFun} = S, DTD) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_doctype3(MoreBytes, S1, DTD) end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  scan_doctype3("%" ++ T, S0, DTD) ->
      ?bump_col(1),
      {PERefName, T1, S1} = scan_pe_reference(T, S),
      ?strip2,
      %% Shared tail for the three external-identifier shapes below.
      FetchAndContinue =
          fun(ExtId) ->
                  Fetched = fetch_DTD(ExtId, S2),
                  scan_doctype3(T2, Fetched, DTD)
          end,
      case expand_pe_reference(PERefName, S2, as_PE) of
          {system, _} = Name ->
              FetchAndContinue(Name);
          {public, _} = Name ->
              FetchAndContinue(Name);
          {public, _, _} = Name ->
              FetchAndContinue(Name);
          ExpRef when list(ExpRef) -> % Space added, see Section 4.4.8
              {_, T3, S3} = strip(ExpRef ++ T2, S2),
              scan_doctype3(T3, S3, DTD)
      end;
  scan_doctype3("]" ++ T, S0, DTD) ->
      ?bump_col(1),
      ?strip1,
      S2 = fetch_DTD(DTD, S1),
      check_decl(S2),
      ">" ++ T2 = T1,
      {T2, S2};
  scan_doctype3(T, S, DTD) ->
      {_, T1, S1} = scan_markup_decl(T, S),
      scan_doctype3(T1, S1, DTD).
  %% Fetches and parses the external DTD described by the first argument.
  %%
  %% - `undefined' with a list in the scanner's `doctype_DTD' field: that
  %%   value is used as a system identifier.
  %% - `undefined' otherwise: nothing to fetch, the scanner is returned as is.
  %% - any other specification: delegated to fetch_and_parse/3 with the
  %%   text_decl and external-subset environment options.
  %% Returns a scanner record.
  fetch_DTD(undefined, #xmerl_scanner{doctype_DTD = URI} = S) when list(URI) ->
      %% allow to specify DTD name when it isn't available in xml stream
      fetch_DTD({system, URI}, S);
  fetch_DTD(undefined, S) ->
      S;
  fetch_DTD(DTDSpec, S) ->
      ParseOptions = [{text_decl, true},
                      {environment, {external, subset}}],
      case fetch_and_parse(DTDSpec, S, ParseOptions) of
          #xmerl_scanner{} = NewS ->
              NewS;
          {_Res, _Tail, _Sx} ->
              %% Continue with old scanner data, result in Rules
              S
      end.
  %% Fetches an external resource through the scanner's fetch fun and, when
  %% data comes back, parses it with options inherited from the scanner.
  %%
  %% The fetch fun may answer:
  %%   {ok, NewS}              - backward-compatible form, nothing to parse;
  %%   {ok, not_fetched, NewS} - the resource was not fetched;
  %%   {ok, DataRet, NewS}     - {file, F} and {string, String} are parsed
  %%                             via int_file_decl/3 and int_string_decl/3;
  %%                             any other DataRet is returned unparsed as
  %%                             {DataRet, [], NewS};
  %%   anything else           - fatal error.
  %% When the outcome is a scanner record, its `text_decl' field is reset to
  %% false and its `environment' is restored from the scanner passed in.
  %% Any other outcome is returned unchanged.
  fetch_and_parse(ExtSpec,
                  #xmerl_scanner{fetch_fun = Fetch,
                                 rules = Rules,
                                 xmlbase = XMLBase} = S,
                  Options0) ->
      Outcome =
          case Fetch(ExtSpec, S) of
              {ok, NewS} ->
                  %% For backward compatibility only. This will be removed later!!
                  NewS;
              {ok, not_fetched, NewS} ->
                  NewS;
              {ok, DataRet, #xmerl_scanner{user_state = UState,
                                           event_fun = Event,
                                           hook_fun = Hook,
                                           fetch_fun = Fetch1,
                                           close_fun = Close1,
                                           continuation_fun = Cont,
                                           acc_fun = Acc,
                                           rules_read_fun = Read,
                                           rules_write_fun = Write,
                                           validation = Valid,
                                           quiet = Quiet,
                                           encoding = Charset} = NewS} ->
                  EventState = event_state(NewS),
                  HookState = hook_state(NewS),
                  FetchState = fetch_state(NewS),
                  ContState = cont_state(NewS),
                  Inherited = [{user_state, UState},
                               {rules, Rules},
                               {event_fun, Event, EventState},
                               {hook_fun, Hook, HookState},
                               {fetch_fun, Fetch1, FetchState},
                               {close_fun, Close1},
                               {continuation_fun, Cont, ContState},
                               {rules, Read, Write, ""},
                               {acc_fun, Acc},
                               {validation, Valid},
                               {quiet, Quiet},
                               {encoding, Charset}],
                  Options = Options0 ++ Inherited,
                  case DataRet of
                      {file, F} ->
                          int_file_decl(F, Options, Charset);
                      {string, String} ->
                          int_string_decl(String, Options, XMLBase);
                      _ ->
                          %% other scheme
                          {DataRet, [], NewS}
                  end;
              Error ->
                  ?fatal({error_fetching_DTD, {ExtSpec, Error}}, S)
          end,
      case Outcome of
          #xmerl_scanner{} ->
              Outcome#xmerl_scanner{text_decl = false,
                                    environment = S#xmerl_scanner.environment};
          _ ->
              Outcome
      end.
  %% Fetches an external resource and returns its content without parsing.
  %%
  %% Returns {Content, NewScanner}. {file, F} results are read with
  %% get_file/2, {string, Str} results are converted with binary_to_list/1,
  %% and any other data is returned as is. A `not_fetched' answer or an
  %% unexpected reply from the fetch fun is a fatal error.
  fetch_not_parse(ExtSpec, #xmerl_scanner{fetch_fun = Fetch} = S) ->
      case Fetch(ExtSpec, S) of
          {ok, not_fetched, _NewS} ->
              ?fatal({error_fetching_external_source, ExtSpec}, S);
          {ok, {file, F}, NewS} ->
              {get_file(F, S), NewS};
          {ok, {string, Str}, NewS} ->
              {binary_to_list(Str), NewS};
          {ok, DataRet, NewS} ->
              {DataRet, NewS};
          _ ->
              ?fatal({error_fetching_external_resource, ExtSpec}, S)
      end.
  %% Reads the whole file F and returns its content as a list of bytes.
  %% Any result other than {ok, Binary} from file:read_file/1 is a fatal
  %% error that carries the file name and the result.
  get_file(F, S) ->
      ReadResult = file:read_file(F),
      case ReadResult of
          {ok, Bin} ->
              binary_to_list(Bin);
          Err ->
              ?fatal({error_reading_file, F, Err}, S)
      end.
  1095. %% check_decl/1
  1096. %% Now it is necessary to check that all referenced types are declared,
  1097. %% since it is legal to reference some xml types before they are
  1098. %% declared.
  %% Runs the post-DTD declaration checks on the rules table.
  %%
  %% With validation switched off nothing is checked and `ok' is returned.
  %% Otherwise notations, elements (including their attribute definitions)
  %% and entities are checked in that order; the result of the entity check
  %% is the return value.
  check_decl(#xmerl_scanner{validation = false}) ->
      ok;
  check_decl(#xmerl_scanner{rules = RulesTab} = S) ->
      check_notations(RulesTab, S),
      %% check also attribute defs for element
      check_elements(RulesTab, S),
      check_entities(RulesTab, S).
  %% Verifies that no notation in the rules table is still marked
  %% `undeclared'.
  %%
  %% Returns `ok' when the match is empty. A single match row is reported
  %% with its first element; any other match result is reported whole.
  check_notations(Tab, S) ->
      Pattern = {{notation, '$1'}, undeclared},
      case ets:match(Tab, Pattern) of
          [[]] ->
              ok;
          [] ->
              ok;
          [[Missing | _]] ->
              ?fatal({error_missing_declaration_in_DTD, Missing}, S);
          Err ->
              ?fatal({error_missing_declaration_in_DTD, Err}, S)
      end.
  %% Walks every element definition in the rules table and checks each one
  %% with check_elements2/2.
  %%
  %% The table is read ten objects at a time through ets:match/3 and the
  %% returned continuation, until '$end_of_table' is reached. Returns `ok'.
  check_elements(Tab, S) ->
      CheckBatch =
          fun(Batch) ->
                  lists:foreach(fun(X) -> check_elements2(X, S) end, Batch)
          end,
      case ets:match(Tab, {{elem_def, '_'}, '$2'}, 10) of
          {_, _} = FirstChunk ->
              %% The fun receives itself as second argument so it can recurse.
              Walk =
                  fun({Batch, '$end_of_table'}, _Self) ->
                          CheckBatch(Batch),
                          ok;
                     ('$end_of_table', _Self) ->
                          ok;
                     ({Batch, Cont}, Self) ->
                          CheckBatch(Batch),
                          Self(ets:match(Cont), Self)
                  end,
              Walk(FirstChunk, Walk);
          '$end_of_table' ->
              ok;
          Err ->
              ?fatal({error_missing_declaration_in_DTD, Err}, S)
      end.
  1132. % it is not an error to declare attributes for an element that is not
  1133. % declared.
  %% Checks the attribute definitions of one matched element definition.
  %% The argument is one match row: a single-element list holding an
  %% #xmlElement{}. Anything else is accepted without checking.
  check_elements2([#xmlElement{attributes = AttDefs}], S) ->
      check_attributes(AttDefs, S);
  check_elements2(_Other, _S) ->
      ok.
  %% Applies validity checks to a list of attribute definitions. The clauses
  %% below match definitions that are 5-tuples with the name first and the
  %% type second.
  %%
  %% - 'ID': fatal if a later definition in the list is also of type 'ID',
  %%   then vc_ID_Attribute_Default/2 is applied.
  %% - {enumeration, _}: vc_Enumeration/2 is applied.
  %% - 'ENTITY' / 'ENTITIES': vc_Entity_Name/2 is applied.
  %% - Everything else is skipped.
  %% Returns `ok' when the list is exhausted.
  check_attributes([{Name, 'ID', _, _, _} = AttDef | Rest], S) ->
      case lists:keysearch('ID', 2, Rest) of
          {value, OtherDef} ->
              ?fatal({error_more_than_one_ID_def, Name, element(1, OtherDef)}, S);
          _ ->
              ok
      end,
      vc_ID_Attribute_Default(AttDef, S),
      check_attributes(Rest, S);
  check_attributes([{_, {enumeration, _}, _, _, _} = AttDef | Rest], S) ->
      vc_Enumeration(AttDef, S),
      check_attributes(Rest, S);
  check_attributes([{_, Type, _, _, _} = AttDef | Rest], S)
    when Type == 'ENTITY'; Type == 'ENTITIES' ->
      vc_Entity_Name(AttDef, S),
      check_attributes(Rest, S);
  check_attributes([_ | Rest], S) ->
      check_attributes(Rest, S);
  check_attributes([], _S) ->
      ok.
  %% Verifies, when validation is on, that no entity in the rules table is
  %% still marked `undeclared'. Without validation it always returns `ok'.
  %%
  %% A single match row is reported with its first element; any other
  %% non-empty match result is reported whole.
  check_entities(Tab, #xmerl_scanner{validation = true} = S) ->
      Pattern = {{entity, '$1'}, undeclared},
      case ets:match(Tab, Pattern) of
          [[]] ->
              ok;
          [] ->
              ok;
          [[Missing | _]] ->
              ?fatal({error_missing_declaration_in_DTD, Missing}, S);
          Err ->
              ?fatal({error_missing_declaration_in_DTD, Err}, S)
      end;
  check_entities(_Tab, _S) ->
      ok.
  1169. %% check_decl2/1: checks that all referenced ID attributes are declared
  %% Checks that every referenced ID is declared, by running
  %% check_referenced_ids/2 on the scanner's rules table.
  check_decl2(#xmerl_scanner{rules = RulesTab} = S) ->
      check_referenced_ids(RulesTab, S).
  %% Verifies that no ID in the rules table is still marked `undeclared'.
  %%
  %% Returns `ok' when the match is empty. A single match row is reported
  %% with its first element; any other match result is reported whole.
  check_referenced_ids(Tab, S) ->
      Pattern = {{id, '$1'}, undeclared},
      case ets:match(Tab, Pattern) of
          [[]] ->
              ok;
          [] ->
              ok;
          [[Missing | _]] ->
              ?fatal({error_missing_declaration_in_DTD, Missing}, S);
          Err ->
              ?fatal({error_missing_declaration_in_DTD, Err}, S)
      end.
  1181. %%%%%%% [30] extSubSet ::= TextDecl? extSubsetDecl
  %% Scans an external DTD subset (production [30] extSubset) until the input
  %% is exhausted.
  %%
  %% "%" introduces a DeclSep parameter-entity reference, "<![" a conditional
  %% section, leading whitespace is stripped, and everything else is scanned
  %% as a markup declaration. When the continuation fun reports that no more
  %% input exists, {[], Scanner} is returned.
  %%
  %% NOTE: ?bump_col/1 and ?strip1 are macros defined outside this view; they
  %% are relied on to bind S, and T1/S1, respectively.
  scan_ext_subset([], #xmerl_scanner{continuation_fun = ContFun} = S) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_ext_subset(MoreBytes, S1) end,
      AtEof = fun(S1) -> {[], S1} end,
      ContFun(Resume, AtEof, S);
  scan_ext_subset("%" ++ T, S0) ->
      %% DeclSep [28a]: WFC: PE Between Declarations.
      %% The replacement text of a parameter entity reference in a
      %% DeclSep must match the production extSubsetDecl.
      ?bump_col(1),
      {_, T1, S1} = scan_decl_sep(T, S),
      scan_ext_subset(T1, S1);
  scan_ext_subset("<![" ++ T, S0) ->
      ?bump_col(3),
      ?strip1,
      {_, T2, S2} = scan_conditional_sect(T1, S1),
      scan_ext_subset(T2, S2);
  scan_ext_subset([H | _] = T, S) when ?whitespace(H) ->
      {_, T1, S1} = strip(T, S),
      scan_ext_subset(T1, S1);
  scan_ext_subset(T, S) ->
      {_, T1, S1} = scan_markup_decl(T, S),
      scan_ext_subset(T1, S1).
  1205. %%%%%%% [28a] DeclSep ::= PEReference | S
  %% Handles a parameter-entity reference used as a declaration separator
  %% (production [28a] DeclSep).
  %%
  %% The reference is expanded. If the expansion is a tuple, the external
  %% resource is fetched unparsed, scanned as an entity value, written back
  %% into the rules table under the same name, and re-read; the value is then
  %% wrapped in one space on each side. The expansion is scanned with
  %% scan_ext_subset/2, and finally the input after the reference is stripped
  %% with the resulting scanner. Returns the result of that strip/2 call.
  scan_decl_sep(T, #xmerl_scanner{rules_read_fun = Read,
                                  rules_write_fun = Write,
                                  rules_delete_fun = Delete} = S) ->
      {PERefName, T1, S1} = scan_pe_reference(T, S),
      {ExpandedRef, S2} =
          case expand_pe_reference(PERefName, S1, as_PE) of
              ExtId when tuple(ExtId) ->
                  %% {system,URI} or {public,URI}
                  {Fetched, _Sx} = fetch_not_parse(ExtId, S1),
                  {EntV, _, SEnt} = scan_entity_value(Fetched, S1, no_delim,
                                                      PERefName, parameter),
                  %% should do an update Write(parameter_entity) so next
                  %% expand_pe_reference is faster
                  Delete(parameter_entity, PERefName, SEnt),
                  SStored = Write(parameter_entity, PERefName, EntV, SEnt),
                  EntV2 = Read(parameter_entity, PERefName, SStored),
                  {" " ++ EntV2 ++ " ", SStored};
              Expansion ->
                  {Expansion, S1}
          end,
      {_, T3, S3} = strip(ExpandedRef, S2),
      {_T4, S4} = scan_ext_subset(T3, S3),
      strip(T1, S4).
  1229. %%%%%%% [61] ConditionalSect ::= includeSect | ignoreSect
  %% Scans a conditional section (production [61] ConditionalSect) after its
  %% opening "<![" and any whitespace have been consumed by the caller.
  %%
  %% - "IGNORE": requires "[" and skips the section with scan_ignore/2.
  %% - "INCLUDE": requires "[" and scans the section with scan_include/2.
  %% - "%": a parameter-entity reference standing for the keyword. It is
  %%   expanded, pushed back in front of the remaining input, and the keyword
  %%   is scanned again.
  %%
  %% The "%" clause used to invoke ?bump_col(1) twice in a row for the single
  %% "%" character. Erlang variables are single-assignment, so the second
  %% invocation could only re-match the value bound by the first; the
  %% duplicate has been removed.
  %%
  %% NOTE: ?bump_col/1 and ?strip1 are macros defined outside this view; they
  %% are relied on to bind S, and T1/S1, respectively.
  scan_conditional_sect([], #xmerl_scanner{continuation_fun = ContFun} = S) ->
      ?dbg("cont()...~n", []),
      ContFun(fun(MoreBytes, S1) -> scan_conditional_sect(MoreBytes, S1) end,
              fun(S1) -> ?fatal(unexpected_end, S1) end,
              S);
  scan_conditional_sect("IGNORE" ++ T, S0) ->
      ?bump_col(6),
      ?strip1,
      "[" ++ T2 = T1,
      {_, T3, S3} = strip(T2, S1),
      scan_ignore(T3, S3);
  scan_conditional_sect("INCLUDE" ++ T, S0) ->
      ?bump_col(7),
      ?strip1,
      "[" ++ T2 = T1,
      {_, T3, S3} = strip(T2, S1),
      scan_include(T3, S3);
  scan_conditional_sect("%" ++ T, S0) ->
      ?bump_col(1),
      {PERefName, T1, S1} = scan_pe_reference(T, S),
      ExpRef = expand_pe_reference(PERefName, S1, as_PE),
      {_, T2, S2} = strip(ExpRef ++ T1, S1),
      scan_conditional_sect(T2, S2).
  1254. %%%% [63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>'
  1255. %%%% [64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)*
  1256. %%%% [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)
  %% Skips the contents of an IGNORE section, starting at nesting level 0.
  scan_ignore(Str, S) ->
      TopLevel = 0,
      scan_ignore(Str, S, TopLevel).
  %% Skips ignored section contents while tracking nested "<![" ... "]]>"
  %% pairs (productions [63]-[65]).
  %%
  %% Level counts the nested conditional sections currently open. The "]]>"
  %% that is met at level 0 ends the ignored section and {[], Rest, Scanner}
  %% is returned. All other characters are discarded one at a time.
  %%
  %% NOTE: ?bump_col/1 is a macro defined outside this view; it is relied on
  %% to bind S from S0.
  scan_ignore([], #xmerl_scanner{continuation_fun = ContFun} = S, Level) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_ignore(MoreBytes, S1, Level) end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  scan_ignore("<![" ++ T, S0, Level) ->
      %% nested conditional section. Topmost condition is ignore, though
      ?bump_col(3),
      scan_ignore(T, S, Level + 1);
  scan_ignore("]]>" ++ T, S0, Level) ->
      ?bump_col(3),
      case Level of
          0 ->
              {[], T, S};
          _ ->
              scan_ignore(T, S, Level - 1)
      end;
  scan_ignore([_Skipped | T], S0, Level) ->
      ?bump_col(1),
      scan_ignore(T, S, Level).
  1277. %%%%%%% [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'
  %% Scans the contents of an INCLUDE section (production [62] includeSect).
  %%
  %% "]]>" ends the section and {[], Rest, Scanner} is returned. "%" is a
  %% parameter-entity reference whose expansion is pushed back in front of
  %% the input. "<![" opens a nested conditional section. Everything else is
  %% scanned as a markup declaration.
  %%
  %% NOTE: ?bump_col/1, ?strip1 and ?strip3 are macros defined outside this
  %% view; they are relied on to bind S, T1/S1 and T3/S3 respectively.
  scan_include([], #xmerl_scanner{continuation_fun = ContFun} = S) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_include(MoreBytes, S1) end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  scan_include("]]>" ++ T, S0) ->
      ?bump_col(3),
      {[], T, S};
  scan_include("%" ++ T, S0) ->
      ?bump_col(1),
      {PERefName, T1, S1} = scan_pe_reference(T, S),
      Expansion = expand_pe_reference(PERefName, S1, as_PE),
      {_, T2, S2} = strip(Expansion ++ T1, S1),
      scan_include(T2, S2);
  scan_include("<![" ++ T, S0) ->
      ?bump_col(3),
      ?strip1,
      {_, T2, S2} = scan_conditional_sect(T1, S1),
      ?strip3,
      scan_include(T3, S3);
  scan_include(T, S) ->
      {_, T1, S1} = scan_markup_decl(T, S),
      scan_include(T1, S1).
  1301. %%%%%%% [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl |
  1302. %%%%%%% NotationDecl | PI |Comment
  1303. %%%%%%% [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
  1304. %% Validity constraint: Unique Type Declaration: No element type may be
  1305. %% declared more than once.
  1306. %%
  %% Scans one markup declaration (production [29] markupdecl): a comment, a
  %% processing instruction, or an ELEMENT, ENTITY, NOTATION or ATTLIST
  %% declaration.
  %%
  %% Every non-empty clause ends by stripping after the declaration and
  %% returns that strip result; the callers in this file match it as
  %% {_, RemainingInput, Scanner}. When the continuation fun reports that no
  %% more input exists, {[], [], Scanner} is returned.
  %%
  %% ELEMENT: an element already defined (elementdef other than `undeclared')
  %% is fatal when validating; otherwise the old entry is deleted and
  %% rewritten with the new content spec and the current environment.
  %% ATTLIST: the scanned attribute definitions are merged into the existing
  %% element entry with update_attributes/2, or into a fresh #xmlElement{}
  %% when no entry exists.
  %%
  %% NOTE: ?bump_col/1 and the ?stripN macros are defined outside this view;
  %% they are relied on to bind S and the matching Tn/Sn variables. Where a
  %% ?stripN is the last expression of a clause, its value is the clause's
  %% return value.
  scan_markup_decl([], #xmerl_scanner{continuation_fun = ContFun} = S) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_markup_decl(MoreBytes, S1) end,
      AtEof = fun(S1) -> {[], [], S1} end,
      ContFun(Resume, AtEof, S);
  scan_markup_decl("<!--" ++ T, S0) ->
      ?bump_col(4),
      {_, T1, S1} = scan_comment(T, S),
      ?strip2;
  scan_markup_decl("<?" ++ T, S0) ->
      ?bump_col(2),
      {_PI, T1, S1} = scan_pi(T, S, _Pos = markup),
      ?strip2;
  scan_markup_decl("<!ELEMENT" ++ T,
                   #xmerl_scanner{rules_read_fun = Read,
                                  rules_write_fun = Write,
                                  rules_delete_fun = Delete} = S0) ->
      ?bump_col(9),
      {_, T1, S1} = mandatory_strip(T, S),
      {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
      Element =
          case Read(elem_def, Ename, S2) of
              #xmlElement{elementdef = Decl} = El when Decl /= undeclared ->
                  case S2#xmerl_scanner.validation of
                      true ->
                          ?fatal({already_defined, Ename}, S2);
                      _ ->
                          Delete(elem_def, Ename, S2),
                          El
                  end;
              #xmlElement{} = El ->
                  Delete(elem_def, Ename, S2),
                  El;
              undefined ->
                  #xmlElement{}
          end,
      {_, T3, S3} = mandatory_strip(T2, S2),
      {Edef, T4, S4} = scan_contentspec(T3, S3),
      ?strip5,
      {">" ++ T6, S6} = scan_element_completion(T5, S5),
      Defined = Element#xmlElement{name = Ename,
                                   content = Edef,
                                   elementdef = S6#xmerl_scanner.environment},
      S7 = Write(elem_def, Ename, Defined, S6),
      strip(T6, S7);
  scan_markup_decl("<!ENTITY" ++ T, S0) ->
      %% <!ENTITY [%] entity.name NDATA notation.name>
      %% <!ENTITY [%] entity.name "replacement text">
      %% <!ENTITY [%] entity.name SYSTEM "system.identifier">
      %% <!ENTITY [%] entity.name PUBLIC public.identifier "system.identifier">
      ?bump_col(8),
      {_, T1, S1} = mandatory_strip(T, S),
      {T2, S2} = scan_entity(T1, S1),
      ?strip3;
  scan_markup_decl("<!NOTATION" ++ T, S0) ->
      %% <!NOTATION notation.name "public.identifier" "helper.application">
      ?bump_col(10),
      {_, T1, S1} = mandatory_strip(T, S),
      {T2, S2} = scan_notation_decl(T1, S1),
      ?strip3;
  scan_markup_decl("<!ATTLIST" ++ T,
                   #xmerl_scanner{rules_read_fun = Read,
                                  rules_write_fun = Write,
                                  rules_delete_fun = Delete} = S0) ->
      %% <!ATTLIST Ename ( AttrName Type Value )*>
      ?bump_col(9),
      {_, T1, S1} = mandatory_strip(T, S),
      {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
      {Attributes, T4, S4} = scan_attdef(T2, S2),
      {EDEF, MergedAttrs} =
          case Read(elem_def, Ename, S4) of
              undefined ->
                  %% this may happen when the ELEMENT is declared in
                  %% the external DTD but the ATTLIST in the
                  %% internal DTD.
                  {#xmlElement{}, update_attributes(Attributes, [])};
              #xmlElement{attributes = OldAttrs} = Edef ->
                  Delete(elem_def, Ename, S4),
                  %% the slot in rules table must be empty so that the
                  %% later write has the assumed effect. Read maybe
                  %% should empty the table slot.
                  {Edef, update_attributes(Attributes, OldAttrs)}
          end,
      NewEdef = EDEF#xmlElement{name = Ename, attributes = MergedAttrs},
      S5 = Write(elem_def, Ename, NewEdef, S4),
      %% ?strip6 reads T5, so alias the input left after the attribute defs.
      T5 = T4,
      ?strip6.
  %% Scans up to the end of an element declaration by delegating to
  %% scan_markup_completion_gt/2 and returning its result unchanged.
  scan_element_completion(T, S) ->
      Completion = scan_markup_completion_gt(T, S),
      Completion.
  %% Merges newly scanned attribute definitions into the existing ones.
  %% The existing list is reversed before it is handed to
  %% update_attributes1/2, which reverses the merged list again.
  update_attributes(NewAttrs, OldAttrs) ->
      ReversedOld = lists:reverse(OldAttrs),
      update_attributes1(NewAttrs, ReversedOld).
  %% Adds each new attribute definition to the accumulator unless one with
  %% the same name (first tuple element) is already present, then returns the
  %% accumulator reversed.
  %%
  %% NewAttrs must be a list of 5-tuples whose first element is the name.
  %% OldAttrs is the accumulator to start from; new definitions are prepended
  %% to it, so an earlier definition of a name wins over a later one.
  update_attributes1(NewAttrs, OldAttrs) ->
      AddIfAbsent =
          fun({Name, _Type, _DefaultV, _DefaultD, _Env} = AttDef, Acc) ->
                  case lists:keymember(Name, 1, Acc) of
                      true -> Acc;
                      false -> [AttDef | Acc]
                  end
          end,
      Merged = lists:foldl(AddIfAbsent, OldAttrs, NewAttrs),
      lists:reverse(Merged).
  1408. %%%%%%% [53] AttDef
  %% Entry point for scanning the attribute definitions of an ATTLIST
  %% declaration (production [53] AttDef). Starts scan_attdef/3 with an empty
  %% accumulator; on empty input the continuation fun is asked for more
  %% bytes first.
  scan_attdef([], #xmerl_scanner{continuation_fun = ContFun} = S) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_attdef(MoreBytes, S1) end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  scan_attdef(T, S) ->
      NoAttrsYet = [],
      scan_attdef(T, S, NoAttrsYet).
  %% Scans attribute definitions directly after the element name of an
  %% ATTLIST declaration.
  %%
  %% - ">" ends the declaration: {Attrs, RemainingInput, Scanner} is returned
  %%   with the definitions in input order.
  %% - "%" in the `prolog' environment is a fatal error.
  %% - "%" elsewhere is a parameter-entity reference whose expansion is
  %%   pushed back in front of the input.
  %% - Otherwise whitespace is mandatory and scan_attdef2/3 takes over.
  %%
  %% The "%" clause used to bump the column into S and then pass the stale S0
  %% to scan_pe_reference/2, which discarded the bump and left S unused. The
  %% bumped scanner S is now passed, as in the other "%" clauses of this
  %% file (for example scan_include/2).
  %%
  %% NOTE: ?bump_col/1 is a macro defined outside this view; it is relied on
  %% to bind S from S0.
  scan_attdef([], #xmerl_scanner{continuation_fun = ContFun} = S, Attrs) ->
      ?dbg("cont()...~n", []),
      ContFun(fun(MoreBytes, S1) -> scan_attdef(MoreBytes, S1, Attrs) end,
              fun(S1) -> ?fatal(unexpected_end, S1) end,
              S);
  scan_attdef(">" ++ T, S0, Attrs) ->
      ?bump_col(1),
      {lists:reverse(Attrs), T, S};
  scan_attdef("%" ++ _T, #xmerl_scanner{environment = prolog} = S, _Attrs) ->
      ?fatal({error, {wfc_PEs_In_Internal_Subset}}, S);
  scan_attdef("%" ++ T, S0, Attrs) ->
      ?bump_col(1),
      {PERefName, T1, S1} = scan_pe_reference(T, S),
      ExpRef = expand_pe_reference(PERefName, S1, as_PE),
      {_, T2, S2} = strip(ExpRef ++ T1, S1),
      scan_attdef(T2, S2, Attrs);
  scan_attdef(T, S, Attrs) ->
      {_, T1, S1} = mandatory_strip(T, S),
      scan_attdef2(T1, S1, Attrs).
  %% Scans attribute definitions one at a time until ">" is met.
  %%
  %% Each definition is stored as
  %%   {AttName, AttType, DefaultValue, DefaultDecl, Environment}
  %% where Environment is read from the scanner passed to this call.
  %% Returns {Attrs, RemainingInput, Scanner} with the definitions in input
  %% order.
  %%
  %% The "%" clause used to bump the column into S and then pass the stale S0
  %% to scan_pe_reference/2, which discarded the bump and left S unused. The
  %% bumped scanner S is now passed, as in the other "%" clauses of this
  %% file (for example scan_include/2).
  %%
  %% NOTE: ?bump_col/1 and ?strip6 are macros defined outside this view; they
  %% are relied on to bind S, and T6/S6, respectively.
  scan_attdef2(">" ++ T, S0, Attrs) ->
      ?bump_col(1),
      {lists:reverse(Attrs), T, S};
  scan_attdef2("%" ++ _T, #xmerl_scanner{environment = prolog} = S, _Attrs) ->
      ?fatal({error, {wfc_PEs_In_Internal_Subset}}, S);
  scan_attdef2("%" ++ T, S0, Attrs) ->
      ?bump_col(1),
      {PERefName, T1, S1} = scan_pe_reference(T, S),
      ExpRef = expand_pe_reference(PERefName, S1, as_PE),
      {_, T2, S2} = strip(ExpRef ++ T1, S1),
      scan_attdef2(T2, S2, Attrs);
  scan_attdef2(T, S, Attrs) ->
      {AttName, _NamespaceInfo, T1, S1} = scan_name(T, S),
      {_, T2, S2} = mandatory_strip(T1, S1),
      {AttType, T3, S3} = scan_att_type(T2, S2),
      {_, T4, S4} = mandatory_strip(T3, S3),
      {{DefaultDecl, DefaultValue}, T5, S5} = scan_default_decl(T4, S4, AttType),
      ?strip6,
      Attr = {AttName, AttType, DefaultValue, DefaultDecl,
              S#xmerl_scanner.environment},
      scan_attdef2(T6, S6, [Attr | Attrs]).
  1456. %%% [54] StringType
  %% Scans the type of an attribute definition (productions [54] StringType,
  %% [55] TokenizedType and [57] EnumeratedType).
  %%
  %% Returns {Type, RemainingInput, Scanner}. Type is one of the atoms
  %% 'CDATA', 'ID', 'IDREF', 'IDREFS', 'ENTITY', 'ENTITIES', 'NMTOKEN',
  %% 'NMTOKENS', or whatever scan_notation_type/3 or scan_enumeration/3
  %% return. For a parameter-entity reference outside the prolog, the
  %% expanded reference is returned in the Type position.
  %%
  %% Clause order matters: keywords that are prefixes of other keywords
  %% (IDREF/IDREFS, ID/IDREF, NMTOKEN/NMTOKENS) come after the longer form.
  %%
  %% The final "%" clause used to bump the column into S and then pass the
  %% stale S0 to scan_pe_reference/2, which discarded the bump and left S
  %% unused. The bumped scanner S is now passed.
  %%
  %% NOTE: ?bump_col/1 and the ?stripN macros are defined outside this view;
  %% they are relied on to bind S and the matching Tn/Sn variables.
  scan_att_type([], #xmerl_scanner{continuation_fun = ContFun} = S) ->
      ?dbg("cont()...~n", []),
      ContFun(fun(MoreBytes, S1) -> scan_att_type(MoreBytes, S1) end,
              fun(S1) -> ?fatal(unexpected_end, S1) end,
              S);
  scan_att_type("CDATA" ++ T, S0) ->
      ?bump_col(5),
      {'CDATA', T, S};
  %%% [55] TokenizedType
  scan_att_type("IDREFS" ++ T, S0) ->
      ?bump_col(6),
      {'IDREFS', T, S};
  scan_att_type("IDREF" ++ T, S0) ->
      ?bump_col(5),
      {'IDREF', T, S};
  scan_att_type("ID" ++ T, S0) ->
      ?bump_col(2),
      {'ID', T, S};
  scan_att_type("ENTITY" ++ T, S0) ->
      ?bump_col(6),
      {'ENTITY', T, S};
  scan_att_type("ENTITIES" ++ T, S0) ->
      ?bump_col(8),
      {'ENTITIES', T, S};
  scan_att_type("NMTOKENS" ++ T, S0) ->
      ?bump_col(8),
      {'NMTOKENS', T, S};
  scan_att_type("NMTOKEN" ++ T, S0) ->
      ?bump_col(7),
      {'NMTOKEN', T, S};
  %%% [57] EnumeratedType
  scan_att_type("NOTATION" ++ T, S0) ->
      ?bump_col(8),
      {_, T1, S1} = mandatory_strip(T, S),
      "(" ++ T2 = T1,
      %% ?strip3 reads S2, so alias the scanner left after the whitespace.
      S2 = S1,
      ?strip3,
      {Name, _NamespaceInfo, T4, S4} = scan_name(T3, S3),
      notation_exists(Name, S4),
      ?strip5,
      scan_notation_type(T5, S5, [Name]);
  scan_att_type("(" ++ T, S0) ->
      ?bump_col(1),
      ?strip1,
      {NmToken, _NamespaceInfo, T2, S2} = scan_nmtoken(T1, S1),
      ?strip3,
      scan_enumeration(T3, S3, [NmToken]);
  scan_att_type("%" ++ _T, #xmerl_scanner{environment = prolog} = S) ->
      ?fatal({error, {wfc_PEs_In_Internal_Subset}}, S);
  scan_att_type("%" ++ T, S0) ->
      ?bump_col(1),
      {PERefName, T1, S1} = scan_pe_reference(T, S),
      ExpRef = expand_pe_reference(PERefName, S1, in_literal),
      {ExpRef, T1, S1}.
  1511. %%% [58] NotationType
  %% Scans the remaining alternatives of a NOTATION attribute type
  %% (production [58] NotationType).
  %%
  %% ")" ends the list and {{notation, Names}, RemainingInput, Scanner} is
  %% returned with the names in input order. "|" introduces another name,
  %% which is passed to notation_exists/2 before being accumulated.
  %%
  %% NOTE: ?bump_col/1, ?strip1 and ?strip3 are macros defined outside this
  %% view; they are relied on to bind S, T1/S1 and T3/S3 respectively.
  scan_notation_type([], #xmerl_scanner{continuation_fun = ContFun} = S, Acc) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_notation_type(MoreBytes, S1, Acc) end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  scan_notation_type([$) | T], S0, Acc) ->
      ?bump_col(1),
      Names = lists:reverse(Acc),
      {{notation, Names}, T, S};
  scan_notation_type([$| | T], S0, Acc) ->
      ?bump_col(1),
      ?strip1,
      {Name, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
      notation_exists(Name, S2),
      ?strip3,
      scan_notation_type(T3, S3, [Name | Acc]).
  1527. %%% Validity constraint for NotationType:
  1528. %%% The used notation names must be declared in the DTD, but they may
  1529. %%% be declared later.
  %% Records a referenced notation name that is not yet known.
  %%
  %% If the rules read fun knows nothing about Name, an `undeclared' marker
  %% is written for it and the write fun's result is returned. If anything is
  %% already stored under Name, `ok' is returned.
  notation_exists(Name, #xmerl_scanner{rules_read_fun = Read,
                                       rules_write_fun = Write} = S) ->
      Known = Read(notation, Name, S),
      case Known of
          undefined ->
              %% this is legal, since the referenced NOTATION
              %% may be declared later in internal or external
              %% subset.
              Write(notation, Name, undeclared, S);
          _Value ->
              ok
      end.
  1541. %%% [59] Enumeration
  %% Scans the remaining alternatives of an enumerated attribute type
  %% (production [59] Enumeration).
  %%
  %% ")" ends the list and {{enumeration, Tokens}, RemainingInput, Scanner}
  %% is returned with the tokens in input order. "|" introduces another name
  %% token.
  %%
  %% NOTE: ?bump_col/1, ?strip1 and ?strip3 are macros defined outside this
  %% view; they are relied on to bind S, T1/S1 and T3/S3 respectively.
  scan_enumeration([], #xmerl_scanner{continuation_fun = ContFun} = S, Acc) ->
      ?dbg("cont()...~n", []),
      Resume = fun(MoreBytes, S1) -> scan_enumeration(MoreBytes, S1, Acc) end,
      AtEof = fun(S1) -> ?fatal(unexpected_end, S1) end,
      ContFun(Resume, AtEof, S);
  scan_enumeration([$) | T], S0, Acc) ->
      ?bump_col(1),
      Tokens = lists:reverse(Acc),
      {{enumeration, Tokens}, T, S};
  scan_enumeration([$| | T], S0, Acc) ->
      ?bump_col(1),
      ?strip1,
      {NmToken, _NamespaceInfo, T2, S2} = scan_nmtoken(T1, S1),
      ?strip3,
      scan_enumeration(T3, S3, [NmToken | Acc]).
  %%%%%%% [60] DefaultDecl
  %% Scans the default declaration of an attribute definition. Returns
  %% {{Kind, Value}, Rest, S} where Kind is '#REQUIRED', '#IMPLIED',
  %% '#FIXED' or no_decl, and Value is no_value for the first two.
  scan_default_decl([], S = #xmerl_scanner{continuation_fun = Continue}, Type) ->
      ?dbg("cont()...~n", []),
      Resume = fun(Bytes, State) -> scan_default_decl(Bytes, State, Type) end,
      Abort = fun(State) -> ?fatal(unexpected_end, State) end,
      Continue(Resume, Abort, S);
  scan_default_decl("#REQUIRED" ++ T, S0, _Type) ->
      ?bump_col(9),
      Decl = {'#REQUIRED', no_value},
      {Decl, T, S};
  scan_default_decl("#IMPLIED" ++ T, S0, _Type) ->
      ?bump_col(8),
      Decl = {'#IMPLIED', no_value},
      {Decl, T, S};
  scan_default_decl("#FIXED" ++ T, S0, Type) ->
      ?bump_col(6),
      %% White space is required between #FIXED and the value.
      {_, AfterSpace, S1} = mandatory_strip(T, S),
      {Fixed, Rest, S2, _} = default_value(AfterSpace, S1, Type),
      {{'#FIXED', Fixed}, Rest, S2};
  scan_default_decl(Str, S, Type) ->
      %% A bare default value without any keyword.
      {Default, Rest, S1, _} = default_value(Str, S, Type),
      {{no_decl, Default}, Rest, S1}.
  %% Scans an attribute default value by delegating to scan_att_value/3 and
  %% asserting that the result is a 4-tuple. The value could be validated
  %% against Type here, but that is not done at the moment.
  default_value(T, S, Type) ->
      Scanned = scan_att_value(T, S, Type),
      {_, _, _, _} = Scanned.
  %%%%%%% [71] EntityDef
  %% Scans an entity declaration after the "<!ENTITY" keyword, up to and
  %% including the closing ">", and stores the definition through the rules
  %% write fun. A leading "%" selects a parameter entity, anything else a
  %% general entity. Returns {Rest, S}.
  scan_entity([], S = #xmerl_scanner{continuation_fun = Continue}) ->
      ?dbg("cont()...~n", []),
      Resume = fun(Bytes, State) -> scan_entity(Bytes, State) end,
      Abort = fun(State) -> ?fatal(unexpected_end, State) end,
      Continue(Resume, Abort, S);
  scan_entity("%" ++ T, #xmerl_scanner{rules_write_fun = Store} = S0) ->
      %% parameter entity
      ?bump_col(1),
      {_, T1, S1} = mandatory_strip(T, S),
      {PEName, _NsInfo, T2, S2} = scan_name_no_colons(T1, S1),
      {_, T3, S3} = mandatory_strip(T2, S2),
      {PEDef, T4, S4} = scan_pe_def(T3, S3, PEName),
      ?strip5,
      {">" ++ Rest, S6} = scan_entity_completion(T5, S5),
      {Rest, Store(parameter_entity, PEName, PEDef, S6)};
  scan_entity(T, #xmerl_scanner{rules_read_fun = Lookup,
                                rules_write_fun = Store,
                                rules_delete_fun = Remove} = S) ->
      %% general entity
      {EName, _NsInfo, T1, S1} = scan_name_no_colons(T, S),
      {_, T2, S2} = mandatory_strip(T1, S1),
      {EDef, EntType, T3, S3} = scan_entity_def(T2, S2, EName),
      check_entity_recursion(EName, S3),
      ?strip4,
      {">" ++ Rest, S5} = scan_entity_completion(T4, S4),
      %% Remove an 'undeclared' placeholder before storing the definition.
      case Lookup(entity, EName, S5) of
          undeclared -> Remove(entity, EName, S5);
          _ -> ok
      end,
      Definition = {S5#xmerl_scanner.environment, EntType, EDef},
      {Rest, Store(entity, EName, Definition, S5)}.
  %% Scans up to the end of an entity declaration by delegating to
  %% scan_markup_completion_gt/2.
  scan_entity_completion(Str, State) ->
      scan_markup_completion_gt(Str, State).
  %%%%%%% [73] EntityDef
  %% Scans the definition part of a general entity declaration.
  %% Returns {EntityDef, EntityType, Rest, S} where EntityType is
  %% 'internal' for a quoted literal value and 'external' otherwise.
  %% For an external entity EntityDef is one of
  %%   {ExtID, {ndata, Name}}  - unparsed entity (NDATA present),
  %%   Entity                  - result of fetching and parsing the entity,
  %%   {error, enoent}         - the target could not be found.
  scan_entity_def([], S = #xmerl_scanner{continuation_fun = F}, EName) ->
      ?dbg("cont()...~n", []),
      F(fun(MoreBytes, S1) -> scan_entity_def(MoreBytes, S1, EName) end,
        fun(S1) -> ?fatal(unexpected_end, S1) end,
        S);
  scan_entity_def("'" ++ T, S0, EName) ->
      ?bump_col(1),
      {EVal, T1, S1} = scan_entity_value(T, S, $', EName, general),
      {EVal, internal, T1, S1};
  scan_entity_def("\"" ++ T, S0, EName) ->
      ?bump_col(1),
      {EVal, T1, S1} = scan_entity_value(T, S, $", EName, general),
      {EVal, internal, T1, S1};
  %% external general entity, parsed or unparsed.
  scan_entity_def(Str, S, EName) ->
      {ExtID, T1, S1} = scan_external_id(Str, S),
      {NData, T2, S2} = scan_ndata_decl(T1, S1),
      case NData of
          {ndata, _} ->
              %% if NDATA exists it is an unparsed ENTITY
              {{ExtID, NData}, external, T2, S2};
          _ ->
              case fetch_and_parse(ExtID, S2,
                                   [{text_decl, true},
                                    {environment, {external, {entity, EName}}}]) of
                  {{_USret, Entity}, _Tail, _Sx} ->
                      {Entity, external, T2, S2};
                  {Entity, _Tail, Sx} ->
                      OldRef = S2#xmerl_scanner.entity_references,
                      NewRef = Sx#xmerl_scanner.entity_references,
                      %% Every branch must return the 4-tuple that
                      %% scan_entity/2 matches on; the 'external' tag was
                      %% missing here, so this path crashed with badmatch.
                      {Entity, external, T2,
                       S2#xmerl_scanner{entity_references = OldRef ++ NewRef}};
                  {error, enoent} ->
                      %% This bad entity is declared, but as long as it is
                      %% never referenced that is not an error.
                      {{error, enoent}, external, T2, S2}
              end
      end.
  %% Scans an optional NDATA declaration following an external ID.
  %% When the input is already at ">" nothing is consumed and [] is
  %% returned as the declaration; otherwise white space is required before
  %% scan_ndata_decl2/2 takes over.
  scan_ndata_decl([], S = #xmerl_scanner{continuation_fun = Continue}) ->
      ?dbg("cont()...~n", []),
      Resume = fun(Bytes, State) -> scan_ndata_decl(Bytes, State) end,
      Abort = fun(State) -> ?fatal(unexpected_end, State) end,
      Continue(Resume, Abort, S);
  scan_ndata_decl([$> | _] = Str, S) ->
      {[], Str, S};
  scan_ndata_decl(Str, S) ->
      {_, Rest, S1} = mandatory_strip(Str, S),
      scan_ndata_decl2(Rest, S1).
  %% Scans "NDATA Name" after leading white space has been removed, or
  %% returns [] as the declaration when the input is at ">".
  %% A notation name that is not known yet is stored as 'undeclared'; the
  %% state returned is the one from before that store.
  scan_ndata_decl2([$> | _] = Str, S) ->
      {[], Str, S};
  scan_ndata_decl2("NDATA" ++ T, S0 = #xmerl_scanner{rules_write_fun = Store,
                                                     rules_read_fun = Lookup}) ->
      ?bump_col(5),
      {_, T1, S1} = mandatory_strip(T, S),
      {Name, _NsInfo, T2, S2} = scan_name(T1, S1),
      case Lookup(notation, Name, S2) of
          undefined ->
              %% Legal: the referenced NOTATION may be declared later in
              %% the internal or external subset.
              Store(notation, Name, undeclared, S2);
          _Declared ->
              ok
      end,
      {{ndata, Name}, T2, S2}.
  %%%%%%% [39] element
  %% Entry point for scanning an element with no inherited context: the
  %% space setting comes from the scanner state, and language, parents and
  %% namespace start out empty.
  scan_element(T, S, Pos) ->
      SpaceDefault = S#xmerl_scanner.space,
      scan_element(T, S, Pos, SpaceDefault,
                   _Lang = [], _Parents = [], #xmlNamespace{}).
  %% Scans the element name, remembers the line and column where the
  %% element starts, skips white space and continues with the attributes.
  scan_element(T, S = #xmerl_scanner{line = StartLine, col = StartCol},
               Pos, SpaceDefault, Lang, Parents, NS) ->
      {Name, NsInfo, T1, S1} = scan_name(T, S),
      vc_Element_valid(Name, S),
      ?strip2,
      NoAttrs = [],
      scan_element(T2, S2, Pos, Name, StartLine, StartCol, NoAttrs,
                   Lang, Parents, NsInfo, NS,
                   SpaceDefault).
  %% Scans the remainder of a start tag after the element name: zero or
  %% more attributes followed by either "/>" (empty element) or ">" and
  %% the element content.
  %%
  %% Attrs is the list of #xmlAttribute{} scanned so far, most recent
  %% first. StartL/StartC are the position where the element began.
  %% Returns {Ret, Rest, S} where Ret is what the hook fun returned for
  %% the finished element.
  scan_element("/", S = #xmerl_scanner{continuation_fun = F},
               Pos, Name, StartL, StartC, Attrs, Lang, Parents,
               NSI, NS, SpaceDefault) ->
      %% A lone "/" may be the first half of "/>"; ask for more input.
      ?dbg("trailing / detected~n", []),
      F(fun(MoreBytes, S1) ->
                scan_element("/" ++ MoreBytes, S1,
                             Pos, Name, StartL, StartC, Attrs,
                             Lang, Parents, NSI, NS, SpaceDefault)
        end,
        fun(S1) -> ?fatal(unexpected_end, S1) end,
        S);
  scan_element([], S = #xmerl_scanner{continuation_fun = F},
               Pos, Name, StartL, StartC, Attrs, Lang, Parents,
               NSI, NS, SpaceDefault) ->
      ?dbg("cont()...~n", []),
      F(fun(MoreBytes, S1) ->
                scan_element(MoreBytes, S1,
                             Pos, Name, StartL, StartC, Attrs,
                             Lang, Parents, NSI, NS, SpaceDefault)
        end,
        fun(S1) -> ?fatal(unexpected_end, S1) end,
        S);
  scan_element("/>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
                                              event_fun = Event,
                                              line = L, col = C,
                                              xmlbase_cache = XMLBase}, Pos,
               Name, _StartL, _StartC, Attrs0, Lang, Parents, NSI,
               Namespace, _SpaceDefault) ->
      %% Empty element: no content to scan.
      ?bump_col(2),
      Attrs = lists:reverse(Attrs0),
      E = processed_whole_element(S, Pos, Name, Attrs, Lang, Parents, NSI,
                                  Namespace),
      wfc_unique_att_spec(Attrs, S),
      %% NOTE(review): the event fun receives S0, i.e. the state from
      %% before the column bump above. Kept as in the original.
      S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                                 line = L,
                                                 col = C,
                                                 data = E}, S0),
      {Ret, S2} = Hook(E, S1),
      %% Restore the xml:base that was in effect before this element.
      S2b = S2#xmerl_scanner{xmlbase = XMLBase},
      {Ret, T, S2b};
  scan_element(">", S = #xmerl_scanner{continuation_fun = F},
               Pos, Name, StartL, StartC, Attrs, Lang, Parents,
               NSI, NS, SpaceDefault) ->
      %% The tag is closed but no content is available yet.
      ?dbg("trailing > detected~n", []),
      F(fun(MoreBytes, S1) ->
                scan_element(">" ++ MoreBytes, S1,
                             Pos, Name, StartL, StartC, Attrs,
                             Lang, Parents, NSI, NS, SpaceDefault)
        end,
        fun(S1) -> ?fatal(unexpected_end, S1) end,
        S);
  scan_element(">" ++ T, S0 = #xmerl_scanner{event_fun = Event,
                                             hook_fun = Hook,
                                             line = L, col = C,
                                             xmlbase_cache = XMLBase,
                                             space = SpaceOption},
               Pos, Name, StartL, StartC, Attrs0, Lang, Parents,
               NSI, Namespace, SpaceDefault) ->
      ?bump_col(1),
      Attrs = lists:reverse(Attrs0),
      wfc_unique_att_spec(Attrs, S),
      %% Attrs holds #xmlAttribute{} records, so the key is the record's
      %% name field (not element 1, which is the record tag) and the hit
      %% returned by keysearch is the whole record.
      XMLSpace =
          case lists:keysearch('xml:space', #xmlAttribute.name, Attrs) of
              false ->
                  SpaceDefault;
              {value, #xmlAttribute{value = "default"}} ->
                  SpaceOption;
              {value, #xmlAttribute{value = "preserve"}} ->
                  preserve;
              _ ->
                  SpaceDefault
          end,
      E0 = processed_whole_element(S, Pos, Name, Attrs, Lang, Parents, NSI,
                                   Namespace),
      S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started,
                                                 line = StartL,
                                                 col = StartC,
                                                 data = E0}, S),
      {Content, T1, S2} = scan_content(T, S1, Name, Attrs, XMLSpace,
                                       E0#xmlElement.language,
                                       [{Name, Pos} | Parents], Namespace),
      Element = E0#xmlElement{content = Content},
      S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                                 line = L,
                                                 col = C,
                                                 data = Element}, S2),
      {Ret, S4} = Hook(Element, S3),
      %% Restore the xml:base that was in effect before this element.
      S4b = S4#xmerl_scanner{xmlbase = XMLBase},
      {Ret, T1, S4b};
  scan_element(T, S, Pos, Name, StartL, StartC, Attrs, Lang, Parents,
               NSI, NS, SpaceDefault) ->
      %% Anything else must be an attribute: Name = "value".
      {AttName, NamespaceInfo, T1, S1} = scan_name(T, S),
      {T2, S2} = scan_eq(T1, S1),
      {AttType, _DefaultDecl} = get_att_type(S2, AttName, Name),
      {AttValue, T3, S3, IsNorm} = scan_att_value(T2, S2, AttType),
      %% check_default_value(S3,DefaultDecl,AttValue),
      NewNS = check_namespace(AttName, NamespaceInfo, AttValue, NS),
      wfc_whitespace_betw_attrs(hd(T3), S3),
      ?strip4,
      %% Attribute positions are numbered from 1 in document order.
      AttrPos = case Attrs of
                    [] ->
                        1;
                    [#xmlAttribute{pos = P} | _] ->
                        P + 1
                end,
      Attr = #xmlAttribute{name = AttName,
                           pos = AttrPos,
                           language = Lang,
                           namespace = NamespaceInfo,
                           value = AttValue,
                           normalized = IsNorm},
      XMLBase = if
                    AttName == 'xml:base' ->
                        resolve_relative_uri(AttValue,
                                             S4#xmerl_scanner.xmlbase);
                    true ->
                        S4#xmerl_scanner.xmlbase
                end,
      #xmerl_scanner{event_fun = Event,
                     line = Line,
                     col = Col} = S4,
      %% The xml:base seen on entry to this clause is cached so the
      %% element-closing clauses can restore it.
      S5 = Event(#xmerl_event{event = ended,
                              line = Line,
                              col = Col,
                              data = Attr},
                 S4#xmerl_scanner{xmlbase = XMLBase,
                                  xmlbase_cache = S#xmerl_scanner.xmlbase}),
      scan_element(T4, S5, Pos, Name, StartL, StartC, [Attr | Attrs],
                   Lang, Parents, NSI, NewNS, SpaceDefault).
  %% Looks up the declared type and default declaration of attribute
  %% AttName on element ElemName. Returns {AttType, DefaultDecl}; when the
  %% element or the attribute has no declaration the attribute is treated
  %% as {'CDATA', no_value}.
  get_att_type(S = #xmerl_scanner{rules_read_fun = Lookup}, AttName, ElemName) ->
      Undeclared = {'CDATA', no_value},
      case Lookup(elem_def, ElemName, S) of
          #xmlElement{attributes = Decls} ->
              case lists:keysearch(AttName, 1, Decls) of
                  {value, {_, Type, _, DefaultDecl, _}} ->
                      {Type, DefaultDecl};
                  _ ->
                      %% an undefined attribute shall be treated as CDATA
                      Undeclared
              end;
          _ ->
              Undeclared
      end.
  %% Resolves an xml:base value against the base currently in effect.
  %% A value starting with "/" replaces the path of the current base when
  %% that base parses as a URI with a scheme, and is returned unchanged
  %% otherwise. Any other value is joined onto the current base with
  %% filename:join/2.
  resolve_relative_uri([$/ | _] = NewBase, CurrentBase) ->
      case uri:parse(CurrentBase) of
          {error, no_scheme} ->
              NewBase;
          {Scheme, Host, Port, _PathQuery} ->
              SchemeStr = atom_to_list(Scheme),
              PortStr = integer_to_list(Port),
              SchemeStr ++ Host ++ ":" ++ PortStr ++ NewBase
      end;
  resolve_relative_uri(Relative, CurrentBase) ->
      filename:join(CurrentBase, Relative).
  %% Builds the #xmlElement{} record for an element whose start tag has
  %% been scanned completely. When the scanner is namespace conformant the
  %% element name and all attribute names are expanded; otherwise they are
  %% used as they are.
  processed_whole_element(S = #xmerl_scanner{xmlbase = XMLBase},
                          Pos, Name, Attrs, Lang, Parents, NSI, Namespace) ->
      Language = check_language(Attrs, Lang),
      {ExpName, ExpAttrs} =
          case S#xmerl_scanner.namespace_conformant of
              false ->
                  {Name, Attrs};
              true ->
                  %% Attribute names are expanded only after all attributes
                  %% of the element have been scanned: a namespace
                  %% declaration is taken to be valid for the whole element
                  %% it appears in, so it also applies to attributes that
                  %% come before it in the tag.
                  %% The default namespace does not apply to attribute
                  %% names, hence the cleared default.
                  AttrNamespace = Namespace#xmlNamespace{default = []},
                  Expanded =
                      [A#xmlAttribute{
                         expanded_name = expanded_name(
                                           A#xmlAttribute.name,
                                           A#xmlAttribute.namespace,
                                           AttrNamespace, S)}
                       || A <- Attrs],
                  {expanded_name(Name, NSI, Namespace, S), Expanded}
          end,
      #xmlElement{name = Name,
                  expanded_name = ExpName,
                  nsinfo = NSI,
                  namespace = Namespace,
                  parents = Parents,
                  pos = Pos,
                  attributes = ExpAttrs,
                  language = Language,
                  xmlbase = XMLBase}.
  %% Returns the value of the first xml:lang attribute in the list, or the
  %% inherited language Lang when there is none.
  check_language([], Lang) ->
      Lang;
  check_language([Attr | Rest], Lang) ->
      case Attr of
          #xmlAttribute{name = 'xml:lang', value = Found} ->
              Found;
          _ ->
              check_language(Rest, Lang)
      end.
  %% Updates the namespace record for a namespace declaration attribute:
  %% "xmlns" sets the default namespace and "xmlns:Prefix" adds or replaces
  %% the binding for Prefix. Any other attribute leaves NS unchanged.
  %% NOTE(review): the URI is turned into an atom with list_to_atom/1 and
  %% it comes straight from the document being parsed.
  check_namespace(xmlns, _NsInfo, Value, NS) ->
      DefaultURI = list_to_atom(Value),
      NS#xmlNamespace{default = DefaultURI};
  check_namespace(_AttName, {"xmlns", Prefix}, Value,
                  NS = #xmlNamespace{nodes = Bindings}) ->
      Binding = {Prefix, list_to_atom(Value)},
      NS#xmlNamespace{nodes = keyreplaceadd(Prefix, 1, Bindings, Binding)};
  check_namespace(_AttName, _NsInfo, _Value, NS) ->
      NS.
  %% Expands a name according to its namespace info.
  %% Without a prefix the result is Name itself, or {DefaultURI, Name} when
  %% a default namespace is set. With a prefix the result is
  %% {URI, LocalAtom}; "xmlns" prefixed names are returned as
  %% {"xmlns", Local}. An undeclared prefix is a fatal error.
  expanded_name(Name, [], #xmlNamespace{default = Default}, _S) ->
      case Default of
          [] -> Name;
          URI -> {URI, Name}
      end;
  expanded_name(_Name, {"xmlns", Local}, _NS, _S) -> % CHECK THIS /JB
      {"xmlns", Local};
  expanded_name(_Name, {Prefix, Local}, #xmlNamespace{nodes = Bindings}, S) ->
      case lists:keysearch(Prefix, 1, Bindings) of
          false ->
              %% A namespace constraint of XML Names is that the prefix
              %% must be declared
              ?fatal({namespace_prefix_not_declared, Prefix}, S);
          {value, {_, URI}} ->
              {URI, list_to_atom(Local)}
      end.
  %% Replaces the first tuple whose element at Pos compares equal to Key
  %% with New; when no element matches, New is appended at the end.
  keyreplaceadd(_Key, _Pos, [], New) ->
      [New];
  keyreplaceadd(Key, Pos, [Head | Tail], New) when element(Pos, Head) == Key ->
      [New | Tail];
  keyreplaceadd(Key, Pos, [Head | Tail], New) ->
      [Head | keyreplaceadd(Key, Pos, Tail, New)].
  %%%%%%% [10] AttValue
  %% Scans a quoted attribute value and normalizes it according to
  %% XML 1.0 section 3.3.3.
  %% Returns {Value, Rest, S, IsNormalized}.
  %% A leading parameter entity reference is expanded first; that is a
  %% fatal error while scanning the prolog (internal subset).
  scan_att_value([], S = #xmerl_scanner{continuation_fun = F}, AT) ->
      ?dbg("cont()...~n", []),
      F(fun(MoreBytes, S1) -> scan_att_value(MoreBytes, S1, AT) end,
        fun(S1) -> ?fatal(unexpected_end, S1) end,
        S);
  scan_att_value("%" ++ _T, S = #xmerl_scanner{environment = prolog}, _AttType) ->
      ?fatal({error, {wfc_PEs_In_Internal_Subset}}, S);
  scan_att_value("%" ++ T, S0 = #xmerl_scanner{rules_read_fun = Read,
                                               rules_write_fun = Write,
                                               rules_delete_fun = Delete},
                 AttType) ->
      ?bump_col(1),
      {Name, T1, S1} = scan_pe_reference(T, S),
      {ExpandedRef, S2} =
          case expand_pe_reference(Name, S1, in_literal) of
              ExtId when is_tuple(ExtId) ->
                  %% {system,URI} or {public,URI}
                  %% Included in literal, just get external file.
                  {Fetched, _Sx} = fetch_not_parse(ExtId, S1),
                  {EntV, _, Sa} = scan_entity_value(Fetched, S1, no_delim,
                                                    Name, parameter),
                  %% Store the scanned value so the next
                  %% expand_pe_reference for this name is faster.
                  Delete(parameter_entity, Name, Sa),
                  Sb = Write(parameter_entity, Name, EntV, Sa),
                  {Read(parameter_entity, Name, Sb), Sb};
              Replacement ->
                  {Replacement, S1}
          end,
      {_, T2, S3} = strip(ExpandedRef ++ T1, S2),
      scan_att_value(T2, S3, AttType);
  scan_att_value([H | T], S0, 'CDATA' = AT) when H == $"; H == $' ->
      %% CDATA values are scanned without leading normalization.
      ?bump_col(1),
      scan_att_chars(T, S, H, [], [], AT, false);
  scan_att_value([H | T], S0, AttType) when H == $"; H == $' ->
      ?bump_col(1),
      {T1, S1, IsNorm} = normalize(T, S, false),
      scan_att_chars(T1, S1, H, [], [], AttType, IsNorm).
  %% Scans the characters of an attribute value up to the closing quote.
  %%   Delim  - the quote character that opened the value
  %%   Acc    - the value so far, reversed
  %%   TmpAcc - the current white space separated token, reversed
  %% Returns {Value, Rest, S, IsNormalized}.
  scan_att_chars([], S = #xmerl_scanner{continuation_fun = Continue},
                 Delim, Acc, TmpAcc, AT, IsNorm) ->
      ?dbg("cont()...~n", []),
      Resume = fun(Bytes, State) ->
                       scan_att_chars(Bytes, State, Delim, Acc, TmpAcc, AT, IsNorm)
               end,
      Abort = fun(State) -> ?fatal(unexpected_end, State) end,
      Continue(Resume, Abort, S);
  scan_att_chars([Delim | T], S0, Delim, Acc, TmpAcc, AttType, IsNorm) -> % End quote
      ?bump_col(1),
      check_att_default_val(S#xmerl_scanner.validation, TmpAcc, AttType, S),
      {FinalAcc, S2, FinalNorm} =
          case AttType == 'CDATA' of
              true -> {Acc, S, IsNorm};
              false -> normalize(Acc, S, IsNorm)
          end,
      {lists:reverse(FinalAcc), T, S2, FinalNorm};
  scan_att_chars("&" ++ T, S0, Delim, Acc, TmpAcc, AT, IsNorm) -> % Reference
      ?bump_col(1),
      {ExpRef, T1, S1} = scan_reference(T, S),
      case markup_delimeter(ExpRef) of
          true ->
              %% An escaped markup delimiter is kept as data and not
              %% rescanned.
              scan_att_chars(T1, S1, Delim, [ExpRef | Acc], [ExpRef | TmpAcc],
                             AT, IsNorm);
          _ ->
              %% Other replacement text goes back into the input.
              scan_att_chars(ExpRef ++ T1, S1, Delim, Acc, TmpAcc, AT, IsNorm)
      end;
  scan_att_chars("<" ++ _T, S0, _Delim, _Acc, _, _, _) -> % Tags not allowed here
      ?fatal(unexpected_char, S0);
  scan_att_chars([H | T], S0, Delim, Acc, _TmpAcc, 'CDATA', IsNorm)
    when ?whitespace(H) ->
      ?bump_col(1),
      scan_att_chars(T, S, Delim, [$\s | Acc], [], 'CDATA', IsNorm);
  scan_att_chars([H | T], S0, Delim, Acc, TmpAcc, AT, IsNorm)
    when ?whitespace(H) ->
      ?bump_col(1),
      {T1, S1, NewNorm} = normalize(T, S, IsNorm),
      %% White space ends the current token, so it is checked here.
      check_att_default_val(S#xmerl_scanner.validation, TmpAcc, AT, S1),
      scan_att_chars(T1, S1, Delim, [$\s | Acc], [], AT, NewNorm);
  scan_att_chars([H | T], S0, Delim, Acc, TmpAcc, AT, IsNorm) ->
      ?bump_col(1),
      valid_Char(S#xmerl_scanner.validation, AT, H, S),
      scan_att_chars(T, S, Delim, [H | Acc], [H | TmpAcc], AT, IsNorm).
  %% Returns true when the argument is one of the one-character strings
  %% "&", "\"", "'", "<", ">" or "%", and false for anything else.
  markup_delimeter(Str) ->
      lists:member(Str, ["&", "\"", "\'", "<", ">", "%"]).
  %% Checks a single attribute value token, but only when validation is
  %% switched on and the token is non-empty. The token arrives reversed and
  %% is passed on in reading order to check_att_default_val/3.
  check_att_default_val(true, RevName, Ent, S) when RevName =/= [] ->
      check_att_default_val(lists:reverse(RevName), Ent, S);
  check_att_default_val(_Validation, _RevName, _Ent, _S) ->
      ok.
  %% Validates one attribute value token against its declared type and
  %% records it through the rules write fun:
  %%   ENTITY/ENTITIES - first character must be a letter; the entity name
  %%                     is written with the value 'undeclared'.
  %%   IDREF/IDREFS    - first character must be a letter; the id is
  %%                     written with the value 'undeclared'.
  %%   ID              - must match the Name production and be unique; the
  %%                     id is written with itself as value.
  %% Any other type is accepted without checks.
  %% NOTE(review): names are turned into atoms with list_to_atom/1 and they
  %% come straight from the document being parsed.
  check_att_default_val(Name, Ent, S = #xmerl_scanner{rules_write_fun = Write})
    when Ent == 'ENTITY'; Ent == 'ENTITIES' ->
      case xmerl_lib:is_letter(hd(Name)) of
          true -> ok;
          _ -> ?fatal({illegal_first_character, Ent, Name}, S)
      end,
      SName = list_to_atom(Name),
      Write(entity, SName, undeclared, S);
  check_att_default_val(Name, IDR, S = #xmerl_scanner{rules_write_fun = Write})
    when IDR == 'IDREF'; IDR == 'IDREFS' ->
      case xmerl_lib:is_letter(hd(Name)) of
          true -> ok;
          _ -> ?fatal({illegal_first_character, IDR, Name}, S)
      end,
      SName = list_to_atom(Name),
      Write(id, SName, undeclared, S);
  check_att_default_val(Name, 'ID', S = #xmerl_scanner{rules_write_fun = Write,
                                                      rules_read_fun = Read,
                                                      rules_delete_fun = Delete}) ->
      case xmerl_lib:is_name(Name) of
          false ->
              ?fatal({'ID_names_must_be_Name_production', Name}, S);
          _ ->
              ok
      end,
      SName = if
                  is_list(Name) -> list_to_atom(Name);
                  true -> Name
              end,
      case Read(id, SName, S) of
          undeclared -> %% was referenced in IDREF/IDREFS before defined
              Delete(id, SName, S);
          SName ->
              %% The id is already stored with itself as value, so it has
              %% been defined before.
              ?fatal({values_must_be_unique, 'ID', SName}, S);
          undefined ->
              ok
      end,
      Write(id, SName, SName, S);
  check_att_default_val(_, _, _) ->
      ok.
  %% Checks one character of an attribute value. With validation on and an
  %% NMTOKEN/NMTOKENS type the check is delegated to vc_Valid_Char/3.
  %% Otherwise the character, given either bare or wrapped in a
  %% one-element list, must satisfy xmerl_lib:is_char/1.
  valid_Char(true, AT, C, S) when AT == 'NMTOKEN'; AT == 'NMTOKENS' ->
      vc_Valid_Char(AT, C, S);
  valid_Char(_Validation, _AT, CharOrList, S) ->
      Char = case CharOrList of
                 [Single] -> Single;
                 Bare -> Bare
             end,
      case xmerl_lib:is_char(Char) of
          true ->
              ok;
          false ->
              ?fatal({unexpected_char, Char}, S)
      end.
  %%%%%%% [43] content
  %% Entry point for scanning element content: the first child gets
  %% position 1, and the accumulator and pending markup delimiter start
  %% out empty.
  scan_content(T, S, Name, Attrs, Space, Lang, Parents, NS) ->
      FirstPos = 1,
      scan_content(T, S, FirstPos, Name, Attrs, Space,
                   Lang, Parents, NS, _Acc = [], _MarkupDel = []).
  %% Scans the content of element Name up to and including its end tag.
  %%   Pos       - position to give the next child node
  %%   Acc       - children collected so far, reversed
  %%   MarkupDel - set to the replacement text when the input starts with
  %%               an escaped markup delimiter that must be read as data
  %% Returns {Content, Rest, S}.
  scan_content("<", S = #xmerl_scanner{continuation_fun = Continue},
               Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, _) ->
      %% A lone "<" does not tell what kind of markup follows.
      ?dbg("trailing < detected~n", []),
      Resume = fun(Bytes, State) ->
                       scan_content("<" ++ Bytes, State, Pos, Name, Attrs,
                                    Space, Lang, Parents, NS, Acc, [])
               end,
      Abort = fun(State) -> ?fatal(unexpected_end, State) end,
      Continue(Resume, Abort, S);
  scan_content([], S = #xmerl_scanner{environment = {external, {entity, _}}},
               _Pos, _Name, _Attrs, _Space, _Lang, _Parents, _NS, Acc, _) ->
      %% End of an external entity: its content is complete.
      {lists:reverse(Acc), [], S};
  scan_content([], S = #xmerl_scanner{environment = internal_parsed_entity},
               _Pos, _Name, _Attrs, _Space, _Lang, _Parents, _NS, Acc, _) ->
      %% End of an internal parsed entity: its content is complete.
      {lists:reverse(Acc), [], S};
  scan_content([], S = #xmerl_scanner{continuation_fun = Continue},
               Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, _) ->
      ?dbg("cont()...~n", []),
      Resume = fun(Bytes, State) ->
                       scan_content(Bytes, State, Pos, Name, Attrs,
                                    Space, Lang, Parents, NS, Acc, [])
               end,
      Abort = fun(State) -> ?fatal(unexpected_end, State) end,
      Continue(Resume, Abort, S);
  scan_content("</" ++ T, S0, _Pos, Name, _Attrs, _Space, _Lang,
               _Parents, _NS, Acc, []) ->
      %% End tag: the name must be that of the element being scanned.
      ?bump_col(2),
      {ETagName, _NsInfo, T1, S1} = scan_name(T, S),
      case ETagName == Name of
          true ->
              ok;
          false ->
              ?fatal({endtag_does_not_match, {ETagName, Name}}, S1)
      end,
      ?strip2,
      case T2 of
          [$> | Rest] ->
              {lists:reverse(Acc), Rest, S2};
          _ ->
              ?fatal({error,{unexpected_end_of_STag}},S)
      end;
  scan_content([$& | _] = Str,
               #xmerl_scanner{environment = {external, {entity, EName}}} = S0,
               Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, _) ->
      %% Every entity referenced inside an external entity has to be
      %% checked for recursion, so the value is scanned and the result
      %% thrown away.
      {_Ignored, T1, S1} = scan_entity_value(Str, S0, [], EName, general),
      scan_content(T1, S1, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, []);
  scan_content("&" ++ T,
               #xmerl_scanner{environment = internal_parsed_entity} = S,
               Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, _) ->
      %% The reference is scanned but its expansion is not used here.
      {_, T1, S1} = scan_reference(T, S),
      scan_content(T1, S1, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, []);
  scan_content("&" ++ T, S0, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, []) ->
      ?bump_col(1),
      {ExpRef, T1, S1} = scan_reference(T, S),
      %% The replacement text goes back into the input. An escaped markup
      %% delimiter is also passed along so it is read as character data.
      MarkupDel = case markup_delimeter(ExpRef) of
                      true -> ExpRef;
                      _ -> []
                  end,
      scan_content(ExpRef ++ T1, S1, Pos, Name, Attrs, Space, Lang, Parents,
                   NS, Acc, MarkupDel);
  scan_content("<!--" ++ T, S, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, []) ->
      %% Comments are not added to Acc but still take up a position.
      {_, T1, S1} = scan_comment(T, S, Pos, Parents, Lang),
      scan_content(T1, S1, Pos + 1, Name, Attrs, Space, Lang, Parents, NS, Acc, []);
  scan_content("<" ++ T, S0, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, []) ->
      ?bump_col(1),
      {Markup, T1, S1} =
          scan_content_markup(T, S, Pos, Name, Attrs, Space, Lang, Parents, NS),
      Accumulate = S1#xmerl_scanner.acc_fun,
      %% The acc fun may return its own next position; the default is Pos+1.
      case Accumulate(Markup, Acc, S1) of
          {Acc2, S2} ->
              scan_content(T1, S2, Pos + 1, Name, Attrs, Space, Lang,
                           Parents, NS, Acc2, []);
          {Acc2, Pos2, S2} ->
              scan_content(T1, S2, Pos2, Name, Attrs, Space, Lang,
                           Parents, NS, Acc2, [])
      end;
  scan_content([_Skipped | T],
               S = #xmerl_scanner{environment = {external, {entity, _}}},
               Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, _) ->
      %% Guess we have to scan the content to find any internal entity
      %% references.
      scan_content(T, S, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, []);
  scan_content(T, S = #xmerl_scanner{acc_fun = Accumulate,
                                     event_fun = Event},
               Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, MarkupDel) ->
      %% Character data: emit started/ended events around the text node.
      Text0 = #xmlText{pos = Pos,
                       parents = Parents},
      Started = #xmerl_event{event = started,
                             line = S#xmerl_scanner.line,
                             data = Text0},
      S1 = #xmerl_scanner{} = Event(Started, S),
      {Data, T1, S2} = scan_char_data(T, S1, Space, MarkupDel),
      Text = Text0#xmlText{value = Data},
      Ended = #xmerl_event{event = ended,
                           line = S2#xmerl_scanner.line,
                           data = Text},
      S3 = #xmerl_scanner{} = Event(Ended, S2),
      %% The acc fun may return its own next position; the default is Pos+1.
      case Accumulate(Text, Acc, S3) of
          {Acc4, S4} ->
              scan_content(T1, S4, Pos + 1, Name, Attrs, Space, Lang,
                           Parents, NS, Acc4, []);
          {Acc4, Pos4, S4} ->
              scan_content(T1, S4, Pos4, Name, Attrs, Space, Lang,
                           Parents, NS, Acc4, [])
      end.
  %% Dispatches on what follows a "<" inside element content: a CDATA
  %% section, a processing instruction or a child element.
  scan_content_markup([], S = #xmerl_scanner{continuation_fun = Continue},
                      Pos, Name, Attrs, Space, Lang, Parents, NS) ->
      ?dbg("cont()...~n", []),
      Resume = fun(Bytes, State) ->
                       scan_content_markup(Bytes, State, Pos, Name,
                                           Attrs, Space, Lang, Parents, NS)
               end,
      Abort = fun(State) -> ?fatal(unexpected_end, State) end,
      Continue(Resume, Abort, S);
  scan_content_markup("![CDATA[" ++ T, S0, Pos, _Name, _Attrs,
                      _Space, _Lang, Parents, _NS) ->
      ?bump_col(8),
      scan_cdata(T, S, Pos, Parents);
  scan_content_markup([$? | T], S0, Pos, _Name, _Attrs, _Space, _Lang,
                      _Parents, _NS) ->
      ?bump_col(1),
      scan_pi(T, S, Pos);
  scan_content_markup(T, S, Pos, _Name, _Attrs, Space, Lang, Parents, NS) ->
      %% Anything else is taken to be the start tag of a child element.
      scan_element(T, S, Pos, Space, Lang, Parents, NS).
  %% Entry point for scanning character data; starts with an empty
  %% accumulator.
  scan_char_data(T, S, Space, MUD) ->
      Empty = [],
      scan_char_data(T, S, Space, MUD, Empty).
  %%%%%%% [14] CharData
  %% Collects character data up to the next "<" or "&" (which stay in the
  %% returned rest) or up to the end of an entity's replacement text.
  %%   MUD - when "&" or "<", a leading character equal to it is the
  %%         expansion of an escaped markup delimiter and is taken as data
  %%   Acc - characters collected so far, reversed
  %% Returns {Data, Rest, S}.
  scan_char_data([], S = #xmerl_scanner{environment = {external, {entity, _}}},
                 _Space, _MUD, Acc) ->
      {lists:reverse(Acc), [], S};
  scan_char_data([], S = #xmerl_scanner{environment = internal_parsed_entity},
                 _Space, _MUD, Acc) ->
      {lists:reverse(Acc), [], S};
  scan_char_data([], S = #xmerl_scanner{continuation_fun = F}, Space, MUD, Acc) ->
      ?dbg("cont()...~n", []),
      F(fun(MoreBytes, S1) -> scan_char_data(MoreBytes, S1, Space, MUD, Acc) end,
        fun(S1) -> ?fatal(unexpected_end, S1) end,
        S);
  scan_char_data([$& | T], S, Space, "&", Acc) ->
      %% Expansion of "&amp;": plain data, not the start of a reference.
      scan_char_data(T, S, Space, [], [$& | Acc]);
  scan_char_data(T = [$& | _], S, _Space, _MUD, Acc) ->
      {lists:reverse(Acc), T, S};
  scan_char_data("]]>" ++ _T, S, _Space, _MUD, _Acc) ->
      %% See Section 2.4: Especially:
      %% "The right angle bracket (>) MAY be represented using the string
      %% "&gt;", and MUST, for compatibility, be escaped using either "&gt;"
      %% or a character reference when it appears in the string "]]>" in
      %% content, when that string is not marking the end of a CDATA
      %% section."
      ?fatal(unexpected_cdata_end, S);
  scan_char_data([$< | T], S, Space, "<", Acc) ->
      %% Expansion of "&lt;": plain data, not the start of markup.
      scan_char_data(T, S, Space, [], [$< | Acc]);
  scan_char_data(T = [$< | _], S, _Space, _MUD, Acc) ->
      {lists:reverse(Acc), T, S};
  scan_char_data(T = [H | _], S, Space, MUD, Acc) when ?whitespace(H) ->
      {NewAcc, T1, S1} = accumulate_whitespace(T, S, Space, Acc),
      scan_char_data(T1, S1, Space, MUD, NewAcc);
  scan_char_data([H1, H2 | _T], S, _Space, _MUD, _Acc) when ?non_character(H1, H2) ->
      ?fatal({error, {not_allowed_to_use_Unicode_noncharacters}}, S);
  %% The original had a second "]]>" clause at this point. It could never
  %% be selected because the unguarded "]]>" clause above always matches
  %% first, so it has been removed.
  scan_char_data([H | T], S0, Space, MUD, Acc) ->
      ?bump_col(1),
      wfc_legal_char(H, S),
      scan_char_data(T, S, Space, MUD, [H | Acc]).
  %%%%%%% [18]-[21] CDATA
  %% Entry point for scanning a CDATA section after its "<![CDATA["
  %% opener; starts with an empty accumulator.
  scan_cdata(Str, S, Pos, Parents) ->
      Empty = [],
      scan_cdata(Str, S, Pos, Parents, Empty).
  %% Collects the characters of a CDATA section up to the closing "]]>".
  %% Returns {#xmlText{type = cdata}, Rest, S}. A character that does not
  %% satisfy xmerl_lib:is_char/1 is a fatal error.
  scan_cdata([], S = #xmerl_scanner{continuation_fun = Continue}, Pos, Parents, Acc) ->
      ?dbg("cont()...~n", []),
      Resume = fun(Bytes, State) -> scan_cdata(Bytes, State, Pos, Parents, Acc) end,
      Abort = fun(State) -> ?fatal(unexpected_end, State) end,
      Continue(Resume, Abort, S);
  scan_cdata("]]>" ++ T, S0, Pos, Parents, Acc) ->
      ?bump_col(3),
      Text = #xmlText{type = cdata,
                      value = lists:reverse(Acc),
                      parents = Parents,
                      pos = Pos},
      {Text, T, S};
  scan_cdata([Char | T], S0, Pos, Parents, Acc) ->
      case xmerl_lib:is_char(Char) of
          false ->
              ?fatal({unexpected_char,Char}, S0);
          true ->
              ?bump_col(1),
              scan_cdata(T, S, Pos, Parents, [Char | Acc])
      end.
  %%%%%%% [67] Reference
  %% Scans a reference after its leading "&": a hexadecimal or decimal
  %% character reference, or an entity reference.
  %% Returns a three tuple {Result, RestBuf, State}.
  scan_reference([], S = #xmerl_scanner{continuation_fun = Continue}) ->
      ?dbg("cont()...~n", []),
      Resume = fun(Bytes, State) -> scan_reference(Bytes, State) end,
      Abort = fun(State) -> ?fatal(unexpected_end, State) end,
      Continue(Resume, Abort, S);
  scan_reference("#x" ++ T, S0) ->
      %% [66] CharRef
      ?bump_col(1),
      %% At least one character other than ";" has to follow.
      case T of
          [First | _] when First /= $; ->
              scan_char_ref_hex(T, S, 0);
          _ ->
              ?fatal(invalid_char_ref, S)
      end;
  scan_reference("#" ++ T, S0) ->
      %% [66] CharRef
      ?bump_col(1),
      %% At least one character other than ";" has to follow.
      case T of
          [First | _] when First /= $; ->
              scan_char_ref_dec(T, S, []);
          _ ->
              ?fatal(invalid_char_ref, S)
      end;
  scan_reference(T, S) ->
      %% An exit while scanning the entity reference is reported as
      %% error_scanning_entity_ref.
      Outcome = (catch scan_entity_ref(T, S)),
      case Outcome of
          {'EXIT', _} ->
              ?fatal(error_scanning_entity_ref,S);
          _ ->
              Outcome
      end.
  %% Chapter 4.4.2: "... the replacement text of entities used to escape
  %% markup delimiters (the entities amp, lt, gt, apos, quot) is always
  %% treated as data. (The string "AT&amp;T;" expands to "AT&T;" and the
  %% remaining ampersand is not recognized as an entity-reference
  %% delimiter.)"
  %%
  %% To achieve this the *strings* "&", "<", ">", "'" and "\"" are inserted
  %% instead of the characters, and the processor ignores them when doing
  %% multiple expansions. For now this means that the character data output
  %% by the processor is (1-2 levels) deep. At some suitable point these
  %% should be flattened, so that application-level processors need not be
  %% aware of this detail.
  %%
  %% Scans an entity reference after its leading "&".
  %% Returns {Replacement, Rest, S}.
  scan_entity_ref([], S = #xmerl_scanner{continuation_fun = Continue}) ->
      ?dbg("cont()...~n", []),
      Resume = fun(Bytes, State) -> scan_entity_ref(Bytes, State) end,
      Abort = fun(State) -> ?fatal(unexpected_end, State) end,
      Continue(Resume, Abort, S);
  scan_entity_ref("amp;" ++ Rest, S0) ->
      ?bump_col(4),
      {"&", Rest, S};
  scan_entity_ref("lt;" ++ Rest, S0) ->
      ?bump_col(3),
      {"<", Rest, S};
  scan_entity_ref("gt;" ++ Rest, S0) ->
      ?bump_col(3),
      {">", Rest, S};
  scan_entity_ref("apos;" ++ Rest, S0) ->
      ?bump_col(5),
      {"'", Rest, S};
  scan_entity_ref("quot;" ++ Rest, S0) ->
      ?bump_col(5),
      {"\"", Rest, S};
  scan_entity_ref(Str, S) ->
      %% Any other name must be followed by ";" and is looked up among
      %% the declared entities.
      {Name, _NsInfo, AfterName, S1} = scan_name(Str, S),
      [$; | Rest] = AfterName,
      Replacement = expand_reference(Name, S1),
      {Replacement, Rest, S1}.
  %%%%%%% [69] PEReference
  %% Scans a parameter entity name and the ";" that ends the reference.
  %% Returns {Name, Rest, S} with the column advanced by one.
  scan_pe_reference(Str, S) ->
      {Name, _NsInfo, AfterName, S1} = scan_name(Str, S),
      [$; | Rest] = AfterName,
      Col = S1#xmerl_scanner.col,
      {Name, Rest, S1#xmerl_scanner{col = Col + 1}}.
  %% Looks up the replacement of parameter entity Name.
  %%   - an unknown entity or a stored {error, Reason} is a fatal error,
  %%   - any other tuple is returned as it is,
  %%   - anything else is replacement text: returned unchanged when
  %%     WS == in_literal, otherwise padded with one space on each side.
  %% The {error, _} clause has to come before the general tuple clause.
  expand_pe_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S, WS) ->
      case Read(parameter_entity, Name, S) of
          undefined ->
              %% Fatal whatever the validation, standalone and environment
              %% settings are.
              ?fatal({unknown_parameter_entity, Name}, S); % WFC or VC failure
          Err = {error, _Reason} ->
              ?fatal(Err, S);
          Tuple when is_tuple(Tuple) ->
              Tuple;
          Result ->
              if
                  WS == in_literal -> Result;
                  true -> " " ++ Result ++ " "
              end
      end.
  2323. % Currently unused
  2324. %
  2325. % expand_external_pe_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S) ->
  2326. % case Read(parameter_entity, Name, S) of
  2327. % undefined ->
  2328. % ?fatal({unknown_parameter_entity, Name}, S);
  2329. % Result ->
  2330. % fetch_DTD(Result,S)
  2331. % end.
  2332. %%%%%%% [68] EntityReference
  2333. expand_reference(Name, #xmerl_scanner{environment={external,{entity,_}}}) ->
  2334. atom_to_list(Name);
  2335. expand_reference(Name, #xmerl_scanner{environment=internal_parsed_entity}) ->
  2336. atom_to_list(Name);
  2337. expand_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S) ->
  2338. case Read(entity, Name, S) of
  2339. undefined ->
  2340. ?fatal({unknown_entity_ref, Name}, S);
  2341. {_,external,{error,enoent}} ->
  2342. ?fatal({error,{entity_target_not_found,{error,enoent},Name}},S);
  2343. {DefEnv,EntType,Value} ->
  2344. wfc_Entity_Declared(DefEnv,S,Name),
  2345. wfc_Internal_parsed_entity(EntType,Value,S),
  2346. Value
  2347. end.
  2348. %%%%%%% [66] CharRef
  2349. scan_char_ref_dec([], S=#xmerl_scanner{continuation_fun = F}, Acc) ->
  2350. ?dbg("cont()...~n", []),
  2351. F(fun(MoreBytes, S1) -> scan_char_ref_dec(MoreBytes, S1, Acc) end,
  2352. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2353. S);
  2354. scan_char_ref_dec([H|T], S0, Acc) when H >= $0, H =< $9 ->
  2355. ?bump_col(1),
  2356. scan_char_ref_dec(T, S, [H|Acc]);
  2357. scan_char_ref_dec(";" ++ T, S0, Acc) ->
  2358. ?bump_col(1),
  2359. Ref = list_to_integer(lists:reverse(Acc)),
  2360. wfc_legal_char(Ref,S),
  2361. {[Ref], T, S}. %% changed return value from [[Ref]]
  2362. scan_char_ref_hex([], S=#xmerl_scanner{continuation_fun = F}, Acc) ->
  2363. ?dbg("cont()...~n", []),
  2364. F(fun(MoreBytes, S1) -> scan_char_ref_hex(MoreBytes, S1, Acc) end,
  2365. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2366. S);
  2367. scan_char_ref_hex([H|T], S0, Acc) when H >= $0, H =< $9 ->
  2368. ?bump_col(1),
  2369. Dec = H - $0,
  2370. scan_char_ref_hex(T, S, (Dec bor (Acc bsl 4)));
  2371. scan_char_ref_hex([H|T], S0, Acc) when H >= $a, H =< $f ->
  2372. ?bump_col(1),
  2373. Dec = (H - $a) + 10,
  2374. scan_char_ref_hex(T, S, (Dec bor (Acc bsl 4)));
  2375. scan_char_ref_hex([H|T], S0, Acc) when H >= $A, H =< $F ->
  2376. ?bump_col(1),
  2377. Dec = (H - $A) + 10,
  2378. scan_char_ref_hex(T, S, (Dec bor (Acc bsl 4)));
  2379. scan_char_ref_hex(";" ++ T, S0, Acc) ->
  2380. ?bump_col(1),
  2381. wfc_legal_char(Acc,S),
  2382. {[Acc], T, S}. %% changed return value from [[Acc]]
  2383. %%%%%%% [25] Eq
  2384. %%% Eq ::= S? '=' S?
  2385. scan_eq(T, S) ->
  2386. ?strip1,
  2387. case T1 of
  2388. [$=|T2] ->
  2389. S2 = S1,
  2390. ?strip3,
  2391. {T3, S3};
  2392. _ ->
  2393. ?fatal(assignment_expected,S)
  2394. end.
  2395. %% scan_name/2
  2396. %%
  2397. %% We perform some checks here to make sure that the names conform to
  2398. %% the "Namespaces in XML" specification. This is an option.
  2399. %%
  2400. %% Qualified Name:
  2401. %% [6] QName ::= (Prefix ':')? LocalPart
  2402. %% [7] Prefix ::= NCName
  2403. %% [8] LocalPart ::= NCName
  2404. %% [4] NCName ::= (Letter | '_') (NCNameChar)*
  2405. %% [5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
  2406. %% | CombiningChar | Extender
  2407. %% The effect of XML Names (namespace) conformance is that:
  2408. %% - All element types and attribute names contain either zero or one colon
  2409. %% - No entity names, PI targets, or notation names contain any colons.
  2410. %%
  2411. %% scan_name_no_colons/2 will ensure that the name contains no colons iff
  2412. %% the scanner has been told to be namespace conformant. Otherwise, it will
  2413. %% behave exactly like scan_name/2.
  2414. %%
  2415. scan_name_no_colons(Str, S) ->
  2416. NSC = S#xmerl_scanner.namespace_conformant,
  2417. case NSC of
  2418. true ->
  2419. {Target, NSI, T1, S1} =
  2420. scan_name(Str,S#xmerl_scanner{namespace_conformant=no_colons}),
  2421. {Target,NSI,T1,S1#xmerl_scanner{namespace_conformant=NSC}};
  2422. false ->
  2423. scan_name(Str, S)
  2424. end.
  2425. %% [5] Name ::= (Letter | '_' | ':') (NameChar)*
  2426. scan_name([], S=#xmerl_scanner{continuation_fun = F}) ->
  2427. ?dbg("cont()...~n", []),
  2428. F(fun(MoreBytes, S1) -> scan_name(MoreBytes, S1) end,
  2429. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2430. S);
  2431. scan_name(Str = [$:|T], S0 = #xmerl_scanner{namespace_conformant = NSC}) ->
  2432. if NSC == false ->
  2433. ?bump_col(1),
  2434. scan_nmtoken(T, S, [$:], NSC);
  2435. NSC == no_colons ->
  2436. ?fatal({invalid_NCName, lists:sublist(Str, 1, 6)}, S0);
  2437. true ->
  2438. %% In order to conform with the "Namespaces in XML" spec,
  2439. %% we cannot allow names to begin with ":"
  2440. ?fatal({invalid_NCName, lists:sublist(Str, 1, 6)}, S0)
  2441. end;
  2442. scan_name([$_|T], S0 = #xmerl_scanner{namespace_conformant = NSC}) ->
  2443. ?bump_col(1),
  2444. scan_nmtoken(T, S, [$_], NSC);
  2445. scan_name("%"++_T,S=#xmerl_scanner{environment=prolog}) ->
  2446. ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
  2447. scan_name("%"++T,S0=#xmerl_scanner{environment={external,_}}) ->
  2448. %% parameter entity that expands to a name
  2449. ?bump_col(1),
  2450. {PERefName, T1, S1} = scan_pe_reference(T, S),
  2451. ExpRef = expand_pe_reference(PERefName, S1,as_PE),
  2452. {_,T2,S2} = strip(ExpRef ++ T1,S1),
  2453. scan_name(T2,S2);
  2454. scan_name([H|T], S0 = #xmerl_scanner{namespace_conformant = NSC}) ->
  2455. case xmerl_lib:is_letter(H) of
  2456. true ->
  2457. ?bump_col(1),
  2458. scan_nmtoken(T, S, [H], NSC);
  2459. false ->
  2460. ?fatal({invalid_name, lists:sublist([H|T], 1, 6)}, S0)
  2461. end;
  2462. scan_name(Str, S) ->
  2463. ?fatal({invalid_name, Str}, S).
  2464. scan_nmtoken(Str, S, Acc, NSC) ->
  2465. scan_nmtoken(Str, S, Acc, _Prefix = [], _Local = Acc, NSC,is7bAscii(hd(Acc),true)).
  2466. %% scan_nmtoken/2
  2467. %% [7] NmToken ::= (NameChar)+
  2468. scan_nmtoken([], S=#xmerl_scanner{continuation_fun = F}) ->
  2469. ?dbg("cont()...~n", []),
  2470. F(fun(MoreBytes, S1) -> scan_nmtoken(MoreBytes, S1) end,
  2471. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2472. S);
  2473. scan_nmtoken(Str = [H|T], S) ->
  2474. case xmerl_lib:is_namechar(H) of
  2475. true ->
  2476. scan_nmtoken(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1},
  2477. _Acc = [H], _Prefix = [], _Local = [H],
  2478. _NamespaceConformant = false,is7bAscii(H,true));
  2479. false ->
  2480. ?fatal({invalid_nmtoken, lists:sublist(Str, 1, 6)}, S)
  2481. end.
  2482. scan_nmtoken([], S=#xmerl_scanner{continuation_fun = F},
  2483. Acc, Prefix, Local, NSC,Is7bAscii) ->
  2484. ?dbg("cont()...~n", []),
  2485. F(fun(MoreBytes, S1) -> scan_nmtoken(MoreBytes,S1,Acc,Prefix,Local,NSC,Is7bAscii) end,
  2486. fun(S1) -> {list_to_atom(lists:reverse(Acc)),
  2487. namespace_info(Prefix, Local),[],S1} end,
  2488. S);
  2489. %% whitespace marks the end of a name
  2490. scan_nmtoken(Str = [H|_], S, Acc, Prefix, Local, _NSC,true) when ?whitespace(H) ->
  2491. %% we don't strip here because the occurrence of whitespace may be an error
  2492. %% e.g. <!ELEMENT spec (front, body, back ?)>
  2493. NmString = lists:reverse(Acc),
  2494. {list_to_atom(NmString), namespace_info(Prefix, Local), Str, S};
  2495. scan_nmtoken(Str = [$:|_], S, Acc, [], _Local, no_colons,_Is7bAscii) ->
  2496. ?fatal({invalid_NCName,
  2497. lists:sublist(lists:reverse(Acc) ++ Str, 1, 6)}, S);
  2498. scan_nmtoken([$:|T], S0, Acc, [], Local, NSC, Is7bAscii) ->
  2499. ?bump_col(1),
  2500. scan_nmtoken(T, S, [$:|Acc], lists:reverse(Local), [], NSC,Is7bAscii);
  2501. scan_nmtoken(Str = [$:|_T], S, Acc, _Prefix, _Local, _NSC = true,_Is7bAscii) ->
  2502. %% non-empty Prefix means that we've encountered a ":" already.
  2503. %% Conformity with "Namespaces in XML" requires
  2504. %% at most one colon in a name
  2505. ?fatal({invalid_NCName,
  2506. lists:sublist(lists:reverse(Acc) ++ Str, 1, 6)}, S);
  2507. %% non-namechar also marks the end of a name
  2508. scan_nmtoken(Str=[H|T], S0, Acc, Prefix, Local, NSC,Is7bAscii) ->
  2509. ?bump_col(1),
  2510. case xmerl_lib:is_namechar(H) of
  2511. true ->
  2512. scan_nmtoken(T, S, [H|Acc], Prefix, [H|Local], NSC,is7bAscii(H,Is7bAscii));
  2513. _ ->
  2514. NmStr = lists:reverse(Acc),
  2515. {list_to_atom(NmStr), namespace_info(Prefix, Local), Str, S}
  2516. end.
  2517. namespace_info([], _) ->
  2518. [];
  2519. namespace_info(Prefix, Local) ->
  2520. {Prefix, lists:reverse(Local)}.
  2521. is7bAscii(_Ch,false) ->
  2522. false;
  2523. is7bAscii(Ch,_) when Ch > 127 ->
  2524. false;
  2525. is7bAscii(_,_) ->
  2526. true.
  2527. %%%%%%% [11] SystemLiteral
  2528. scan_system_literal([], S=#xmerl_scanner{continuation_fun = F}) ->
  2529. ?dbg("cont()...~n", []),
  2530. F(fun(MoreBytes, S1) -> scan_system_literal(MoreBytes, S1) end,
  2531. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2532. S);
  2533. scan_system_literal("\"" ++ T, S) ->
  2534. scan_system_literal(T, S, $", []);
  2535. scan_system_literal("'" ++ T, S) ->
  2536. scan_system_literal(T, S, $', []).
  2537. scan_system_literal([], S=#xmerl_scanner{continuation_fun = F},
  2538. Delimiter, Acc) ->
  2539. ?dbg("cont()...~n", []),
  2540. F(fun(MoreBytes, S1) -> scan_system_literal(MoreBytes,S1,Delimiter,Acc) end,
  2541. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2542. S);
  2543. scan_system_literal([H|T], S, H, Acc) ->
  2544. {lists:reverse(Acc), T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}};
  2545. scan_system_literal([H|T], S, Delimiter, Acc) ->
  2546. scan_system_literal(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1},
  2547. Delimiter, [H|Acc]).
  2548. %%%%%%% [12] PubidLiteral
  2549. scan_pubid_literal([], S=#xmerl_scanner{continuation_fun = F}) ->
  2550. ?dbg("cont()...~n", []),
  2551. F(fun(MoreBytes, S1) -> scan_pubid_literal(MoreBytes, S1) end,
  2552. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2553. S);
  2554. scan_pubid_literal([H|T], S) when H == $"; H == $' ->
  2555. scan_pubid_literal(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}, H, []);
  2556. scan_pubid_literal([H|_T], S) ->
  2557. ?fatal({invalid_pubid_char, H}, S).
  2558. scan_pubid_literal([], S=#xmerl_scanner{continuation_fun = F},
  2559. Delimiter, Acc) ->
  2560. ?dbg("cont()...~n", []),
  2561. F(fun(MoreBytes, S1) -> scan_pubid_literal(MoreBytes,S1,Delimiter,Acc) end,
  2562. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2563. S);
  2564. scan_pubid_literal([H|T], S, H, Acc) ->
  2565. {lists:reverse(Acc), T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}};
  2566. scan_pubid_literal(Str = [H|_], S, Delimiter, Acc) when ?whitespace(H) ->
  2567. %% Before matching public identifiers, all whitespace must be normalized,
  2568. %% so we do that here
  2569. {_, T, S1} = pub_id_strip(Str, S),
  2570. scan_pubid_literal(T, S1, Delimiter, [16#20|Acc]);
  2571. scan_pubid_literal([H|T], S, Delimiter, Acc) ->
  2572. case is_pubid_char(H) of
  2573. true ->
  2574. scan_pubid_literal(
  2575. T, S#xmerl_scanner{col = S#xmerl_scanner.col+1},
  2576. Delimiter, [H|Acc]);
  2577. false ->
  2578. ?fatal({invalid_pubid_char, H}, S)
  2579. end.
  2580. %% We do not match whitespace here, even though they're allowed in public
  2581. %% identifiers. This is because we normalize this whitespace as we scan
  2582. %% (see above in scan_pubid_literal())
  2583. %%
  2584. is_pubid_char(X) when X >= $a, X =< $z -> true;
  2585. is_pubid_char(X) when X >= $A, X =< $Z -> true;
  2586. is_pubid_char(X) when X >= $0, X =< $9 -> true;
  2587. is_pubid_char(X) ->
  2588. lists:member(X, "-'()+,./:=?;!*#@$_%").
  2589. %%%%%%% [46] contentspec
  2590. scan_contentspec([], S=#xmerl_scanner{continuation_fun = F}) ->
  2591. ?dbg("cont()...~n", []),
  2592. F(fun(MoreBytes, S1) -> scan_contentspec(MoreBytes, S1) end,
  2593. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2594. S);
  2595. scan_contentspec("EMPTY" ++ T, S0) ->
  2596. ?bump_col(5),
  2597. {empty, T, S};
  2598. scan_contentspec("ANY" ++ T, S0) ->
  2599. ?bump_col(3),
  2600. {any, T, S};
  2601. scan_contentspec("%" ++ _T, S=#xmerl_scanner{environment=prolog}) ->
  2602. ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
  2603. scan_contentspec("%" ++ T, S0) ->
  2604. ?bump_col(1),
  2605. {PERefName, T1, S1} = scan_pe_reference(T, S),
  2606. ExpRef = expand_pe_reference(PERefName, S1,as_PE),
  2607. {_,T2,S2} = strip(ExpRef ++ T1,S1),
  2608. scan_contentspec(T2, S2);
  2609. scan_contentspec("(" ++ T, S0) ->
  2610. ?bump_col(1),
  2611. ?strip1,
  2612. scan_elem_content(T1, S1).
  2613. %%%%%%% [47] children
  2614. %%%%%%% [51] Mixed
  2615. scan_elem_content(T, S) ->
  2616. scan_elem_content(T, S, _Context = children, _Mode = unknown, _Acc = []).
  2617. scan_elem_content([], S=#xmerl_scanner{continuation_fun = F},
  2618. Context, Mode, Acc) ->
  2619. ?dbg("cont()...~n", []),
  2620. F(fun(MoreBytes,S1) -> scan_elem_content(MoreBytes,S1,Context,Mode,Acc) end,
  2621. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2622. S);
  2623. scan_elem_content(")" ++ T, S0, Context, Mode0, Acc0) ->
  2624. ?bump_col(1),
  2625. {Mode, Acc} = case {Mode0, Acc0} of
  2626. {unknown, [_X]} ->
  2627. {seq, Acc0};
  2628. {M, _L} when M == seq; M == choice ->
  2629. {Mode0, lists:reverse(Acc0)}
  2630. end,
  2631. {Occurrence, T1, S1} = scan_occurrence(T, S),
  2632. vc_No_Duplicate_Types(S,Context,Acc),
  2633. case {Occurrence, Context,Acc} of
  2634. {once, mixed,['#PCDATA']} -> ok; % It is not ok when there are
  2635. % more names than '#PCDATA'
  2636. % and no '*'.
  2637. {'*', mixed,_} -> ok;
  2638. {Other, mixed,_} ->
  2639. ?fatal({illegal_for_mixed_content, Other}, S1);
  2640. _ ->
  2641. ok
  2642. end,
  2643. ?strip2,
  2644. {format_elem_content({Occurrence, {Mode, Acc}}), T2, S2};
  2645. scan_elem_content("#PCDATA" ++ _T, S, not_mixed, _Mode, _Acc) ->
  2646. ?fatal({error,{extra_set_of_parenthesis}},S);
  2647. scan_elem_content("#PCDATA" ++ _T, S, _Cont, Mode, Acc)
  2648. when Mode==choice;Mode==seq;Acc/=[] ->
  2649. ?fatal({error,{invalid_format_of_mixed_content}},S);
  2650. scan_elem_content("#PCDATA" ++ T, S0, _Context, Mode, Acc) ->
  2651. ?bump_col(7),
  2652. ?strip1,
  2653. scan_elem_content(T1, S1, mixed, Mode, ['#PCDATA'|Acc]);
  2654. scan_elem_content("," ++ _T, S, _Context, choice, _Acc) ->
  2655. ?fatal({mixing_comma_and_vertical_bar_in_content_model},S);
  2656. scan_elem_content("," ++ T, S0, Context, _Mode, Acc) ->
  2657. ?bump_col(1),
  2658. ?strip1,
  2659. scan_elem_content2(T1, S1, Context, seq, Acc);
  2660. scan_elem_content("|" ++ _T, S, _Context, seq, _Acc) ->
  2661. ?fatal({mixing_comma_and_vertical_bar_in_content_model},S);
  2662. scan_elem_content("|" ++ T, S0, Context, _Mode, Acc) ->
  2663. ?bump_col(1),
  2664. ?strip1,
  2665. scan_elem_content2(T1, S1, Context, choice, Acc);
  2666. scan_elem_content(T, S, Context, Mode, Acc) ->
  2667. scan_elem_content2(T, S, Context, Mode, Acc).
  2668. scan_elem_content2("(" ++ _T, S, mixed, _Mode, _Acc) ->
  2669. ?fatal({error,
  2670. {element_names_must_not_be_parenthesized_in_mixed_content}},S);
  2671. scan_elem_content2("(" ++ T, S0, Context, Mode, Acc) ->
  2672. ?bump_col(1),
  2673. ?strip1,
  2674. {Inner, T2, S2} = scan_elem_content(T1, S1, not_mixed, unknown, []),
  2675. scan_elem_content(T2, S2, Context, Mode, [Inner|Acc]);
  2676. scan_elem_content2("%" ++ _T,S=#xmerl_scanner{environment=prolog},_Context,_Mode,_Acc) ->
  2677. ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
  2678. scan_elem_content2("%" ++ T, S0, Context, Mode, Acc) ->
  2679. ?bump_col(1),
  2680. {PERefName, T1, S1} = scan_pe_reference(T, S),
  2681. ExpRef = expand_pe_reference(PERefName, S1,as_PE),
  2682. {_,T2,S2}=strip(ExpRef++T1,S1),
  2683. scan_elem_content(T2, S2, Context, Mode, Acc);
  2684. scan_elem_content2(T, S, Context, Mode, Acc) ->
  2685. {Name, _NameStr, T1, S1} = scan_name(T, S),
  2686. {Occurrence, T2, S2} = scan_occurrence(T1, S1),
  2687. case {Occurrence, Context} of
  2688. {once, mixed} -> ok;
  2689. {Other, mixed} ->
  2690. ?fatal({illegal_for_mixed_content, Other}, S1);
  2691. _ ->
  2692. ok
  2693. end,
  2694. ?strip3,
  2695. mandatory_delimeter_wfc(T3,S3),
  2696. NewAcc = [format_elem_content({Occurrence, Name}) | Acc],
  2697. scan_elem_content(T3, S3, Context, Mode, NewAcc).
  2698. format_elem_content({once, What}) -> What;
  2699. format_elem_content(Other) -> Other.
  2700. scan_occurrence([], S=#xmerl_scanner{continuation_fun = F}) ->
  2701. ?dbg("cont()...~n", []),
  2702. F(fun(MoreBytes, S1) -> scan_occurrence(MoreBytes, S1) end,
  2703. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2704. S);
  2705. scan_occurrence([$?|T], S0) ->
  2706. ?bump_col(1),
  2707. {'?', T, S};
  2708. scan_occurrence([$+|T], S0) ->
  2709. ?bump_col(1),
  2710. {'+', T, S};
  2711. scan_occurrence([$*|T], S0) ->
  2712. ?bump_col(1),
  2713. {'*', T, S};
  2714. scan_occurrence(T, S) ->
  2715. {once, T , S}.
  2716. %%% Tests of Validity Constraints
  2717. %% first part of VC: Name Token
  2718. vc_Valid_Char(_AT,C,S) ->
  2719. case xmerl_lib:is_namechar(C) of
  2720. true ->
  2721. ok;
  2722. _ ->
  2723. ?fatal({error,{validity_constraint_Name_Token,C}},S)
  2724. end.
  2725. % Currently unused
  2726. %
  2727. % vc_test_attr_value(_,#xmerl_scanner{validation=false}) ->
  2728. % ok;
  2729. % vc_test_attr_value(Attr={_,'ID',_,_,_},S) ->
  2730. % vc_ID_Attribute_Default(Attr,S);
  2731. % vc_test_attr_value({_,{enumeration,_NameL},_,_,_},_S) ->
  2732. % ok.
  2733. vc_ID_Attribute_Default(_,#xmerl_scanner{validation=false}) ->
  2734. ok;
  2735. vc_ID_Attribute_Default({_,'ID',_,Def,_},_S)
  2736. when Def=='#IMPLIED';Def=='#REQUIRED' ->
  2737. ok;
  2738. vc_ID_Attribute_Default({_,'ID',_,Def,_},S) ->
  2739. ?fatal({error,{validity_constraint_error_ID_Attribute_Default,Def}},S);
  2740. vc_ID_Attribute_Default(_,_) ->
  2741. ok.
  2742. vc_Enumeration({_Name,{_,NameList},DefaultVal,_,_},S)
  2743. when list(DefaultVal) ->
  2744. case lists:member(list_to_atom(DefaultVal),NameList) of
  2745. true ->
  2746. ok;
  2747. _ ->
  2748. ?fatal({error,{vc_enumeration,list_to_atom(DefaultVal),NameList}},S)
  2749. end;
  2750. vc_Enumeration({_Name,{_,_NameList},_DefaultVal,_,_},_S) ->
  2751. ok.
  2752. vc_Entity_Name({_Name,'ENTITY',DefaultVal,_,_},S) when list(DefaultVal) ->
  2753. Read = S#xmerl_scanner.rules_read_fun,
  2754. case Read(entity,list_to_atom(DefaultVal),S) of
  2755. {_,external,{_,{ndata,_}}} ->
  2756. ok;
  2757. _ -> ?fatal({error,{vc_Entity_Name,list_to_atom(DefaultVal)}},S)
  2758. end;
  2759. vc_Entity_Name({_Name,'ENTITY',_,_,_},_S) ->
  2760. ok;
  2761. vc_Entity_Name({_,'ENTITIES',DefaultVal,_,_},S) when list(DefaultVal) ->
  2762. Read = S#xmerl_scanner.rules_read_fun,
  2763. NameListFun = fun([],Acc,_St,_Fun) ->
  2764. lists:reverse(Acc);
  2765. (Str,Acc,St,Fun) ->
  2766. {N,_,St2,Str2} = scan_name(Str,St),
  2767. Fun(Str2,[N|Acc],St2,Fun)
  2768. end,
  2769. NameList = NameListFun(DefaultVal,[],S,NameListFun),
  2770. VcFun =
  2771. fun(X) ->
  2772. case Read(entity,X,S) of
  2773. {_,external,{_,{ndata,_}}} ->
  2774. ok;
  2775. _ -> ?fatal({error,{vc_Entity_Name,X}},S)
  2776. end
  2777. end,
  2778. lists:foreach(VcFun,NameList);
  2779. vc_Entity_Name({_,'ENTITIES',_,_,_},_S) ->
  2780. ok.
  2781. vc_No_Duplicate_Types(#xmerl_scanner{validation=true} = S,mixed,Acc) ->
  2782. CheckDupl =
  2783. fun([H|T],F) ->
  2784. case lists:member(H,T) of
  2785. true ->
  2786. ?fatal({no_duplicate_types_allowed,H},S);
  2787. _ -> F(T,F)
  2788. end;
  2789. ([],_) -> ok
  2790. end,
  2791. CheckDupl(Acc,CheckDupl);
  2792. vc_No_Duplicate_Types(_,_,_) ->
  2793. ok.
%%% Tests of Well-Formedness Constraints
  2795. mandatory_delimeter_wfc(","++_T,_S) ->
  2796. ok;
  2797. mandatory_delimeter_wfc("|"++_T,_S) ->
  2798. ok;
  2799. mandatory_delimeter_wfc(")"++_T,_S) ->
  2800. ok;
  2801. mandatory_delimeter_wfc("%"++_T,_S) ->
  2802. %% a parameter reference is ok
  2803. ok;
  2804. mandatory_delimeter_wfc(T,S) ->
  2805. ?fatal({comma_or_vertical_bar_mandatory_between_names_in_content_model,T},S).
  2806. wfc_unique_att_spec([],_S) ->
  2807. ok;
  2808. wfc_unique_att_spec([#xmlAttribute{name=N}|Atts],S) ->
  2809. case lists:keymember(N,#xmlAttribute.name,Atts) of
  2810. true ->
  2811. ?fatal({error,{unique_att_spec_required,N}},S);
  2812. _ ->
  2813. wfc_unique_att_spec(Atts,S)
  2814. end.
  2815. wfc_legal_char([Ch],S) ->
  2816. case xmerl_lib:is_char(Ch) of
  2817. true ->
  2818. ok;
  2819. _ ->
  2820. ?fatal({error,{wfc_Legal_Character,Ch}},S)
  2821. end;
  2822. wfc_legal_char(Ch,S) ->
  2823. case xmerl_lib:is_char(Ch) of
  2824. true ->
  2825. ok;
  2826. _ ->
  2827. ?fatal({error,{wfc_Legal_Character,Ch}},S)
  2828. end.
  2829. wfc_whitespace_betw_attrs(WS,_S) when ?whitespace(WS) ->
  2830. ok;
  2831. wfc_whitespace_betw_attrs($/,_S) ->
  2832. ok;
  2833. wfc_whitespace_betw_attrs($>,_S) ->
  2834. ok;
  2835. wfc_whitespace_betw_attrs(_,S) ->
  2836. ?fatal({whitespace_required_between_attributes},S).
  2837. wfc_Entity_Declared({external,_},S=#xmerl_scanner{standalone=yes},Name) ->
  2838. ?fatal({reference_to_externally_defed_entity_standalone_doc,Name},S);
  2839. wfc_Entity_Declared({external,_},_S,_) ->
  2840. ok;
  2841. wfc_Entity_Declared(_Env,_S,_) ->
  2842. ok.
  2843. wfc_Internal_parsed_entity(internal,Value,S) ->
  2844. %% WFC test that replacement text matches production content
  2845. scan_content(Value,S#xmerl_scanner{environment=internal_parsed_entity},
  2846. _Name=[],[],S#xmerl_scanner.space,_Lang=[],_Prnt=[],
  2847. #xmlNamespace{});
  2848. wfc_Internal_parsed_entity(_,_,_) ->
  2849. ok.
  2850. vc_Element_valid(_Name,#xmerl_scanner{environment=internal_parsed_entity}) ->
  2851. ok;
  2852. vc_Element_valid(Name,S=#xmerl_scanner{rules_read_fun=Read,
  2853. validation=true}) ->
  2854. case Read(elem_def,Name,S) of
  2855. #xmlElement{elementdef=undeclared} ->
  2856. ?fatal({error,{error_missing_element_declaration_in_DTD,Name}},S); undefined ->
  2857. ?fatal({error,{error_missing_element_declaration_in_DTD,Name}},S); _ -> ok
  2858. end;
  2859. vc_Element_valid(_,_) ->
  2860. ok.
  2861. %%%%%%% [74] PEDef
  2862. scan_pe_def([], S=#xmerl_scanner{continuation_fun = F}, PEName) ->
  2863. ?dbg("cont()...~n", []),
  2864. F(fun(MoreBytes, S1) -> scan_pe_def(MoreBytes, S1, PEName) end,
  2865. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2866. S);
  2867. scan_pe_def("'" ++ T, S0, PEName) ->
  2868. ?bump_col(1),
  2869. scan_entity_value(T, S, $', PEName,parameter);
  2870. scan_pe_def("\"" ++ T, S0, PEName) ->
  2871. ?bump_col(1),
  2872. scan_entity_value(T, S, $", PEName,parameter);
  2873. scan_pe_def(Str, S, _PEName) ->
  2874. scan_external_id(Str, S).
  2875. %%%%%%% [82] NotationDecl
  2876. scan_notation_decl(T, #xmerl_scanner{rules_write_fun = Write,
  2877. rules_read_fun=Read,
  2878. rules_delete_fun=Delete} = S) ->
  2879. {Name, _NameStr, T1, S1} = scan_name_no_colons(T, S),
  2880. {_,T2,S2} = mandatory_strip(T1,S1),
  2881. {Def, T3, S3} = scan_notation_decl1(T2, S2),
  2882. ?strip4,
  2883. ">" ++ T5 = T4,
  2884. case Read(notation,Name,S) of
  2885. undeclared -> Delete(notation,Name,S4);
  2886. _ -> ok
  2887. end,
  2888. S5 = Write(notation, Name, Def, S4),
  2889. {T5, S5}.
  2890. scan_notation_decl1([], S=#xmerl_scanner{continuation_fun = F}) ->
  2891. ?dbg("cont()...~n", []),
  2892. F(fun(MoreBytes, S1) -> scan_notation_decl1(MoreBytes, S1) end,
  2893. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2894. S);
  2895. scan_notation_decl1("SYSTEM" ++ T, S0) ->
  2896. ?bump_col(6),
  2897. {_,T1,S1} = mandatory_strip(T,S),
  2898. {SL, T2, S2} = scan_system_literal(T1, S1),
  2899. {{system, SL}, T2, S2};
  2900. scan_notation_decl1("PUBLIC" ++ T, S0) ->
  2901. ?bump_col(6),
  2902. {_,T1,S1} = mandatory_strip(T,S),
  2903. {PIDL, T2, S2} = scan_pubid_literal(T1, S1),
  2904. ?strip3,
  2905. case T3 of
  2906. ">" ++ _ ->
  2907. {{public, PIDL}, T3,
  2908. S3#xmerl_scanner{col = S3#xmerl_scanner.col+1}};
  2909. _ ->
  2910. {SL, T4, S4} = scan_system_literal(T3, S3),
  2911. {{public, PIDL, SL}, T4, S4}
  2912. end.
  2913. %%%%%%% [75] ExternalID
  2914. scan_external_id([], S=#xmerl_scanner{continuation_fun = F}) ->
  2915. ?dbg("cont()...~n", []),
  2916. F(fun(MoreBytes, S1) -> scan_external_id(MoreBytes, S1) end,
  2917. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2918. S);
  2919. scan_external_id("SYSTEM" ++ T, S0) ->
  2920. ?bump_col(6),
  2921. {_,T1,S1} = mandatory_strip(T,S),
  2922. {SL, T2, S2} = scan_system_literal(T1, S1),
  2923. {{system, SL}, T2, S2};
  2924. scan_external_id("PUBLIC" ++ T, S0) ->
  2925. ?bump_col(6),
  2926. {_,T1,S1} = mandatory_strip(T,S),
  2927. {PIDL, T2, S2} = scan_pubid_literal(T1, S1),
  2928. {_,T3,S3} = mandatory_strip(T2,S2),
  2929. {SL, T4, S4} = scan_system_literal(T3, S3),
  2930. {{public, PIDL, SL}, T4, S4}.
  2931. %%%%%%% [9] EntityValue
  2932. %% Note that we have two different scan functions for EntityValue
  2933. %% They differ in that this one checks for recursive calls to the same
  2934. %% parameter entity.
  2935. scan_entity_value(Str, S, Delim, Name, Namespace) ->
  2936. scan_entity_value(Str, S, Delim, _Acc = [], Name, Namespace,[]).
  2937. scan_entity_value([], S=#xmerl_scanner{environment={external,{entity,_}}},
  2938. _Delim,Acc,_,_,[]) ->
  2939. {lists:flatten(lists:reverse(Acc)), [], S};
  2940. scan_entity_value([], S=#xmerl_scanner{environment={external,{entity,_}},
  2941. validation=true},
  2942. _Delim,_Acc,PEName,_,_) ->
  2943. {{error,{failed_VC_Proper_Declaration_PE_Nesting,1,PEName}},[],S};
  2944. scan_entity_value([],S,
  2945. no_delim,Acc,_,_,[]) ->
  2946. {lists:flatten(lists:reverse(Acc)),[],S};
  2947. scan_entity_value([],S=#xmerl_scanner{validation=true},
  2948. no_delim,_Acc,PEName,_,_PENesting) ->
  2949. {{error,{failed_VC_Proper_Declaration_PE_Nesting,2,PEName}},[],S};
  2950. scan_entity_value([], S=#xmerl_scanner{continuation_fun = F},
  2951. Delim, Acc, PEName,Namespace,PENesting) ->
  2952. ?dbg("cont()...~n", []),
  2953. F(fun(MoreBytes, S1) ->
  2954. scan_entity_value(MoreBytes,S1,
  2955. Delim,Acc,PEName,Namespace,PENesting)
  2956. end,
  2957. fun(S1) -> ?fatal(unexpected_end, S1) end,
  2958. S);
  2959. scan_entity_value([Delim|T], S=#xmerl_scanner{validation=true},
  2960. Delim,_Acc,PEName,_NS,PENesting) when length(PENesting) /= 0 ->
  2961. {{error,{failed_VC_Proper_Declaration_PE_Nesting,3,PEName}},T,S};
  2962. scan_entity_value([Delim|T], S0,
  2963. Delim, Acc, _PEName,_NS,_PENesting) ->
  2964. ?bump_col(1),
  2965. {lists:flatten(lists:reverse(Acc)), T, S};
  2966. scan_entity_value("%" ++ _T,S=#xmerl_scanner{environment=prolog},_,_,_,_,_) ->
  2967. ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
  2968. scan_entity_value("%" ++ T, S0, Delim, Acc, PEName,Namespace,PENesting) ->
  2969. ?bump_col(1),
  2970. {PERefName, T1, S1} = scan_pe_reference(T, S),
  2971. if PERefName == PEName,Namespace==parameter ->
  2972. ?fatal({illegal_recursion_in_PE, PEName}, S1);
  2973. true ->
  2974. {ExpandedRef,S2} =
  2975. case expand_pe_reference(PERefName, S1, in_literal) of
  2976. %% actually should pe ref be expanded as_PE but
  2977. %% handle whitespace explicitly in this case.
  2978. Tuple when tuple(Tuple) ->
  2979. %% {system,URI} or {public,URI}
  2980. %% Included in literal.
  2981. {ExpRef,_Sx}=fetch_not_parse(Tuple,S1),
  2982. {EntV,_,_S2} =
  2983. scan_entity_value(ExpRef, S1, no_delim,[],
  2984. PERefName,parameter,[]),
  2985. %% should do an update Write(parameter_entity)
  2986. %% so next expand_pe_reference is faster
  2987. {EntV,_S2};
  2988. ExpRef ->
  2989. {ExpRef,S1}
  2990. end,
  2991. %% single or duoble qoutes are not treated as delimeters
  2992. %% in passages "included in literal"
  2993. S3 = S2#xmerl_scanner{col=S2#xmerl_scanner.col+1},
  2994. {Acc2,_,S4} = scan_entity_value(ExpandedRef,S3,no_delim,Acc,
  2995. PEName,Namespace,[]),
  2996. {_,T2,S5} = strip(" "++T1,S4),
  2997. scan_entity_value(T2,S5,Delim,lists:reverse(Acc2),
  2998. PEName,Namespace,PENesting)
  2999. end;
  3000. scan_entity_value("&" ++ T, S0, Delim, Acc, PEName,Namespace,PENesting) ->
  3001. %% This is either a character entity or a general entity (internal
  3002. %% or external) reference. An internal general entity shall not be
  3003. %% expanded in an entity def XML1.0 section 4.5.
  3004. ?bump_col(1),
  3005. case T of
  3006. "#"++_T ->
  3007. {ExpRef, T1, S1} = scan_reference(T, S),
  3008. Tok = pe_nesting_token(ExpRef++T1,Namespace,S1#xmerl_scanner.validation),
  3009. case markup_delimeter(ExpRef) of
  3010. true ->
  3011. scan_entity_value(T1, S1, Delim, [ExpRef|Acc], PEName,
  3012. Namespace,pe_push(Tok,PENesting,S1));
  3013. _ ->
  3014. scan_entity_value(ExpRef ++ T1, S1, Delim, Acc, PEName,
  3015. Namespace,pe_push(Tok,PENesting,S1))
  3016. end;
  3017. _ -> %% General Entity is bypassed, though must check for
  3018. %% recursion: save referenced name now and check for
  3019. %% recursive reference after the hole entity definition is
  3020. %% completed.
  3021. {Name, _NamespaceInfo, T1, S1} = scan_name(T,S),
  3022. S2=save_refed_entity_name(Name,PEName,S1),
  3023. scan_entity_value(T1,S2,Delim,["&"|Acc],PEName,Namespace,PENesting)
  3024. end;
  3025. %% The following clauses is for PE Nesting VC constraint
  3026. %% Start delimeter for ConditionalSection
  3027. scan_entity_value("<!["++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
  3028. ?bump_col(3),
  3029. scan_entity_value(T,S,Delim,["<!["|Acc],PEName,NS,
  3030. pe_push("<![",PENesting,S));
  3031. %% Start delimeter for ConditionalSection (2)
  3032. scan_entity_value("["++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
  3033. ?bump_col(1),
  3034. scan_entity_value(T,S,Delim,["["|Acc],PEName,NS,
  3035. pe_push("[",PENesting,S));
  3036. %% Start delimeter for comment
  3037. scan_entity_value("<!--"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
  3038. ?bump_col(4),
  3039. scan_entity_value(T,S,Delim,["<!--"|Acc],PEName,NS,
  3040. pe_push("<!--",PENesting,S));
  3041. %% Start delimeter for ElementDecl, AttListDecl,EntityDecl,NotationDecl
  3042. scan_entity_value("<!"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
  3043. ?bump_col(2),
  3044. scan_entity_value(T,S,Delim,["<!"|Acc],PEName,NS,
  3045. pe_push("<!",PENesting,S));
  3046. %% Start delimeter for PI
  3047. scan_entity_value("<?"++T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
  3048. ?bump_col(2),
  3049. scan_entity_value(T,S,Delim,["<?"|Acc],PEName,NS,
  3050. pe_push("<?",PENesting,S));
  3051. %% Start delimeter for elements that matches the proper stop delimeter
  3052. %% for a markupdecl
  3053. scan_entity_value("</"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
  3054. ?bump_col(2),
  3055. scan_entity_value(T,S,Delim,["</"|Acc],PEName,NS,
  3056. pe_push("</",PENesting,S));
  3057. scan_entity_value("<"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
  3058. ?bump_col(1),
  3059. scan_entity_value(T,S,Delim,["<"|Acc],PEName,NS,
  3060. pe_push("<",PENesting,S));
  3061. %% Delimeter for contentspecs
  3062. scan_entity_value("("++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
  3063. ?bump_col(1),
  3064. scan_entity_value(T,S,Delim,["("|Acc],PEName,NS,
  3065. pe_push("(",PENesting,S));
  3066. %% Stop delimeter for ElementDecl, AttListDecl,EntityDecl,NotationDecl
  3067. scan_entity_value(">"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
  3068. ?bump_col(1),
  3069. scan_entity_value(T,S,Delim,[">"|Acc],PEName,NS,
  3070. pe_pop(">",PENesting,S));
  3071. %% Stop delimeter for PI
  3072. scan_entity_value("?>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
  3073. ?bump_col(2),
  3074. scan_entity_value(T,S,Delim,["?>"|Acc],PEName,NS,
  3075. pe_pop("?>",PENesting,S));
  3076. %% Stop delimeter for comment
  3077. scan_entity_value("-->"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
  3078. ?bump_col(3),
  3079. scan_entity_value(T,S,Delim,["-->"|Acc],PEName,NS,
  3080. pe_pop("-->",PENesting,S));
  3081. %% Stop delimeter for ConditionalSection
  3082. scan_entity_value("]]>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
  3083. ?bump_col(3),
  3084. scan_entity_value(T,S,Delim,["]]>"|Acc],PEName,NS,
  3085. pe_pop("]]>",PENesting,S));
  3086. %% Stop delimeter added to match a content start delimeter included
  3087. scan_entity_value("/>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
  3088. ?bump_col(2),
  3089. scan_entity_value(T,S,Delim,["/>"|Acc],PEName,NS,
  3090. pe_pop("/>",PENesting,S));
  3091. scan_entity_value(")"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
  3092. ?bump_col(1),
  3093. scan_entity_value(T,S,Delim,[")"|Acc],PEName,NS,
  3094. pe_pop(")",PENesting,S));
  3095. scan_entity_value([H|T], S0, Delim, Acc, PEName,Namespace,PENesting) ->
  3096. case xmerl_lib:is_char(H) of
  3097. true ->
  3098. ?bump_col(1),
  3099. scan_entity_value(T, S, Delim, [H|Acc], PEName,Namespace,PENesting);
  3100. false ->
  3101. ?fatal({unexpected_char,H}, S0)
  3102. end.
  3103. save_refed_entity_name(Name,PEName,S) ->
  3104. case predefined_entity(Name) of
  3105. true ->
  3106. S;
  3107. _ ->
  3108. save_refed_entity_name1(Name,PEName,S)
  3109. end.
  3110. save_refed_entity_name1(Name,PEName,
  3111. S=#xmerl_scanner{entity_references=ERefs}) ->
  3112. case lists:keysearch(PEName,1,ERefs) of
  3113. {value,{_,Refs}} ->
  3114. NewRefs =
  3115. case lists:member(Name,Refs) of
  3116. true ->Refs;
  3117. _ -> [Name|Refs]
  3118. end,
  3119. S#xmerl_scanner{entity_references=lists:keyreplace(PEName,1,ERefs,
  3120. {PEName,NewRefs})
  3121. };
  3122. _ ->
  3123. S#xmerl_scanner{entity_references=[{PEName,[Name]}|ERefs]}
  3124. end.
  3125. pe_push(Tok,Stack,_S) when Tok=="<!";Tok=="<?";Tok=="<!--";Tok=="<![";
  3126. Tok=="[";Tok=="<";Tok=="</";Tok=="(" ->
  3127. [Tok|Stack];
  3128. pe_push(Tok,Stack,#xmerl_scanner{validation=true})
  3129. when Tok==")";Tok==">";Tok=="?>";Tok=="]]>";Tok=="-->";Tok=="/>"->
  3130. [Tok|Stack];
  3131. pe_push(_,Stack,_S) ->
  3132. Stack.
  3133. pe_pop(">",["<!"|Rest],_S) -> Rest;
  3134. pe_pop("?>",["<?"|Rest],_S) -> Rest;
  3135. pe_pop("-->",["<!--"|Rest],_S) -> Rest;
  3136. pe_pop("]]>",["[","<!["|Rest],_S) -> Rest;
  3137. pe_pop("/>",["<"|Rest],_S) -> Rest;
  3138. pe_pop(">",["<"|Rest],_S) -> Rest;
  3139. pe_pop(">",["</"|Rest],_S) -> Rest;
  3140. pe_pop(")",["("|Rest],_S) -> Rest;
  3141. pe_pop(Token,_Stack,S=#xmerl_scanner{validation=true}) ->
  3142. ?fatal({error,{failed_VC_Proper_Declaration_PE_Nesting,5,Token}},S);
  3143. pe_pop(_,Rest,_) ->
  3144. Rest.
  3145. pe_nesting_token("<!"++_T,parameter,true) -> "<!";
  3146. pe_nesting_token("<?"++_T,parameter,true) -> "<?";
  3147. pe_nesting_token("<!--"++_T,parameter,true) -> "<!--";
  3148. pe_nesting_token("<!["++_T,parameter,true) -> "<![";
  3149. pe_nesting_token("["++_T,parameter,true) -> "[";
  3150. pe_nesting_token("("++_T,parameter,true) -> "(";
  3151. pe_nesting_token(">"++_T,parameter,true) -> ">";
  3152. pe_nesting_token("?>"++_T,parameter,true) -> "?>";
  3153. pe_nesting_token("-->"++_T,parameter,true) -> "-->";
  3154. pe_nesting_token("]]>"++_T,parameter,true) -> "]]>";
  3155. pe_nesting_token(")"++_T,parameter,true) -> ")";
  3156. pe_nesting_token("/>"++_T,parameter,true) -> "/>";
  3157. pe_nesting_token(_,_,_) -> false.
  3158. predefined_entity(amp) -> true;
  3159. predefined_entity(lt) -> true;
  3160. predefined_entity(gt) -> true;
  3161. predefined_entity(apos) -> true;
  3162. predefined_entity(quot) -> true;
  3163. predefined_entity(_) -> false.
  3164. check_entity_recursion(EName,
  3165. S=#xmerl_scanner{entity_references=EntityRefList}) ->
  3166. Set = sofs:family(EntityRefList),
  3167. case catch sofs:family_to_digraph(Set, [acyclic]) of
  3168. {'EXIT',{cyclic,_}} ->
  3169. ?fatal({illegal_recursion_in_Entity, EName}, S);
  3170. {graph,_,_,_,_} ->
  3171. ok
  3172. end.
  3173. %%%%%%% [15] Comment
  3174. scan_comment(Str, S) ->
  3175. scan_comment(Str, S, _Pos = undefined, _Parents = [], _Lang = []).
  3176. scan_comment(Str,S=#xmerl_scanner{col=C,event_fun=Event}, Pos, Parents, Lang) ->
  3177. Comment = #xmlComment{pos = Pos,
  3178. parents = Parents,
  3179. language = Lang,
  3180. value = undefined},
  3181. S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started,
  3182. line = S#xmerl_scanner.line,
  3183. col = C,
  3184. pos = Pos,
  3185. data = Comment}, S),
  3186. scan_comment1(Str, S1, Pos, Comment, _Acc = []).
  3187. scan_comment1([], S=#xmerl_scanner{continuation_fun = F},
  3188. Pos, Comment, Acc) ->
  3189. ?dbg("cont()...~n", []),
  3190. F(fun(MoreBytes, S1) -> scan_comment1(MoreBytes, S1, Pos, Comment, Acc) end,
  3191. fun(S1) -> ?fatal(unexpected_end, S1) end,
  3192. S);
  3193. scan_comment1("--" ++ T, S0 = #xmerl_scanner{col = C,
  3194. event_fun = Event,
  3195. hook_fun = Hook},
  3196. _Pos, Comment, Acc) ->
  3197. case hd(T) of
  3198. $> ->
  3199. ?bump_col(2),
  3200. Comment1 = Comment#xmlComment{value = lists:reverse(Acc)},
  3201. S1=#xmerl_scanner{}=Event(#xmerl_event{event = ended,
  3202. line=S#xmerl_scanner.line,
  3203. col = C,
  3204. data = Comment1}, S),
  3205. {Ret, S2} = Hook(Comment1, S1),
  3206. T2 = tl(T),
  3207. ?strip3,
  3208. {Ret, T3, S3};
  3209. Char ->
  3210. ?fatal({invalid_comment,"--"++[Char]}, S0)
  3211. end;
  3212. scan_comment1("\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
  3213. scan_comment1(T, S#xmerl_scanner{line=L+1,col=1},Pos, Cmt, "\n" ++ Acc);
  3214. scan_comment1("\r\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
  3215. %% CR followed by LF is read as a single LF
  3216. scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc);
  3217. scan_comment1("\r" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
  3218. %% CR not followed by LF is read as a LF
  3219. scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc);
  3220. scan_comment1([H|T], S=#xmerl_scanner{col = C}, Pos, Cmt, Acc) ->
  3221. wfc_legal_char(H,S),
  3222. scan_comment1(T, S#xmerl_scanner{col=C+1}, Pos, Cmt, [H|Acc]).
  3223. %%%%%%%
  3224. scan_markup_completion_gt([$>|_R]=T,S) ->
  3225. {T,S};
  3226. scan_markup_completion_gt([$%|T],S0) ->
  3227. ?bump_col(1),
  3228. {Name,T1,S1} = scan_pe_reference(T,S),
  3229. ExpandedRef = expand_pe_reference(Name,S1,as_PE),
  3230. {_,T2,S2} = strip(ExpandedRef++T1,S1),
  3231. scan_markup_completion_gt(T2,S2);
  3232. scan_markup_completion_gt(T,S) ->
  3233. ?fatal({error,{malformed_syntax_entity_completion,T}},S).
  3234. strip(Str,S) ->
  3235. strip(Str,S,all).
  3236. strip([], S=#xmerl_scanner{continuation_fun = F},_) ->
  3237. ?dbg("cont()... stripping whitespace~n", []),
  3238. F(fun(MoreBytes, S1) -> strip(MoreBytes, S1) end,
  3239. fun(S1) -> {[], [], S1} end,
  3240. S);
  3241. strip("\s" ++ T, S=#xmerl_scanner{col = C},Lim) ->
  3242. strip(T, S#xmerl_scanner{col = C+1},Lim);
  3243. strip("\t" ++ _T, S ,no_tab) ->
  3244. ?fatal({error,{no_tab_allowed}},S);
  3245. strip("\t" ++ T, S=#xmerl_scanner{col = C},Lim) ->
  3246. strip(T, S#xmerl_scanner{col = expand_tab(C)},Lim);
  3247. strip("\n" ++ T, S=#xmerl_scanner{line = L},Lim) ->
  3248. strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
  3249. strip("\r\n" ++ T, S=#xmerl_scanner{line = L},Lim) ->
  3250. %% CR followed by LF is read as a single LF
  3251. strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
  3252. strip("\r" ++ T, S=#xmerl_scanner{line = L},Lim) ->
  3253. %% CR not followed by LF is read as a LF
  3254. strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
  3255. strip(Str, S,_Lim) ->
  3256. {[], Str, S}.
  3257. %% demands a whitespace, though a parameter entity is ok, it will
  3258. %% expand with a whitespace on each side.
  3259. mandatory_strip([],S) ->
  3260. ?fatal({error,{whitespace_was_expected}},S);
  3261. mandatory_strip(T,S) when ?whitespace(hd(T)) ->
  3262. strip(T,S,all);
  3263. mandatory_strip([$%|T],S) when ?whitespace(hd(T)) -> %this is not a PERefence, but an PEDeclaration
  3264. ?fatal({error,{whitespace_was_expected}},S);
  3265. mandatory_strip([$%|_T]=T,S) ->
  3266. {[],T,S};
  3267. mandatory_strip(_T,S) ->
  3268. ?fatal({error,{whitespace_was_expected}},S).
  3269. %% strip but don't accept tab
  3270. pub_id_strip(Str, S) ->
  3271. strip(Str,S,no_tab).
  3272. normalize("&"++T,S,IsNorm) ->
  3273. case scan_reference(T, S) of
  3274. {ExpRef, T1, S1} when ?whitespace(hd(ExpRef)) ->
  3275. normalize(ExpRef++T1,S1,IsNorm);
  3276. _ ->
  3277. {"&"++T,S,IsNorm}
  3278. end;
  3279. normalize(T,S,IsNorm) ->
  3280. case strip(T,S) of
  3281. {_,T,S} ->
  3282. {T,S,IsNorm};
  3283. {_,T1,S1} ->
  3284. {T1,S1,true}
  3285. end.
  3286. %%% @spec accumulate_whitespace(T::string(),S::global_state(),
  3287. %%% atom(),Acc::string()) -> {Acc, T1, S1}
  3288. %%%
  3289. %%% @doc Function to accumulate and normalize whitespace.
  3290. accumulate_whitespace(T, S, preserve, Acc) ->
  3291. accumulate_whitespace(T, S, Acc);
  3292. accumulate_whitespace(T, S, normalize, Acc) ->
  3293. {_WsAcc, T1, S1} = accumulate_whitespace(T, S, []),
  3294. {[$\s|Acc], T1, S1}.
  3295. accumulate_whitespace([], S=#xmerl_scanner{continuation_fun = F}, Acc) ->
  3296. ?dbg("cont()...~n", []),
  3297. F(fun(MoreBytes, S1) -> accumulate_whitespace(MoreBytes, S1, Acc) end,
  3298. fun(S1) -> {Acc, [], S1} end,
  3299. S);
  3300. accumulate_whitespace("\s" ++ T, S=#xmerl_scanner{col = C}, Acc) ->
  3301. accumulate_whitespace(T, S#xmerl_scanner{col = C+1}, [$\s|Acc]);
  3302. accumulate_whitespace("\t" ++ T, S=#xmerl_scanner{col = C}, Acc) ->
  3303. accumulate_whitespace(T, S#xmerl_scanner{col = expand_tab(C)}, [$\t|Acc]);
  3304. accumulate_whitespace("\n" ++ T, S=#xmerl_scanner{line = L}, Acc) ->
  3305. accumulate_whitespace(T, S#xmerl_scanner{line = L+1, col = 1}, [$\n|Acc]);
  3306. accumulate_whitespace("\r\n" ++ T, S=#xmerl_scanner{line = L}, Acc) ->
  3307. %% CR followed by LF is read as a single LF
  3308. accumulate_whitespace(T, S#xmerl_scanner{line = L+1, col=1}, [$\n|Acc]);
  3309. accumulate_whitespace("\r" ++ T, S=#xmerl_scanner{line = L}, Acc) ->
  3310. %% CR not followed by LF is read as a LF
  3311. accumulate_whitespace(T, S#xmerl_scanner{line = L+1, col = 1}, [$\n|Acc]);
  3312. accumulate_whitespace(Str, S, Acc) ->
  3313. {Acc, Str, S}.
  3314. expand_tab(Col) ->
  3315. Rem = (Col-1) rem 8,
  3316. _NewCol = Col + 8 - Rem.
  3317. %%% Helper functions
  3318. fatal(Reason, S) ->
  3319. exit({fatal, {Reason, S#xmerl_scanner.line, S#xmerl_scanner.col}}).
  3320. %% BUG when we are many <!ATTLIST ..> balise none attributes has save in rules
  3321. rules_write(Context, Name, Value, #xmerl_scanner{rules = T} = S) ->
  3322. case ets:lookup(T, {Context, Name}) of
  3323. [] ->
  3324. ets:insert(T, {{Context, Name}, Value});
  3325. _ ->
  3326. ok
  3327. end,
  3328. S.
  3329. rules_read(Context, Name, #xmerl_scanner{rules = T}) ->
  3330. case ets:lookup(T, {Context, Name}) of
  3331. [] ->
  3332. undefined;
  3333. [{_, V}] ->
  3334. V
  3335. end.
  3336. rules_delete(Context,Name,#xmerl_scanner{rules = T}) ->
  3337. ets:delete(T,{Context,Name}).
  3338. % decode_UTF8(Str) ->
  3339. % decode_UTF8(Str,[]).
  3340. % decode_UTF8([],Acc) ->
  3341. % lists:reverse(Acc);
  3342. % decode_UTF8([H|T],Acc) when H =< 127 ->
  3343. % decode_UTF8(T,[H|Acc]);
  3344. % decode_UTF8([H1,H2|T],Acc) when H1 =< 16#DF->
  3345. % Ch = char_UTF8_2b(H1,H2),
  3346. % decode_UTF8(T,[Ch|Acc]);
  3347. % decode_UTF8([H1,H2,H3|T],Acc) when H1 =< 16#EF ->
  3348. % Ch = char_UTF8_3b(H1,H2,H3),
  3349. % decode_UTF8(T,[Ch|Acc]);
  3350. % decode_UTF8([H1,H2,H3,H4|T],Acc) when H1 =< 16#F7 ->
  3351. % Ch = char_UTF8_4b(H1,H2,H3,H4),
  3352. % decode_UTF8(T,[Ch|Acc]);
  3353. % decode_UTF8([H1,H2,H3,H4,H5|T],Acc) when H1 =< 16#FB ->
  3354. % Ch = char_UTF8_5b(H1,H2,H3,H4,H5),
  3355. % decode_UTF8(T,[Ch|Acc]);
  3356. % decode_UTF8([H1,H2,H3,H4,H5,H6|T],Acc) ->
  3357. % Ch = char_UTF8_6b(H1,H2,H3,H4,H5,H6),
  3358. % decode_UTF8(T,[Ch|Acc]).
  3359. % char_UTF8_2b(H1,H2) ->
  3360. % Msb = (H1 band 16#1F) bsl 6,
  3361. % Lsb = H2 band 16#3F,
  3362. % Msb + Lsb.
  3363. % char_UTF8_3b(H1,H2,H3) ->
  3364. % (H3 band 16#3F) + ((H2 band 16#3F) bsl 6) + ((H1 band 16#0F) bsl 12).
  3365. % char_UTF8_4b(H1,H2,H3,H4) ->
  3366. % (H4 band 16#3F) + ((H3 band 16#3F) bsl 6) + ((H2 band 16#3F) bsl 12) +
  3367. % ((H1 band 16#07) bsl 18).
  3368. % char_UTF8_5b(H1,H2,H3,H4,H5) ->
  3369. % (H5 band 16#3F) + ((H4 band 16#3F) bsl 6) + ((H3 band 16#3F) bsl 12) +
  3370. % ((H2 band 16#3F) bsl 18) + ((H1 band 16#03) band 24).
  3371. % char_UTF8_6b(H1,H2,H3,H4,H5,H6) ->
  3372. % (H6 band 16#3F) + ((H5 band 16#3F) bsl 6) + ((H4 band 16#3F) bsl 12) +
  3373. % ((H3 band 16#3F) bsl 18) + ((H2 band 16#3F) bsl 24) +
  3374. % ((H1 band 16#01) bsl 30).
  3375. % utf8_char([H|T],S0=#xmerl_scanner{encoding="UTF-16"}) ->
  3376. % ?bump_col(1),
  3377. % {H,T,S};
  3378. % utf8_char([H|T],S0) when H =< 127 ->
  3379. % ?bump_col(1),
  3380. % {H,T,S};
  3381. % utf8_char([H1,H2|T],S0) when H1 =< 16#DF->
  3382. % Ch = char_UTF8_2b(H1,H2),
  3383. % ?bump_col(2),
  3384. % {Ch,T,S};
  3385. % utf8_char([H1,H2,H3|T],S0) when H1 =< 16#EF ->
  3386. % Ch = char_UTF8_3b(H1,H2,H3),
  3387. % ?bump_col(3),
  3388. % {Ch,T,S};
  3389. % utf8_char([H1,H2,H3,H4|T],S0) when H1 =< 16#F7 ->
  3390. % Ch = char_UTF8_4b(H1,H2,H3,H4),
  3391. % ?bump_col(4),
  3392. % {Ch,T,S};
  3393. % utf8_char([H1,H2,H3,H4,H5|T],S0) when H1 =< 16#FB ->
  3394. % Ch = char_UTF8_5b(H1,H2,H3,H4,H5),
  3395. % ?bump_col(5),
  3396. % {Ch,T,S};
  3397. % utf8_char([H1,H2,H3,H4,H5,H6|T],S0) ->
  3398. % Ch = char_UTF8_6b(H1,H2,H3,H4,H5,H6),
  3399. % ?bump_col(6),
  3400. % {Ch,T,S}.