/source/otp_src_R14B02/lib/edoc/src/edoc_scanner.erl
Erlang | 366 lines | 252 code | 40 blank | 74 comment | 1 complexity | 3289d4ef0ef93d3552c73a71e960aca5 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause
1%% ``The contents of this file are subject to the Erlang Public License,
2%% Version 1.1, (the "License"); you may not use this file except in
3%% compliance with the License. You should have received a copy of the
4%% Erlang Public License along with this software. If not, it can be
5%% retrieved via the world wide web at http://www.erlang.org/.
6%%
7%% Software distributed under the License is distributed on an "AS IS"
8%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
9%% the License for the specific language governing rights and
10%% limitations under the License.
11%%
12%% The Initial Developer of the Original Code is Ericsson Utvecklings
13%% AB. Portions created by Ericsson are Copyright 1999, Ericsson
14%% Utvecklings AB. All Rights Reserved.''
15%%
16%% $Id: $
17%%
18%% @private
19%% @copyright Richard Carlsson 2001-2003. Portions created by Ericsson
20%% are Copyright 1999, Ericsson Utvecklings AB. All Rights Reserved.
21%% @author Richard Carlsson <richardc@it.uu.se>
22%% @see edoc
23%% @end
24
25%% @doc Tokeniser for EDoc. Based on the Erlang standard library module
26%% {@link //stdlib/erl_scan}.
27
28-module(edoc_scanner).
29
30%% NOTE: the interface to this module is ancient and should be updated.
31%% Please do not regard these exported functions as stable. Their
32%% behaviour is described in the documentation of the module `erl_scan'.
33%%
34%% Since there are no `full stop' tokens in EDoc specifications, the
35%% `tokens' function *always* returns `{more, Continuation}' unless an
36%% error occurs.
37
38-export([string/1,string/2,format_error/1]).
39
40-import(lists, [reverse/1]).
41
%% string(Chars) -> {ok, Tokens, 1} | {error, ErrorInfo, 1}
%%  Tokenise Chars, numbering positions from 1.
string(Chars) ->
    string(Chars, 1).

%% string(Chars, StartPos) ->
%%      {ok, Tokens, StartPos} | {error, ErrorInfo, StartPos}
%%  Tokenise Chars, numbering positions from StartPos. Note that the
%%  third element of the result is the *start* position that was passed
%%  in, not the position at which scanning stopped.
string(Chars, StartPos) ->
    string_result(scan(Chars, StartPos), StartPos).

%% Append the start position to the two-element result of scan/2.
string_result({ok, Tokens}, StartPos) ->
    {ok, Tokens, StartPos};
string_result({error, ErrorInfo}, StartPos) ->
    {error, ErrorInfo, StartPos}.
49
%% format_error(What) -> Chars
%%  Return a (possibly deep) character list describing the `What' part
%%  of an error tuple `{Pos, edoc_scanner, What}'. Descriptors that are
%%  not recognised are printed as plain Erlang terms.

format_error(What) ->
    case What of
        {string, Quote, Head} ->
            Quoted = io_lib:write_string(Head, Quote),
            ["unterminated string starting with " ++ Quoted];
        {illegal, Type} ->
            io_lib:format("illegal ~w", [Type]);
        char ->
            "unterminated character";
        scan ->
            "premature end";
        {base, Base} ->
            io_lib:format("illegal base '~w'", [Base]);
        float ->
            "bad float";
        Other ->
            io_lib:write(Other)
    end.
62
%% reserved(Atom) -> boolean()
%%  True for words that are scanned as keyword tokens `{Word, Pos}'
%%  instead of `{atom, Pos, Word}'. Currently only `where'.
reserved(Word) ->
    Word =:= 'where'.
66
%% scan(CharList, StartPos)
%%  Tokenise a list of characters, counting positions from StartPos.
%%
%%  Tokens are pushed onto a stack as they are recognised (so the list
%%  is built in reverse without any appending) and the stack is reversed
%%  once the whole input has been consumed.
%%
%%  Returns:
%%	{ok,[Tok]}
%%	{error,{ErrorPos,edoc_scanner,What}}

scan(Chars, StartPos) ->
    EmptyStack = [],
    scan1(Chars, EmptyStack, StartPos).
80
%% scan1(Characters, TokenStack, Position)
%%  Scan a list of characters into tokens.
%%
%%  Tokens are pushed onto TokenStack (most recent first); the stack is
%%  reversed when the input is exhausted. Position is incremented for
%%  every newline consumed by this function and is recorded in each
%%  token.
%%
%%  Returns {ok, Tokens} | {error, {ErrorPos, edoc_scanner, What}}.
%%
%%  The clause order matters: signed numbers must be tried before the
%%  `->' operator, and multi-character operators before the generic
%%  single-character punctuation clause.

scan1([$\n|Cs], Toks, Pos) ->                           % Newline
    scan1(Cs, Toks, Pos+1);
scan1([C|Cs], Toks, Pos) when C >= 0, C =< $\s ->       % Skip blanks
    scan1(Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $a, C =< $z ->       % Unquoted atom
    scan_atom(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $0, C =< $9 ->       % Numbers
    scan_number(C, Cs, Toks, Pos);
scan1([$-,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->    % Signed numbers
    scan_signed_number($-, C, Cs, Toks, Pos);
scan1([$+,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->    % Signed numbers
    scan_signed_number($+, C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $A, C =< $Z ->       % Variables
    scan_variable(C, Cs, Toks, Pos);
scan1([$_|Cs], Toks, Pos) ->                            % Variables
    scan_variable($_, Cs, Toks, Pos);
scan1([$$|Cs], Toks, Pos) ->                            % Character constant
    %% On success scan_char_const/3 goes on to scan the rest of the
    %% input, so what comes back is either the final result of the whole
    %% scan -- which may be a complete scan error reported for some
    %% later token -- or a bare {error, Reason} describing what is wrong
    %% with this character constant.
    case scan_char_const(Cs, Toks, Pos) of
        {ok, _Toks} = Ok ->
            Ok;
        {error, Reason} when is_atom(Reason) ->
            scan_error(scan_char_error(Reason, char), Pos);
        {error, _ErrorInfo} = Error ->
            %% Already a complete error from later in the input.
            Error
    end;
scan1([$'|Cs0], Toks, Pos) ->                           % Quoted atom
    case scan_string(Cs0, $', Pos) of
        {error, premature_end} ->
            scan_error({string,$',Cs0}, Pos);
        {error, Reason} ->
            scan_error(scan_char_error(Reason, atom), Pos);
        {S,Cs1,Pos1} ->
            %% list_to_atom/1 can fail, e.g. when the name is too long.
            try list_to_atom(S) of
                A ->
                    scan1(Cs1, [{atom,Pos,A}|Toks], Pos1)
            catch
                error:_ ->
                    scan_error({illegal,atom}, Pos)
            end
    end;
scan1([$"|Cs0], Toks, Pos) ->                           % String
    case scan_string(Cs0, $", Pos) of
        {error, premature_end} ->
            scan_error({string,$",Cs0}, Pos);
        {error, Reason} ->
            scan_error(scan_char_error(Reason, string), Pos);
        {S,Cs1,Pos1} ->
            case Toks of
                [{string,Pos0,S0}|Toks1] ->
                    %% Adjacent string literals are merged into one
                    %% token that keeps the position of the first.
                    scan1(Cs1, [{string,Pos0,S0 ++ S}|Toks1], Pos1);
                _ ->
                    scan1(Cs1, [{string,Pos,S}|Toks], Pos1)
            end
    end;
%% Punctuation characters and operators, first recognise multiples.
scan1([$<,$<|Cs], Toks, Pos) ->
    scan1(Cs, [{'<<',Pos}|Toks], Pos);
scan1([$>,$>|Cs], Toks, Pos) ->
    scan1(Cs, [{'>>',Pos}|Toks], Pos);
scan1([$-,$>|Cs], Toks, Pos) ->
    scan1(Cs, [{'->',Pos}|Toks], Pos);
scan1([$:,$:|Cs], Toks, Pos) ->
    scan1(Cs, [{'::',Pos}|Toks], Pos);
scan1([$/,$/|Cs], Toks, Pos) ->
    scan1(Cs, [{'//',Pos}|Toks], Pos);
scan1([$.,$.,$.|Cs], Toks, Pos) ->
    scan1(Cs, [{'...',Pos}|Toks], Pos);
scan1([$.,$.|Cs], Toks, Pos) ->
    scan1(Cs, [{'..',Pos}|Toks], Pos);
scan1([C|Cs], Toks, Pos) ->                             % Punctuation character
    P = list_to_atom([C]),
    scan1(Cs, [{P,Pos}|Toks], Pos);
scan1([], Toks0, _Pos) ->
    Toks = lists:reverse(Toks0),
    {ok,Toks}.

%% scan_char_error(Reason, Kind) -> What
%%  Translate the bare reason returned by the character-level scanning
%%  functions into the `What' part of a scan error. Kind is the sort of
%%  token being scanned (char, atom or string). Bad escape sequences are
%%  reported in the same way as other illegal characters.
scan_char_error(truncated_char, _Kind) ->
    char;
scan_char_error(illegal_character, Kind) ->
    {illegal, Kind};
scan_char_error(illegal_control_character, Kind) ->
    {illegal, Kind};
scan_char_error(undefined_escape_sequence, Kind) ->
    {illegal, Kind}.
162
%% scan_variable(FirstChar, CharList, TokenStack, Pos)
%%  Collect the name characters that follow FirstChar and push a
%%  variable token. A lone `_' is not accepted as a variable token; it
%%  is pushed as {an_var,Pos,'_'} instead.
scan_variable(First, Cs0, Toks, Pos) ->
    {RevTail, Cs} = scan_name(Cs0, []),
    Name = [First | lists:reverse(RevTail)],
    push_variable(Name, Cs, Toks, Pos).

%% Push the token for an already collected variable name and carry on
%% scanning the remaining characters.
push_variable("_", Cs, Toks, Pos) ->
    scan1(Cs, [{an_var,Pos,'_'} | Toks], Pos);
push_variable(Name, Cs, Toks, Pos) ->
    try list_to_atom(Name) of
        Var ->
            scan1(Cs, [{var,Pos,Var} | Toks], Pos)
    catch
        error:_ ->
            scan_error({illegal,variable}, Pos)
    end.
178
%% scan_atom(FirstChar, CharList, TokenStack, Pos)
%%  Collect the name characters that follow FirstChar and push either a
%%  keyword token {Word,Pos}, when the word is reserved, or an ordinary
%%  {atom,Pos,Word} token.
scan_atom(First, Cs0, Toks, Pos) ->
    {RevTail, Cs} = scan_name(Cs0, []),
    Name = [First | lists:reverse(RevTail)],
    try list_to_atom(Name) of
        Word ->
            Tok = case reserved(Word) of
                      true  -> {Word, Pos};
                      false -> {atom, Pos, Word}
                  end,
            scan1(Cs, [Tok | Toks], Pos)
    catch
        error:_ ->
            scan_error({illegal,token}, Pos)
    end.
193
%% scan_name(CharList, Stack) -> {Stack1, RestChars}
%%  Take the longest prefix of CharList consisting of name characters
%%  and push it, character by character, onto Stack (so the name ends
%%  up reversed on top of whatever Stack already held).

scan_name(Chars, Stack) ->
    {Name, Rest} = lists:splitwith(fun name_char/1, Chars),
    {lists:reverse(Name, Stack), Rest}.
205
%% name_char(Char) -> boolean()
%%  True for characters that may occur inside an atom or variable name:
%%  ASCII letters and digits, `_', `@', and the Latin-1 letters
%%  16#C0..16#FF except the multiplication (16#D7) and division (16#F7)
%%  signs.
name_char($_) -> true;
name_char($@) -> true;
name_char(C) when $a =< C, C =< $z;
                  $A =< C, C =< $Z;
                  $0 =< C, C =< $9 ->
    true;
name_char(C) when 16#C0 =< C, C =< 16#FF, C /= 16#D7, C /= 16#F7 ->
    true;
name_char(_) ->
    false.
214
%% scan_string(CharList, QuoteChar, Pos) ->
%%	{StringChars,RestChars,NewPos} | {error,Reason}
%%  Read characters up to the closing QuoteChar, which is consumed but
%%  not included in StringChars. Reason is `premature_end' when the
%%  input ends before the closing quote; otherwise it is whatever
%%  scan_char/2 reported.

scan_string(Cs, Quote, Pos) ->
    scan_string(Cs, [], Quote, Pos).

scan_string([], _Acc, _Quote, _Pos) ->
    {error, premature_end};
scan_string([Quote | Rest], Acc, Quote, Pos) ->
    %% Closing quote: the characters were stacked in reverse order.
    {lists:reverse(Acc), Rest, Pos};
scan_string(Chars, Acc, Quote, Pos) ->
    case scan_char(Chars, Pos) of
        {Char, Rest, NextPos} ->
            scan_string(Rest, [Char | Acc], Quote, NextPos);
        Failure ->
            Failure
    end.
233
%% scan_char_const(CharList, TokenStack, Pos)
%%  Read the character following a `$', push a {char,Pos,Char} token
%%  and continue scanning the rest of the input with scan1/3. A literal
%%  space directly after the `$' is not allowed. When the character
%%  itself cannot be read, the error from scan_char/2 is returned as is.
scan_char_const([$\s | _Rest], _Toks, _Pos) ->
    {error, illegal_character};
scan_char_const(Chars, Toks, Pos) ->
    case scan_char(Chars, Pos) of
        {Char, Rest, NextPos} ->
            Tok = {char, Pos, Char},
            scan1(Rest, [Tok | Toks], NextPos);
        Failure ->
            Failure
    end.
244
%% scan_char(Chars, Pos) -> {Character,RestChars,NewPos} | {error,Reason}
%%  Read a single character from a string or character constant. A
%%  backslash introduces an escape sequence, which is handed over to
%%  scan_escape/2. Unescaped control characters (codes up to 16#1f) are
%%  rejected, and an empty input yields `truncated_char'.

scan_char([], _Pos) ->
    {error, truncated_char};
scan_char([$\\ | Rest], Pos) ->
    scan_escape(Rest, Pos);
scan_char([Char | _Rest], _Pos) when Char =< 31 ->
    {error, illegal_character};
scan_char([Char | Rest], Pos) ->
    {Char, Rest, Pos}.
258
%% scan_escape(Chars, Pos) -> {Character,RestChars,Pos} | {error,Reason}
%%  Decode what follows a backslash, in the manner of the Standard
%%  Erlang escape sequences:
%%	one to three octal digits (three only when the first is 0-3),
%%	^X for control characters, with X in the range $@ .. $_,
%%	the single-letter escapes known to escape_char/1.

scan_escape([], _Pos) ->
    {error, truncated_char};
scan_escape([D1, D2, D3 | Rest], Pos)
  when D1 >= $0, D1 =< $3, D2 >= $0, D2 =< $7, D3 >= $0, D3 =< $7 ->
    {((D1 - $0) * 8 + (D2 - $0)) * 8 + (D3 - $0), Rest, Pos};
scan_escape([D1, D2 | Rest], Pos)
  when D1 >= $0, D1 =< $7, D2 >= $0, D2 =< $7 ->
    {(D1 - $0) * 8 + (D2 - $0), Rest, Pos};
scan_escape([D1 | Rest], Pos) when D1 >= $0, D1 =< $7 ->
    {D1 - $0, Rest, Pos};
scan_escape([$^, Ctl | Rest], Pos) when Ctl >= $@, Ctl =< $_ ->
    {Ctl - $@, Rest, Pos};                      % \^X -> CTL-X
scan_escape([$^, _Other | _Rest], _Pos) ->
    {error, illegal_control_character};
scan_escape([Esc | Rest], Pos) ->
    %% escape_char/1 answers 0 for anything it does not know.
    case escape_char(Esc) of
        Char when Char > 0 ->
            {Char, Rest, Pos};
        _ ->
            {error, undefined_escape_sequence}
    end.
284
%% escape_char(Char) -> Character
%%  Map the letter following a backslash to the character it stands
%%  for. Undefined escapes yield 0 (NUL), which callers treat as "no
%%  such escape".
escape_char(Char) ->
    case Char of
        $b  -> $\b;     % backspace
        $d  -> $\d;     % delete
        $e  -> $\e;     % escape
        $f  -> $\f;     % form feed
        $n  -> $\n;     % line feed
        $r  -> $\r;     % carriage return
        $s  -> $\s;     % space
        $t  -> $\t;     % horizontal tab
        $v  -> $\v;     % vertical tab
        $\\ -> $\\;     % backslash
        $\' -> $\';     % single quote
        $\" -> $\";     % double quote
        _   -> 0
    end.
299
%% scan_number(Char, CharList, TokenStack, Pos)
%%  We handle optionally signed integers and floats:
%%	[+-]<digits>
%%	[+-]<digits>.<digits>
%%	[+-]<digits>.<digits>E+-<digits>
%%
%%  Radix notation (<base>#<digits>) is not recognised by this scanner;
%%  the `#' is left in the input and scanned as punctuation.
%%
%%  We build a list of all the characters and then use
%%  list_to_integer/1 or list_to_float/1 to generate the value.
310
311%% SPos == Start position
312%% CPos == Current position
313
%% Scan a number whose first digit C has already been consumed.
scan_number(C, Cs0, Toks, Pos) ->
    scan_number_from([C], Cs0, Toks, Pos).

%% Scan a number whose sign S and first digit C have already been
%% consumed.
scan_signed_number(S, C, Cs0, Toks, Pos) ->
    scan_number_from([C, S], Cs0, Toks, Pos).

%% Collect the remaining integer digits on top of the reversed prefix
%% Stack, then decide between an integer and a float token.
scan_number_from(Stack, Cs0, Toks, Pos) ->
    {Digits, Cs, Pos1} = scan_integer(Cs0, Stack, Pos),
    scan_after_int(Cs, Digits, Toks, Pos, Pos1).
321
%% scan_integer(CharList, Stack, Pos) -> {Stack1, RestChars, Pos}
%%  Push the leading decimal digits of CharList onto Stack, one by one,
%%  and return the grown stack with the unconsumed rest of the input.
%%  Pos is passed through unchanged.
scan_integer(Chars, Stack, Pos) ->
    case Chars of
        [Digit | Rest] when $0 =< Digit, Digit =< $9 ->
            scan_integer(Rest, [Digit | Stack], Pos);
        _ ->
            {Stack, Chars, Pos}
    end.
326
%% scan_after_int(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Called once the integer part has been collected. A `.' followed by
%%  a digit starts a fraction; anything else ends the number, which is
%%  then pushed as an integer token positioned at StartPos.
scan_after_int([$., Digit | Rest0], Digits0, Toks, SPos, CPos)
  when Digit >= $0, Digit =< $9 ->
    {Digits, Rest, CPos1} = scan_integer(Rest0, [Digit, $. | Digits0], CPos),
    scan_after_fraction(Rest, Digits, Toks, SPos, CPos1);
scan_after_int(Rest, Digits, Toks, SPos, CPos) ->
    Value = list_to_integer(lists:reverse(Digits)),
    scan1(Rest, [{integer,SPos,Value} | Toks], CPos).
333
%% scan_after_fraction(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Called once the fraction digits have been collected. An `E' or `e'
%%  starts an exponent; anything else ends the number, which is then
%%  pushed as a float token positioned at StartPos.
scan_after_fraction([Exp | Rest], Digits, Toks, SPos, CPos)
  when Exp =:= $E; Exp =:= $e ->
    scan_exponent(Rest, [Exp | Digits], Toks, SPos, CPos);
scan_after_fraction(Rest, Digits, Toks, SPos, CPos) ->
    try list_to_float(lists:reverse(Digits)) of
        Value ->
            scan1(Rest, [{float,SPos,Value} | Toks], CPos)
    catch
        error:_ ->
            scan_error({illegal,float}, SPos)
    end.
344
%% scan_exponent(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Consume an optional sign after the exponent marker and hand over to
%%  scan_exponent1/5, which generates an error if E{+|-} is not followed
%%  by any digits.

scan_exponent([Sign | Rest], Digits, Toks, SPos, CPos)
  when Sign =:= $+; Sign =:= $- ->
    scan_exponent1(Rest, [Sign | Digits], Toks, SPos, CPos);
scan_exponent(Rest, Digits, Toks, SPos, CPos) ->
    scan_exponent1(Rest, Digits, Toks, SPos, CPos).
354
%% scan_exponent1(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Collect the exponent digits and push the finished float token,
%%  positioned at StartPos. At least one digit is required; otherwise a
%%  `float' error is reported at CurPos.
scan_exponent1([Digit | Rest0], Digits0, Toks, SPos, CPos)
  when Digit >= $0, Digit =< $9 ->
    {Digits, Rest, CPos1} = scan_integer(Rest0, [Digit | Digits0], CPos),
    try list_to_float(lists:reverse(Digits)) of
        Value ->
            scan1(Rest, [{float,SPos,Value} | Toks], CPos1)
    catch
        error:_ ->
            scan_error({illegal,float}, SPos)
    end;
scan_exponent1(_Rest, _Digits, _Toks, _SPos, CPos) ->
    scan_error(float, CPos).
364
%% scan_error(What, Pos) -> {error, {Pos, edoc_scanner, What}}
%%  Build the error value returned by the scanner. The module name in
%%  the middle lets callers find format_error/1 for the description.
scan_error(What, Pos) ->
    ErrorInfo = {Pos, edoc_scanner, What},
    {error, ErrorInfo}.