PageRenderTime 59ms CodeModel.GetById 33ms app.highlight 22ms RepoModel.GetById 1ms app.codeStats 0ms

/source/otp_src_R14B02/lib/edoc/src/edoc_scanner.erl

https://github.com/akiernan/omnibus
Erlang | 366 lines | 252 code | 40 blank | 74 comment | 1 complexity | 3289d4ef0ef93d3552c73a71e960aca5 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause
  1%% ``The contents of this file are subject to the Erlang Public License,
  2%% Version 1.1, (the "License"); you may not use this file except in
  3%% compliance with the License. You should have received a copy of the
  4%% Erlang Public License along with this software. If not, it can be
  5%% retrieved via the world wide web at http://www.erlang.org/.
  6%%
  7%% Software distributed under the License is distributed on an "AS IS"
  8%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
  9%% the License for the specific language governing rights and
 10%% limitations under the License.
 11%%
 12%% The Initial Developer of the Original Code is Ericsson Utvecklings
 13%% AB. Portions created by Ericsson are Copyright 1999, Ericsson
 14%% Utvecklings AB. All Rights Reserved.''
 15%%
 16%% $Id: $
 17%%
 18%% @private
 19%% @copyright Richard Carlsson 2001-2003. Portions created by Ericsson
 20%% are Copyright 1999, Ericsson Utvecklings AB. All Rights Reserved.
 21%% @author Richard Carlsson <richardc@it.uu.se>
 22%% @see edoc
 23%% @end
 24
 25%% @doc Tokeniser for EDoc. Based on the Erlang standard library module
 26%% {@link //stdlib/erl_scan}.
 27
 28-module(edoc_scanner).
 29
 30%% NOTE: the interface to this module is ancient and should be updated.
 31%% Please do not regard these exported functions as stable. Their
 32%% behaviour is described in the documentation of the module `erl_scan'.
 33%%
 34%% Since there are no `full stop' tokens in EDoc specifications, the
 35%% `tokens' function *always* returns `{more, Continuation}' unless an
 36%% error occurs.
 37
 38-export([string/1,string/2,format_error/1]).
 39
 40-import(lists, [reverse/1]).
 41
 42string(Cs) -> string(Cs, 1).
 43
 44string(Cs, StartPos) ->
 45    case scan(Cs, StartPos) of
 46	{ok,Toks} -> {ok,Toks,StartPos};
 47	{error,E} -> {error,E,StartPos}
 48    end.
 49
 50%% format_error(Error)
 51%%  Return a string describing the error.
 52
 53format_error({string,Quote,Head}) ->
 54    ["unterminated string starting with " ++ io_lib:write_string(Head,Quote)];
 55format_error({illegal,Type}) -> io_lib:fwrite("illegal ~w", [Type]);
 56format_error(char) -> "unterminated character";
 57format_error(scan) -> "premature end";
 58format_error({base,Base}) -> io_lib:fwrite("illegal base '~w'", [Base]);
 59format_error(float) -> "bad float";
 60
 61format_error(Other) -> io_lib:write(Other).
 62
 63%% Reserved words, not atoms:
 64reserved('where') -> true;
 65reserved(_) -> false.
 66
 67%% scan(CharList, StartPos)
 68%%  This takes a list of characters and tries to tokenise them.
 69%%
 70%%  The token list is built in reverse order (in a stack) to save appending
 71%%  and then reversed when all the tokens have been collected. Most tokens
 72%%  are built in the same way.
 73%%
 74%%  Returns:
 75%%	{ok,[Tok]}
 76%%	{error,{ErrorPos,edoc_scanner,What}}
 77
 78scan(Cs, Pos) ->
 79    scan1(Cs, [], Pos).
 80
 81%% scan1(Characters, TokenStack, Position)
 82%%  Scan a list of characters into tokens.
 83
 84scan1([$\n|Cs], Toks, Pos) ->            	        % Newline
 85    scan1(Cs, Toks, Pos+1);
 86scan1([C|Cs], Toks, Pos) when C >= 0, C =< $  -> 	% Skip blanks
 87    scan1(Cs, Toks, Pos);
 88scan1([C|Cs], Toks, Pos) when C >= $a, C =< $z ->	% Unquoted atom
 89    scan_atom(C, Cs, Toks, Pos);
 90scan1([C|Cs], Toks, Pos) when C >= $0, C =< $9 ->	% Numbers
 91    scan_number(C, Cs, Toks, Pos);
 92scan1([$-,C| Cs], Toks, Pos) when C >= $0, C =< $9 ->	% Signed numbers
 93    scan_signed_number($-, C, Cs, Toks, Pos);
 94scan1([$+,C| Cs], Toks, Pos) when C >= $0, C =< $9 ->	% Signed numbers
 95    scan_signed_number($+, C, Cs, Toks, Pos);
 96scan1([C|Cs], Toks, Pos) when C >= $A, C =< $Z ->	% Variables
 97    scan_variable(C, Cs, Toks, Pos);
 98scan1([$_|Cs], Toks, Pos) ->				% Variables
 99    scan_variable($_, Cs, Toks, Pos);
100scan1([$$|Cs], Toks, Pos) ->			% Character constant
101    case scan_char_const(Cs, Toks, Pos) of
102	{ok, Result} ->
103	    {ok, Result};
104	{error, truncated_char} ->
105	    scan_error(char, Pos);
106	{error, illegal_character} ->
107	    scan_error({illegal, char}, Pos)
108    end;
109scan1([$'|Cs0], Toks, Pos) ->				% Quoted atom
110    case scan_string(Cs0, $', Pos) of
111	{S,Cs1,Pos1} ->
112	    case catch list_to_atom(S) of
113		A when is_atom(A) ->
114		    scan1(Cs1, [{atom,Pos,A}|Toks], Pos1);
115		_Error -> scan_error({illegal,atom}, Pos)
116	    end;
117	{error, premature_end} ->
118	    scan_error({string,$',Cs0}, Pos);
119	{error, truncated_char} ->
120	    scan_error(char, Pos);
121	{error, illegal_character} ->
122	    scan_error({illegal, atom}, Pos)
123    end;
124scan1([$"|Cs0], Toks, Pos) ->				% String
125    case scan_string(Cs0, $", Pos) of
126	{S,Cs1,Pos1} ->
127	    case Toks of
128		[{string, Pos0, S0} | Toks1] ->
129		    scan1(Cs1, [{string, Pos0, S0 ++ S} | Toks1],
130			  Pos1);
131		_ ->
132		    scan1(Cs1, [{string,Pos,S}|Toks], Pos1)
133	    end;
134	{error, premature_end} ->
135	    scan_error({string,$",Cs0}, Pos);
136	{error, truncated_char} ->
137	    scan_error(char, Pos);
138	{error, illegal_character} ->
139	    scan_error({illegal, string}, Pos)
140    end;
141%% Punctuation characters and operators, first recognise multiples.
142scan1([$<,$<|Cs], Toks, Pos) ->
143    scan1(Cs, [{'<<',Pos}|Toks], Pos);
144scan1([$>,$>|Cs], Toks, Pos) ->
145    scan1(Cs, [{'>>',Pos}|Toks], Pos);
146scan1([$-,$>|Cs], Toks, Pos) ->
147    scan1(Cs, [{'->',Pos}|Toks], Pos);
148scan1([$:,$:|Cs], Toks, Pos) ->
149    scan1(Cs, [{'::',Pos}|Toks], Pos);
150scan1([$/,$/|Cs], Toks, Pos) ->
151    scan1(Cs, [{'//',Pos}|Toks], Pos);
152scan1([$.,$.,$.|Cs], Toks, Pos) ->
153    scan1(Cs, [{'...',Pos}|Toks], Pos);
154scan1([$.,$.|Cs], Toks, Pos) ->
155    scan1(Cs, [{'..',Pos}|Toks], Pos);
156scan1([C|Cs], Toks, Pos) -> % Punctuation character
157    P = list_to_atom([C]),
158    scan1(Cs, [{P,Pos}|Toks], Pos);
159scan1([], Toks0, _Pos) ->
160    Toks = reverse(Toks0),
161    {ok,Toks}.
162
%% scan_variable(FirstChar, CharList, TokenStack, Pos)
%%  Scan a variable whose first character has already been consumed.
%%  A lone `_' does not become a `var' token; it is emitted as the
%%  special token {an_var,Pos,'_'}.
scan_variable(C, Cs, Toks, Pos) ->
    case scan_name(Cs, []) of
        {[], Rest} when C =:= $_ ->
            scan1(Rest, [{an_var,Pos,'_'}|Toks], Pos);
        {RevTail, Rest} ->
            %% scan_name/2 hands back the name characters reversed
            Name = [C | lists:reverse(RevTail)],
            try list_to_atom(Name) of
                Var ->
                    scan1(Rest, [{var,Pos,Var}|Toks], Pos)
            catch
                error:_ ->
                    scan_error({illegal,variable}, Pos)
            end
    end.
178
%% scan_atom(FirstChar, CharList, TokenStack, Pos)
%%  Scan an unquoted atom whose first character has already been
%%  consumed. Reserved words become {Word,Pos} tokens; everything else
%%  becomes {atom,Pos,Word}.
scan_atom(C, Cs, Toks, Pos) ->
    {RevTail, Rest} = scan_name(Cs, []),
    try list_to_atom([C | lists:reverse(RevTail)]) of
        Word ->
            Tok = case reserved(Word) of
                      true  -> {Word,Pos};
                      false -> {atom,Pos,Word}
                  end,
            scan1(Rest, [Tok|Toks], Pos)
    catch
        error:_ ->
            scan_error({illegal,token}, Pos)
    end.
193
%% scan_name(CharList, Acc) -> {RevNameChars, RestChars}
%%  Split off the longest prefix of CharList made up of name characters.
%%  The prefix is returned in *reverse* order, stacked on top of Acc.

scan_name(Cs, Ncs) ->
    {Name, Rest} = lists:splitwith(fun name_char/1, Cs),
    {lists:reverse(Name, Ncs), Rest}.
205
%% name_char(Char)
%%  True if Char may appear inside an atom or variable name: ASCII
%%  letters and digits, `_', `@', and the codes 8#300..8#377 except
%%  8#327 and 8#367.
name_char(C) when C >= $a, C =< $z;
                  C >= $A, C =< $Z;
                  C >= $0, C =< $9;
                  C =:= $_;
                  C =:= $@;
                  C >= $\300, C =< $\377, C /= $\327, C /= $\367 ->
    true;
name_char(_) ->
    false.
214
%% scan_string(CharList, QuoteChar, Pos) ->
%%	{StringChars,RestChars,NewPos} | {error,Reason}
%%  Read characters up to the closing QuoteChar; the opening quote must
%%  already have been consumed by the caller.

scan_string(Cs, Quote, Pos) ->
    NoCharsYet = [],
    scan_string(Cs, NoCharsYet, Quote, Pos).
220
%% scan_string(CharList, RevChars, QuoteChar, Pos)
%%  Worker for scan_string/3. RevChars holds the characters read so far
%%  in reverse order. Running out of input before the closing quote
%%  gives {error,premature_end}; any non-character result from
%%  scan_char/2 is returned to the caller unchanged.
scan_string([], _Acc, _Quote, _Pos) ->
    {error, premature_end};
scan_string([Quote|Rest], Acc, Quote, Pos) ->
    {lists:reverse(Acc), Rest, Pos};
scan_string(Cs, Acc, Quote, Pos) ->
    case scan_char(Cs, Pos) of
        {Ch, Rest, Pos1} ->
            %% the string is only ever built here, one character at a time
            scan_string(Rest, [Ch|Acc], Quote, Pos1);
        Other ->
            Other
    end.
233
%% scan_char_const(CharList, TokenStack, Pos)
%%  Scan the character following a `$' and then carry on scanning the
%%  rest of the input, so on success the result is that of scan1/3.
%%  A literal space after the `$' is rejected.
scan_char_const([$\s | _], _Toks, _Pos) ->
    {error, illegal_character};
scan_char_const(Cs, Toks, Pos) ->
    case scan_char(Cs, Pos) of
        {Ch, Rest, Pos1} ->
            scan1(Rest, [{char,Pos,Ch}|Toks], Pos1);
        Other ->
            Other
    end.
244
%% scan_char(Chars, Pos) -> {Character,RestChars,Pos} | {error,Reason}
%%  Read one character of a string or character constant. A backslash
%%  introduces an escape sequence, which is handed to scan_escape/2.
%%  Unescaped characters up to and including 16#1f are rejected, and
%%  empty input is reported as a truncated character.

scan_char([], _Pos) ->
    {error, truncated_char};
scan_char([$\\ | Rest], Pos) ->
    scan_escape(Rest, Pos);
scan_char([C | _], _Pos) when C =< 16#1f ->
    {error, illegal_character};
scan_char([C | Rest], Pos) ->
    {C, Rest, Pos}.
258
%% scan_escape(Chars, Pos) -> {Character,RestChars,Pos} | {error,Reason}
%%  Decode the escape sequence that follows a backslash:
%%    - one to three octal digits (three only if the first is 0..3),
%%    - `^X' for a control character, X in the range 8#100..8#137,
%%    - any single character known to escape_char/1.

scan_escape([D1, D2, D3 | Rest], Pos)
  when D1 >= $0, D1 =< $3, D2 >= $0, D2 =< $7, D3 >= $0, D3 =< $7 ->
    Code = ((D1 - $0) * 8 + (D2 - $0)) * 8 + (D3 - $0),
    {Code, Rest, Pos};
scan_escape([D1, D2 | Rest], Pos)
  when D1 >= $0, D1 =< $7, D2 >= $0, D2 =< $7 ->
    Code = (D1 - $0) * 8 + (D2 - $0),
    {Code, Rest, Pos};
scan_escape([D1 | Rest], Pos) when D1 >= $0, D1 =< $7 ->
    {D1 - $0, Rest, Pos};
scan_escape([$^, C | Rest], Pos) when C >= $\100, C =< $\137 ->
    {C - $\100, Rest, Pos};                     % \^X -> CTL-X
scan_escape([$^, _ | _], _Pos) ->
    {error, illegal_control_character};
scan_escape([C | Rest], Pos) ->
    %% escape_char/1 answers 0 for anything it does not know
    case escape_char(C) of
        Code when Code > 0 -> {Code, Rest, Pos};
        _ -> {error, undefined_escape_sequence}
    end;
scan_escape([], _Pos) ->
    {error, truncated_char}.
284
%% escape_char(Char)
%%  Character code denoted by the single-character escape `\Char'.
%%  Undefined escapes yield 0.
escape_char($b) -> $\b;         % BS
escape_char($d) -> $\d;         % DEL
escape_char($e) -> $\e;         % ESC
escape_char($f) -> $\f;         % FF
escape_char($n) -> $\n;         % LF
escape_char($r) -> $\r;         % CR
escape_char($s) -> $\s;         % SPC
escape_char($t) -> $\t;         % HT
escape_char($v) -> $\v;         % VT
escape_char($\\) -> $\\;        % backslash
escape_char($') -> $';          % single quote
escape_char($") -> $";          % double quote
escape_char(_Unknown) -> 0.
299
%% scan_number(Char, CharList, TokenStack, Pos)
%%  Scan an unsigned number whose first digit, Char, has already been
%%  consumed. Together with its helpers it recognises:
%%    <digits>
%%    <digits>.<digits>
%%    <digits>.<digits>E[+-]<digits>	(`e' is accepted as well)
%%  Signed numbers enter through scan_signed_number/5 instead.
%%
%%  The characters of the number are collected on a stack (so in
%%  reverse order) and later converted with list_to_integer/1 or
%%  list_to_float/1.
%%
%%  NOTE(review): earlier documentation mentioned radix notation
%%  (<digits>#<digits>), but no clause here or in the helpers consumes
%%  a `#' -- confirm whether based integers are meant to be supported.
%%
%%  In the helper functions:
%%  SPos == Start position
%%  CPos == Current position

scan_number(C, Cs0, Toks, Pos) ->
    FirstDigit = [C],
    {RevDigits, Rest, CurPos} = scan_integer(Cs0, FirstDigit, Pos),
    scan_after_int(Rest, RevDigits, Toks, Pos, CurPos).
317
%% scan_signed_number(Sign, Char, CharList, TokenStack, Pos)
%%  Like scan_number/4, but the number starts with the sign character
%%  Sign followed by the digit Char, both already consumed.
scan_signed_number(S, C, Cs0, Toks, Pos) ->
    %% the stack is in reverse order, hence digit before sign
    SignAndDigit = [C, S],
    {RevDigits, Rest, CurPos} = scan_integer(Cs0, SignAndDigit, Pos),
    scan_after_int(Rest, RevDigits, Toks, Pos, CurPos).
321
%% scan_integer(CharList, Stack, Pos) -> {Stack1,RestChars,Pos}
%%  Push leading decimal digits of CharList onto Stack and return the
%%  grown stack together with the first non-digit remainder. Pos is
%%  handed back unchanged.
scan_integer(Cs, Stack, Pos) ->
    case Cs of
        [D | Rest] when D >= $0, D =< $9 ->
            scan_integer(Rest, [D | Stack], Pos);
        _ ->
            {Stack, Cs, Pos}
    end.
326
%% scan_after_int(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Called once the integer part has been read. A `.' followed by a
%%  digit starts a fraction; otherwise the characters collected so far
%%  form an integer token.
scan_after_int([$., D | Cs0], Ncs0, Toks, SPos, CPos) when D >= $0, D =< $9 ->
    WithPoint = [D, $. | Ncs0],
    {Ncs, Rest, CPos1} = scan_integer(Cs0, WithPoint, CPos),
    scan_after_fraction(Rest, Ncs, Toks, SPos, CPos1);
scan_after_int(Cs, Ncs, Toks, SPos, CPos) ->
    Digits = lists:reverse(Ncs),
    Tok = {integer, SPos, list_to_integer(Digits)},
    scan1(Cs, [Tok | Toks], CPos).
333
%% scan_after_fraction(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Called once the fraction digits have been read. An `E' or `e'
%%  introduces an exponent; otherwise the characters collected so far
%%  are converted into a float token.
scan_after_fraction([E | Cs], Ncs, Toks, SPos, CPos) when E =:= $E; E =:= $e ->
    scan_exponent(Cs, [E | Ncs], Toks, SPos, CPos);
scan_after_fraction(Cs, Ncs, Toks, SPos, CPos) ->
    try list_to_float(lists:reverse(Ncs)) of
        F ->
            scan1(Cs, [{float,SPos,F}|Toks], CPos)
    catch
        error:_ ->
            scan_error({illegal,float}, SPos)
    end.
344
%% scan_exponent(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Consume the optional sign of an exponent and hand over to
%%  scan_exponent1/5, which reports an error when no digit follows.

scan_exponent([Sign | Cs], Ncs, Toks, SPos, CPos) when Sign =:= $+; Sign =:= $- ->
    scan_exponent1(Cs, [Sign | Ncs], Toks, SPos, CPos);
scan_exponent(Cs, Ncs, Toks, SPos, CPos) ->
    scan_exponent1(Cs, Ncs, Toks, SPos, CPos).
354
%% scan_exponent1(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Read the exponent digits and build the float token. If the next
%%  character is not a digit, a `float' error is reported at CurPos; a
%%  number that cannot be converted is reported as {illegal,float} at
%%  StartPos.
scan_exponent1([D | Cs0], Ncs0, Toks, SPos, CPos) when D >= $0, D =< $9 ->
    {Ncs, Rest, CPos1} = scan_integer(Cs0, [D | Ncs0], CPos),
    try list_to_float(lists:reverse(Ncs)) of
        F ->
            scan1(Rest, [{float,SPos,F}|Toks], CPos1)
    catch
        error:_ ->
            scan_error({illegal,float}, SPos)
    end;
scan_exponent1(_Cs, _Ncs, _Toks, _SPos, CPos) ->
    scan_error(float, CPos).
364
%% scan_error(What, Pos)
%%  Build the scanner's error result: the position, this module's name
%%  and the error descriptor, wrapped in an `error' tuple.
scan_error(In, Pos) ->
    ErrorInfo = {Pos, edoc_scanner, In},
    {error, ErrorInfo}.