PageRenderTime 65ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/clamav-0.97.5/libclamav/jsparse/js-norm.c

#
C | 1559 lines | 1319 code | 58 blank | 182 comment | 94 complexity | e53c738d04d4a55062717697c3837407 MD5 | raw file
  1. /*
  2. * Javascript normalizer.
  3. *
  4. * Copyright (C) 2008 Sourcefire, Inc.
  5. *
  6. * Authors: Török Edvin
  7. *
  8. * This program is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU General Public License version 2 as
  10. * published by the Free Software Foundation.
  11. *
  12. * This program is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with this program; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  20. * MA 02110-1301, USA.
  21. */
  22. #ifdef HAVE_CONFIG_H
  23. #include "clamav-config.h"
  24. #endif
  25. #include <stdio.h>
  26. #ifdef HAVE_UNISTD_H
  27. #include <unistd.h>
  28. #endif
  29. #include <fcntl.h>
  30. #include <stdlib.h>
  31. #include <string.h>
  32. #include <ctype.h>
  33. #include <assert.h>
  34. #include "cltypes.h"
  35. #include "jsparse/lexglobal.h"
  36. #include "hashtab.h"
  37. #include "others.h"
  38. #include "str.h"
  39. #include "js-norm.h"
  40. #include "jsparse/generated/operators.h"
  41. #include "jsparse/generated/keywords.h"
  42. #include "jsparse/textbuf.h"
  43. /* ----------- tokenizer ---------------- */
  44. enum tokenizer_state {
  45. Initial,
  46. MultilineComment,
  47. SinglelineComment,
  48. Number,
  49. DoubleQString,
  50. SingleQString,
  51. Identifier,
  52. Dummy
  53. };
  54. typedef struct scanner {
  55. struct text_buffer buf;
  56. const char *yytext;
  57. size_t yylen;
  58. const char *in;
  59. size_t insize;
  60. size_t pos;
  61. size_t lastpos;
  62. enum tokenizer_state state;
  63. enum tokenizer_state last_state;
  64. } *yyscan_t;
  65. typedef int YY_BUFFER_STATE;
  66. static int yylex( YYSTYPE *lvalp, yyscan_t );
  67. static YY_BUFFER_STATE yy_scan_bytes( const char *, size_t, yyscan_t scanner );
  68. static const char *yyget_text ( yyscan_t scanner );
  69. static int yyget_leng ( yyscan_t scanner );
  70. static int yylex_init ( yyscan_t * ptr_yy_globals ) ;
  71. static int yylex_destroy ( yyscan_t yyscanner ) ;
  72. /* ----------- tokenizer end ---------------- */
  73. enum fsm_state {
  74. Base,
  75. InsideVar,
  76. InsideInitializer,
  77. WaitFunctionName,
  78. WaitParameterList,
  79. InsideFunctionDecl
  80. };
  81. struct scope {
  82. struct cli_hashtable id_map;
  83. struct scope *parent;/* hierarchy */
  84. struct scope *nxt;/* all scopes kept in a list so we can easily free all of them */
  85. enum fsm_state fsm_state;
  86. int last_token;
  87. unsigned int brackets;
  88. unsigned int blocks;
  89. };
  90. struct tokens {
  91. yystype *data;
  92. size_t cnt;
  93. size_t capacity;
  94. };
  95. /* state for the current JS file being parsed */
  96. struct parser_state {
  97. unsigned long var_uniq;
  98. unsigned long syntax_errors;
  99. struct scope *global;
  100. struct scope *current;
  101. struct scope *list;
  102. yyscan_t scanner;
  103. struct tokens tokens;
  104. unsigned int rec;
  105. };
  106. static struct scope* scope_new(struct parser_state *state)
  107. {
  108. struct scope *parent = state->current;
  109. struct scope *s = cli_calloc(1, sizeof(*s));
  110. if(!s)
  111. return NULL;
  112. if(cli_hashtab_init(&s->id_map, 10) < 0) {
  113. free(s);
  114. return NULL;
  115. }
  116. s->parent = parent;
  117. s->fsm_state = Base;
  118. s->nxt = state->list;
  119. state->list = s;
  120. state->current = s;
  121. return s;
  122. }
  123. static struct scope* scope_done(struct scope *s)
  124. {
  125. struct scope* parent = s->parent;
  126. /* TODO: have a hashtab_destroy */
  127. cli_hashtab_clear(&s->id_map);
  128. free(s->id_map.htable);
  129. free(s);
  130. return parent;
  131. }
  132. /* transitions:
  133. * Base --(VAR)--> InsideVar
  134. * InsideVar --(Identifier)-->InsideInitializer
  135. * InsideVar --(anything_else) --> POP (to Base)
  136. * InsideInitializer --(COMMA)--> POP (to InsideVar)
  137. * InsideInitializer | InsideVar --(SEMICOLON) --> POP (to Base)
  138. * InsideInitializer --(BRACKET_OPEN) --> WaitBrClose
  139. * InsideInitializer --(PAR_OPEN) --> WaitParClose
  140. * WaitBrClose --(BRACKET_OPEN) --> increase depth
  141. * WaitBrClose --(BRACKET_CLOSE) --> POP
  142. * WaitParClose --(PAR_CLOSE) --> POP
  143. * WaitParClose --(PAR_OPEN) --> increase depth
  144. */
  145. /* Base --(VAR)--> PUSH, to InsideVar
  146. * InsideVar --(Identifier)--> InsideInitializer
  147. * InsideVar --(ELSE)--> POP, inc. syntax_errors
  148. * InsideInitializer --(COMMA)--> POP (to InsideVar)
  149. * --(BRACKET_OPEN)--> inc bracket_counter
  150. * --(PAR_OPEN)--> inc par_counter
  151. * --(BRACKET_CLOSE) --> dec bracket_counter
  152. * --(PAR_CLOSE)--> dec par_counter
  153. * --(VAR)--> PUSH, to InsideVar (if bracket_counter != 0 || par_counter != 0)
  154. * --> POP, to InsideVar, inc. syntax_errors (if bracket_counter == 0 && par_counter == 0)
  155. * POP only allowed if bracket_counter == 0 && par_counter == 0
  156. *
  157. * InsideInitializer acts differently, make it only a flag
  158. * ....................
  159. *
  160. * Pushing and popping are done when entering / exiting function scopes,
  161. * tracking { and function ( is done by the function scope tracker too.
  162. *
  163. * we only need to track brackets.
  164. */
  165. /*
  166. * var x = document;
  167. * x.writeln(...);
  168. *
  169. * ^we must not normalize member method names
  170. */
  171. /*
  172. * Variables are declared at function scope, and their initial value is
  173. * undefined. At the point where the initializer is, and from there on the value
  174. * is defined.
  175. *
  176. * { doesn't introduce a new variable scope, they are in function's scope too
  177. *
  178. * function foo() {
  179. * alert(x); -> x exists, undefined
  180. * var x=5;
  181. * alert(x); -> x exists, =5
  182. * }
  183. *
  184. * vs.
  185. *
  186. * function bar() {
  187. * alert(x);//error, x not declared
  188. * x=5;
  189. * }
  190. *
  191. * vs.
  192. *
  193. * but we can declare variables without var, only valid if we use them after
  194. * assigning.
  195. *
  196. * function foobar() {
  197. * x=5;
  198. * alert(x);//x is defined, value is 5
  199. * }
  200. *
  201. * other examples:
  202. * function foo2() {
  203. * alert(x); -> x exists, undefined
  204. * {
  205. * var x=5; -> x equals to 5
  206. * }
  207. * alert(x); -> x is 5
  208. * }
  209. *
  210. * function foo3() {
  211. * var x=4; -> x exists, equals to 4
  212. * alert(x); -> x exists, equals to 4
  213. * {
  214. * var x=5; -> x equals to 5
  215. * }
  216. * alert(x); -> x is 5
  217. * }
  218. *
  219. * function bar3() {
  220. * //same as foo3
  221. * var x=4;
  222. * alert(x);
  223. * {
  224. * x=5;
  225. * }
  226. * alert(x);
  227. * }
  228. *
  229. */
  230. static const char* scope_declare(struct scope *s, const char *token, const size_t len, struct parser_state *state)
  231. {
  232. const struct cli_element *el = cli_hashtab_insert(&s->id_map, token, len, state->var_uniq++);
  233. /* cli_hashtab_insert either finds an already existing entry, or allocates a
  234. * new one, we return the allocated string */
  235. return el ? el->key : NULL;
  236. }
  237. static const char* scope_use(struct scope *s, const char *token, const size_t len)
  238. {
  239. const struct cli_element *el = cli_hashtab_find(&s->id_map, token, len);
  240. if(el) {
  241. /* identifier already found in current scope,
  242. * return here to avoid overwriting uniq id */
  243. return el->key;
  244. }
  245. /* identifier not yet in current scope's hashtab, add with ID -1.
  246. * Later if we find a declaration it will automatically assign a uniq ID
  247. * to it. If not, we'll know that we have to push ID == -1 tokens to an
  248. * outer scope.*/
  249. el = cli_hashtab_insert(&s->id_map, token, len, -1);
  250. return el ? el->key : NULL;
  251. }
  252. static long scope_lookup(struct scope *s, const char *token, const size_t len)
  253. {
  254. while(s) {
  255. const struct cli_element *el = cli_hashtab_find(&s->id_map, token, len);
  256. if(el && el->data != -1) {
  257. return el->data;
  258. }
  259. /* not found in current scope, try in outer scope */
  260. s = s->parent;
  261. }
  262. return -1;
  263. }
  264. static int tokens_ensure_capacity(struct tokens *tokens, size_t cap)
  265. {
  266. if(tokens->capacity < cap) {
  267. yystype *data;
  268. cap += 1024;
  269. /* Keep old data if OOM */
  270. data = cli_realloc(tokens->data, cap * sizeof(*tokens->data));
  271. if(!data)
  272. return CL_EMEM;
  273. tokens->data = data;
  274. tokens->capacity = cap;
  275. }
  276. return CL_SUCCESS;
  277. }
  278. static int add_token(struct parser_state *state, const yystype *token)
  279. {
  280. if(tokens_ensure_capacity(&state->tokens, state->tokens.cnt + 1))
  281. return -1;
  282. state->tokens.data[state->tokens.cnt++] = *token;
  283. return 0;
  284. }
  285. struct buf {
  286. size_t pos;
  287. int outfd;
  288. char buf[65536];
  289. };
  290. static inline int buf_outc(char c, struct buf *buf)
  291. {
  292. if(buf->pos >= sizeof(buf->buf)) {
  293. if(write(buf->outfd, buf->buf, sizeof(buf->buf)) != sizeof(buf->buf))
  294. return CL_EWRITE;
  295. buf->pos = 0;
  296. }
  297. buf->buf[buf->pos++] = c;
  298. return CL_SUCCESS;
  299. }
  300. static inline int buf_outs(const char *s, struct buf *buf)
  301. {
  302. const size_t buf_len = sizeof(buf->buf);
  303. size_t i;
  304. i = buf->pos;
  305. while(*s) {
  306. while(i < buf_len && *s) {
  307. if(isspace(*s & 0xff))
  308. buf->buf[i++] = ' ';
  309. else
  310. buf->buf[i++] = tolower((unsigned char)(*s));
  311. ++s;
  312. }
  313. if(i == buf_len) {
  314. if(write(buf->outfd, buf->buf, buf_len) < 0)
  315. return CL_EWRITE;
  316. i = 0;
  317. }
  318. }
  319. buf->pos = i;
  320. return CL_SUCCESS;
  321. }
  322. static inline void output_space(char last, char current, struct buf *out)
  323. {
  324. if(isalnum(last) && isalnum(current))
  325. buf_outc(' ', out);
  326. }
  327. /* return class of last character */
  328. static char output_token(const yystype *token, struct scope *scope, struct buf *out, char lastchar)
  329. {
  330. char sbuf[128];
  331. const char *s = TOKEN_GET(token, cstring);
  332. /* TODO: use a local buffer, instead of FILE* */
  333. switch(token->type) {
  334. case TOK_StringLiteral:
  335. output_space(lastchar,'"', out);
  336. buf_outc('"', out);
  337. if(s) {
  338. buf_outs(s, out);
  339. }
  340. buf_outc('"', out);
  341. return '\"';
  342. case TOK_NumericInt:
  343. output_space(lastchar,'0', out);
  344. snprintf(sbuf, sizeof(sbuf), "%ld", TOKEN_GET(token, ival));
  345. buf_outs(sbuf, out);
  346. return '0';
  347. case TOK_NumericFloat:
  348. output_space(lastchar,'0', out);
  349. snprintf(sbuf, sizeof(sbuf), "%g", TOKEN_GET(token, dval));
  350. buf_outs(sbuf, out);
  351. return '0';
  352. case TOK_IDENTIFIER_NAME:
  353. output_space(lastchar,'a', out);
  354. if(s) {
  355. long id = scope_lookup(scope, s, strlen(s));
  356. if(id == -1) {
  357. /* identifier not normalized */
  358. buf_outs(s, out);
  359. } else {
  360. snprintf(sbuf, sizeof(sbuf), "n%03ld",id);
  361. buf_outs(sbuf, out);
  362. }
  363. }
  364. return 'a';
  365. case TOK_FUNCTION:
  366. output_space(lastchar,'a', out);
  367. buf_outs("function",out);
  368. return 'a';
  369. default:
  370. if(s) {
  371. const size_t len = strlen(s);
  372. output_space(lastchar,s[0], out);
  373. buf_outs(s, out);
  374. return len ? s[len-1] : '\0';
  375. }
  376. return '\0';
  377. }
  378. }
  379. /*
  380. * We can't delete the scope as soon as we see a }, because
  381. * we still need the hashmap from it.
  382. *
  383. * If we would normalize all the identifiers, and output when a scope is closed,
  384. * then it would be impossible to normalize calls to other functions.
  385. *
  386. * So we need to keep all scopes in memory, to do this instead of scope_done, we
  387. * simply just set current = current->parent when a scope is closed.
  388. * We keep a list of all scopes created in parser_state->list. When we have
  389. * parsed everything, we output everything, and then delete all scopes.
  390. *
  391. * We also need to know where to switch scopes on the second pass, so for
  392. * TOK_FUNCTION types we will use another pointer, that points to the scope
  393. * (added to yystype's union).
  394. *
  395. * We lookup the identifier in the scope (using scope_lookup, it looks in parent
  396. * scopes too), if ID is found then output (n%3d, Id),
  397. * otherwise output the identifier as is.
  398. *
  399. * To make it easier to match sigs, we do a xfrm :
  400. * 'function ID1 (..'. => 'n%3d = function (...'
  401. */
  402. /*
  403. * we'll add all identifier to the scope's map
  404. * those that are not decl. will have initial ID -1
  405. * if we later see a decl for it in same scope, it'll automatically get a
  406. * correct ID.
  407. *
  408. * When parsing of local scope is done, we take any ID -1 identifiers,
  409. * and push them up one level (careful not to overwrite existing IDs).
  410. *
  411. * it would be nice if the tokens would contain a link to the entry in the
  412. * hashtab, a link that automatically gets updated when the element is moved
  413. * (pushed up). This would prevent subsequent lookups in the map,
  414. * when we want to output the tokens.
  415. * There is no easy way to do that, so we just do another lookup
  416. *
  417. */
  418. /*
  419. * This actually works, redefining foo:
  420. * function foo() {
  421. * var foo=5; alert(foo);
  422. * }
  423. * So we can't treat function names just as any other identifier?
  424. * We can, because you can no longer call foo, if you redefined it as a var.
  425. * So if we rename both foo-s with same name, it will have same behaviour.
  426. *
  427. * This means that a new scope should begin after function, and not after
  428. * function ... (.
  429. */
  430. static void scope_free_all(struct scope *p)
  431. {
  432. struct scope *nxt;
  433. do {
  434. nxt = p->nxt;
  435. scope_done(p);
  436. p = nxt;
  437. } while(p);
  438. }
  439. size_t cli_strtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens);
  440. static int match_parameters(const yystype *tokens, const char ** param_names, size_t count)
  441. {
  442. size_t i,j=0;
  443. if(tokens[0].type != TOK_PAR_OPEN)
  444. return -1;
  445. i=1;
  446. while(count--) {
  447. const char *token_val = TOKEN_GET(&tokens[i], cstring);
  448. if(tokens[i].type != TOK_IDENTIFIER_NAME ||
  449. !token_val ||
  450. strcmp(token_val, param_names[j++]))
  451. return -1;
  452. ++i;
  453. if((count && tokens[i].type != TOK_COMMA)
  454. || (!count && tokens[i].type != TOK_PAR_CLOSE))
  455. return -1;
  456. ++i;
  457. }
  458. return 0;
  459. }
  460. static const char *de_packer_3[] = {"p","a","c","k","e","r"};
  461. static const char *de_packer_2[] = {"p","a","c","k","e","d"};
  462. static inline char *textbuffer_done(yyscan_t scanner)
  463. {
  464. char *str = cli_realloc(scanner->buf.data, scanner->buf.pos);
  465. if(!str) {
  466. str = scanner->buf.data;
  467. }
  468. scanner->yytext = str;
  469. scanner->yylen = scanner->buf.pos - 1;
  470. memset(&scanner->buf, 0, sizeof(scanner->buf));
  471. return str;
  472. }
  473. #define MODULE "JS-Norm: "
  474. static void free_token(yystype *token)
  475. {
  476. if(token->vtype == vtype_string) {
  477. free(token->val.string);
  478. token->val.string = NULL;
  479. }
  480. }
  481. static int replace_token_range(struct tokens *dst, size_t start, size_t end, const struct tokens *with)
  482. {
  483. const size_t len = with ? with->cnt : 0;
  484. size_t i;
  485. cli_dbgmsg(MODULE "Replacing tokens %lu - %lu with %lu tokens\n",start, end, len);
  486. if(start >= dst->cnt || end > dst->cnt)
  487. return -1;
  488. for(i=start;i<end;i++) {
  489. free_token(&dst->data[i]);
  490. }
  491. if(tokens_ensure_capacity(dst, dst->cnt - (end-start) + len))
  492. return CL_EMEM;
  493. memmove(&dst->data[start+len], &dst->data[end], (dst->cnt - end) * sizeof(dst->data[0]));
  494. if(with && len > 0) {
  495. memcpy(&dst->data[start], with->data, len * sizeof(dst->data[0]));
  496. }
  497. dst->cnt = dst->cnt - (end-start) + len;
  498. return CL_SUCCESS;
  499. }
  500. static int append_tokens(struct tokens *dst, const struct tokens *src)
  501. {
  502. if(!dst || !src)
  503. return CL_ENULLARG;
  504. if(tokens_ensure_capacity(dst, dst->cnt + src->cnt))
  505. return CL_EMEM;
  506. cli_dbgmsg(MODULE "Appending %lu tokens\n", src->cnt);
  507. memcpy(&dst->data[dst->cnt], src->data, src->cnt * sizeof(dst->data[0]));
  508. dst->cnt += src->cnt;
  509. return CL_SUCCESS;
  510. }
  511. static void decode_de(yystype *params[], struct text_buffer *txtbuf)
  512. {
  513. const char *p = TOKEN_GET(params[0], cstring);
  514. const long a = TOKEN_GET(params[1], ival);
  515. /*const char *c = params[2];*/
  516. char *k = TOKEN_GET(params[3], string);
  517. /*const char *r = params[5];*/
  518. unsigned val=0;
  519. unsigned nsplit = 0;
  520. const char* o;
  521. const char **tokens;
  522. memset(txtbuf, 0, sizeof(*txtbuf));
  523. if(!p || !k )
  524. return;
  525. for(o = k; *o; o++) if(*o == '|') nsplit++;
  526. nsplit++;
  527. tokens = malloc(sizeof(char*)*nsplit);
  528. if(!tokens) {
  529. return;
  530. }
  531. cli_strtokenize(k,'|',nsplit, tokens);
  532. do {
  533. while(*p && !isalnum(*p)) {
  534. if(*p=='\\' && (p[1] == '\'' || p[1] == '\"'))
  535. p++;
  536. else
  537. textbuffer_putc(txtbuf, *p++);
  538. }
  539. if(!*p) break;
  540. val = 0;
  541. o = p;
  542. while(*p && isalnum(*p)) {
  543. unsigned x;
  544. unsigned char v = *p++;
  545. /* TODO: use a table here */
  546. if(v >= 'a') x = 10+v-'a';
  547. else if(v >= 'A') x = 36+v-'A';
  548. else x = v-'0';
  549. val = val*a+x;
  550. }
  551. if(val >= nsplit || !tokens[val] || !tokens[val][0])
  552. while(o!=p)
  553. textbuffer_putc(txtbuf, *o++);
  554. else textbuffer_append(txtbuf, tokens[val]);
  555. } while (*p);
  556. free(tokens);
  557. textbuffer_append(txtbuf, "\0");
  558. }
  559. struct decode_result {
  560. struct text_buffer txtbuf;
  561. size_t pos_begin;
  562. size_t pos_end;
  563. unsigned append:1; /* 0: tokens are replaced with new token(s),
  564. 1: old tokens are deleted, new ones appended at the end */
  565. };
  566. static void handle_de(yystype *tokens, size_t start, const size_t cnt, const char *name, struct decode_result *res)
  567. {
  568. /* find function decl. end */
  569. size_t i, nesting = 1, j;
  570. yystype* parameters [6];
  571. const size_t parameters_cnt = 6;
  572. for(i=start;i < cnt; i++) {
  573. if(tokens[i].type == TOK_FUNCTION) {
  574. if(TOKEN_GET(&tokens[i], scope))
  575. nesting++;
  576. else
  577. nesting--;
  578. if(!nesting)
  579. break;
  580. }
  581. }
  582. if(nesting)
  583. return;
  584. memset(parameters, 0, sizeof(parameters));
  585. if(name) {
  586. /* find call to function */
  587. for(;i+2 < cnt; i++) {
  588. const char* token_val = TOKEN_GET(&tokens[i], cstring);
  589. if(tokens[i].type == TOK_IDENTIFIER_NAME &&
  590. token_val &&
  591. !strcmp(name, token_val) &&
  592. tokens[i+1].type == TOK_PAR_OPEN) {
  593. i += 2;
  594. for(j = 0;j < parameters_cnt && i < cnt;j++) {
  595. parameters[j] = &tokens[i++];
  596. if(j != parameters_cnt-1)
  597. while (tokens[i].type != TOK_COMMA && i < cnt) i++;
  598. else
  599. while (tokens[i].type != TOK_PAR_CLOSE && i < cnt) i++;
  600. i++;
  601. }
  602. if(j == parameters_cnt)
  603. decode_de(parameters, &res->txtbuf);
  604. }
  605. }
  606. } else {
  607. while(i<cnt && tokens[i].type != TOK_PAR_OPEN) i++;
  608. ++i;
  609. if(i >= cnt) return;
  610. /* TODO: move this v to another func */
  611. for(j = 0;j < parameters_cnt && i < cnt;j++) {
  612. parameters[j] = &tokens[i++];
  613. if(j != parameters_cnt-1)
  614. while (tokens[i].type != TOK_COMMA && i < cnt) i++;
  615. else
  616. while (tokens[i].type != TOK_PAR_CLOSE && i < cnt) i++;
  617. i++;
  618. }
  619. if(j == parameters_cnt)
  620. decode_de(parameters, &res->txtbuf);
  621. }
  622. if(parameters[0] && parameters[parameters_cnt-1]) {
  623. res->pos_begin = parameters[0] - tokens;
  624. res->pos_end = parameters[parameters_cnt-1] - tokens + 1;
  625. if(tokens[res->pos_end].type == TOK_BRACKET_OPEN &&
  626. tokens[res->pos_end+1].type == TOK_BRACKET_CLOSE &&
  627. tokens[res->pos_end+2].type == TOK_PAR_CLOSE)
  628. res->pos_end += 3; /* {}) */
  629. else
  630. res->pos_end++; /* ) */
  631. }
  632. }
  633. static int handle_unescape(struct tokens *tokens, size_t start)
  634. {
  635. if(tokens->data[start].type == TOK_StringLiteral) {
  636. char *R;
  637. struct tokens new_tokens;
  638. yystype tok;
  639. R = cli_unescape(TOKEN_GET(&tokens->data[start], cstring));
  640. tok.type = TOK_StringLiteral;
  641. TOKEN_SET(&tok, string, R);
  642. new_tokens.capacity = new_tokens.cnt = 1;
  643. new_tokens.data = &tok;
  644. if(replace_token_range(tokens, start-2, start+2, &new_tokens) < 0)
  645. return CL_EMEM;
  646. }
  647. return CL_SUCCESS;
  648. }
  649. /* scriptasylum dot com's JS encoder */
  650. static void handle_df(const yystype *tokens, size_t start, struct decode_result *res)
  651. {
  652. char *str, *s1;
  653. size_t len, s1_len, i;
  654. unsigned char clast;
  655. char *R;
  656. if(tokens[start].type != TOK_StringLiteral)
  657. return;
  658. str = TOKEN_GET(&tokens[start], string);
  659. if(!str)
  660. return;
  661. len = strlen(str);
  662. clast = str[len-1] - '0';
  663. str[len-1] = '\0';
  664. s1 = cli_unescape(str);
  665. s1_len = strlen(s1);
  666. for(i=0;i<s1_len;i++) {
  667. s1[i] -= clast;
  668. }
  669. R = cli_unescape(s1);
  670. free(s1);
  671. res->pos_begin = start-2;
  672. res->pos_end = start+2;
  673. res->txtbuf.data = R;
  674. res->txtbuf.pos = strlen(R);
  675. res->append = 1;
  676. }
  677. static void handle_eval(struct tokens *tokens, size_t start, struct decode_result *res)
  678. {
  679. res->txtbuf.data = TOKEN_GET(&tokens->data[start], string);
  680. if(res->txtbuf.data && tokens->data[start+1].type == TOK_PAR_CLOSE) {
  681. TOKEN_SET(&tokens->data[start], string, NULL);
  682. res->txtbuf.pos = strlen(res->txtbuf.data);
  683. res->pos_begin = start-2;
  684. res->pos_end = start+2;
  685. }
  686. }
  687. static void run_folders(struct tokens *tokens)
  688. {
  689. size_t i;
  690. for(i = 0; i < tokens->cnt; i++) {
  691. const char *cstring = TOKEN_GET(&tokens->data[i], cstring);
  692. if(i+2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME &&
  693. cstring &&
  694. !strcmp("unescape", cstring) && tokens->data[i+1].type == TOK_PAR_OPEN) {
  695. handle_unescape(tokens, i+2);
  696. }
  697. }
  698. }
  699. static inline int state_update_scope(struct parser_state *state, const yystype *token)
  700. {
  701. if(token->type == TOK_FUNCTION) {
  702. struct scope *scope = TOKEN_GET(token, scope);
  703. if(scope) {
  704. state->current = scope;
  705. }
  706. else {
  707. /* dummy token marking function end */
  708. if(state->current->parent)
  709. state->current = state->current->parent;
  710. /* don't output this token, it is just a dummy marker */
  711. return 0;
  712. }
  713. }
  714. return 1;
  715. }
  716. static void run_decoders(struct parser_state *state)
  717. {
  718. size_t i;
  719. const char* name;
  720. struct tokens *tokens = &state->tokens;
  721. for(i = 0; i < tokens->cnt; i++) {
  722. const char *cstring = TOKEN_GET(&tokens->data[i], cstring);
  723. struct decode_result res;
  724. res.pos_begin = res.pos_end = 0;
  725. res.append = 0;
  726. if(tokens->data[i].type == TOK_FUNCTION && i+13 < tokens->cnt) {
  727. name = NULL;
  728. ++i;
  729. if(tokens->data[i].type == TOK_IDENTIFIER_NAME) {
  730. cstring = TOKEN_GET(&tokens->data[i], cstring);
  731. name = cstring;
  732. ++i;
  733. }
  734. if(match_parameters(&tokens->data[i], de_packer_3, sizeof(de_packer_3)/sizeof(de_packer_3[0])) != -1
  735. || match_parameters(&tokens->data[i], de_packer_2, sizeof(de_packer_2)/sizeof(de_packer_2[0])) != -1) {
  736. /* find function decl. end */
  737. handle_de(tokens->data, i, tokens->cnt, name, &res);
  738. }
  739. } else if(i+2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME &&
  740. cstring &&
  741. !strcmp("dF", cstring) && tokens->data[i+1].type == TOK_PAR_OPEN) {
  742. /* TODO: also match signature of dF function (possibly
  743. * declared using unescape */
  744. handle_df(tokens->data, i+2, &res);
  745. } else if(i+2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME &&
  746. cstring &&
  747. !strcmp("eval", cstring) && tokens->data[i+1].type == TOK_PAR_OPEN) {
  748. handle_eval(tokens, i+2, &res);
  749. }
  750. if(res.pos_end > res.pos_begin) {
  751. struct tokens parent_tokens;
  752. if(res.pos_end < tokens->cnt && tokens->data[res.pos_end].type == TOK_SEMICOLON)
  753. res.pos_end++;
  754. parent_tokens = state->tokens;/* save current tokens */
  755. /* initialize embedded context */
  756. memset(&state->tokens, 0, sizeof(state->tokens));
  757. if(++state->rec > 16)
  758. cli_dbgmsg(MODULE "recursion limit reached\n");
  759. else {
  760. cli_js_process_buffer(state, res.txtbuf.data, res.txtbuf.pos);
  761. --state->rec;
  762. }
  763. free(res.txtbuf.data);
  764. /* state->tokens still refers to the embedded/nested context
  765. * here */
  766. if(!res.append) {
  767. replace_token_range(&parent_tokens, res.pos_begin, res.pos_end, &state->tokens);
  768. } else {
  769. /* delete tokens */
  770. replace_token_range(&parent_tokens, res.pos_begin, res.pos_end, NULL);
  771. append_tokens(&parent_tokens, &state->tokens);
  772. }
  773. /* end of embedded context, restore tokens state */
  774. free(state->tokens.data);
  775. state->tokens = parent_tokens;
  776. }
  777. state_update_scope(state, &state->tokens.data[i]);
  778. }
  779. }
  780. void cli_js_parse_done(struct parser_state* state)
  781. {
  782. struct tokens * tokens = &state->tokens;
  783. size_t par_balance = 0, i;
  784. char end = '\0';
  785. YYSTYPE val;
  786. cli_dbgmsg(MODULE "in cli_js_parse_done()\n");
  787. /* close unfinished token */
  788. switch (state->scanner->state) {
  789. case DoubleQString:
  790. end = '"';
  791. break;
  792. case SingleQString:
  793. end = '\'';
  794. break;
  795. default: /* make gcc happy */
  796. break;
  797. }
  798. if (end != '\0')
  799. cli_js_process_buffer(state, &end, 1);
  800. /* close remaining paranthesis */
  801. for (i=0;i<tokens->cnt;i++) {
  802. if (tokens->data[i].type == TOK_PAR_OPEN)
  803. par_balance++;
  804. else if (tokens->data[i].type == TOK_PAR_CLOSE && par_balance > 0)
  805. par_balance--;
  806. }
  807. if (par_balance > 0) {
  808. memset(&val, 0, sizeof(val));
  809. val.type = TOK_PAR_CLOSE;
  810. TOKEN_SET(&val, cstring, ")");
  811. while (par_balance-- > 0) {
  812. add_token(state, &val);
  813. }
  814. }
  815. /* we had to close unfinished strings, paranthesis,
  816. * so that the folders/decoders can run properly */
  817. run_folders(&state->tokens);
  818. run_decoders(state);
  819. yylex_destroy(state->scanner);
  820. state->scanner = NULL;
  821. }
  822. void cli_js_output(struct parser_state *state, const char *tempdir)
  823. {
  824. unsigned i;
  825. struct buf buf;
  826. char lastchar = '\0';
  827. char filename[1024];
  828. snprintf(filename, 1024, "%s"PATHSEP"javascript", tempdir);
  829. buf.pos = 0;
  830. buf.outfd = open(filename, O_CREAT | O_WRONLY, 0600);
  831. if(buf.outfd < 0) {
  832. cli_errmsg(MODULE "cannot open output file for writing: %s\n", filename);
  833. return;
  834. }
  835. /* append to file */
  836. if(lseek(buf.outfd, 0, SEEK_END) != 0) {
  837. /* separate multiple scripts with \n */
  838. buf_outc('\n', &buf);
  839. }
  840. buf_outs("<script>", &buf);
  841. state->current = state->global;
  842. for(i = 0; i < state->tokens.cnt; i++) {
  843. if(state_update_scope(state, &state->tokens.data[i]))
  844. lastchar = output_token(&state->tokens.data[i], state->current, &buf, lastchar);
  845. }
  846. /* add /script if not already there */
  847. if(buf.pos < 9 || memcmp(buf.buf + buf.pos - 9, "</script>", 9))
  848. buf_outs("</script>", &buf);
  849. if(write(buf.outfd, buf.buf, buf.pos) < 0) {
  850. cli_dbgmsg(MODULE "I/O error\n");
  851. }
  852. close(buf.outfd);
  853. cli_dbgmsg(MODULE "dumped/appended normalized script to: %s\n",filename);
  854. }
  855. void cli_js_destroy(struct parser_state *state)
  856. {
  857. size_t i;
  858. if(!state)
  859. return;
  860. scope_free_all(state->list);
  861. for(i=0;i<state->tokens.cnt;i++) {
  862. free_token(&state->tokens.data[i]);
  863. }
  864. free(state->tokens.data);
  865. /* detect use after free */
  866. if(state->scanner)
  867. yylex_destroy(state->scanner);
  868. memset(state, 0x55, sizeof(*state));
  869. free(state);
  870. cli_dbgmsg(MODULE "cli_js_destroy() done\n");
  871. }
  872. /* buffer is html-normlike "chunk", if original file is bigger than buffer,
  873. * we rewind to a space, so we'll know that tokens won't be broken in half at
  874. * the end of a buffer. All tokens except string-literals of course.
  875. * So we can assume that after the buffer there is either a space, EOF, or a
  876. * chunk of text not containing whitespace at all (for which we care only if its
  877. * a stringliteral)*/
  878. void cli_js_process_buffer(struct parser_state *state, const char *buf, size_t n)
  879. {
  880. struct scope* current = state->current;
  881. YYSTYPE val;
  882. int yv;
  883. YY_BUFFER_STATE yyb;
  884. if(!state->global) {
  885. /* this state has either not been initialized,
  886. * or cli_js_parse_done() was already called on it */
  887. cli_warnmsg(MODULE "invalid state\n");
  888. return;
  889. }
  890. yyb = yy_scan_bytes(buf, n, state->scanner);
  891. memset(&val, 0, sizeof(val));
  892. val.vtype = vtype_undefined;
  893. /* on EOF yylex will return 0 */
  894. while( (yv=yylex(&val, state->scanner)) != 0)
  895. {
  896. const char *text;
  897. size_t leng;
  898. val.type = yv;
  899. switch(yv) {
  900. case TOK_VAR:
  901. current->fsm_state = InsideVar;
  902. break;
  903. case TOK_IDENTIFIER_NAME:
  904. text = yyget_text(state->scanner);
  905. leng = yyget_leng(state->scanner);
  906. if(current->last_token == TOK_DOT) {
  907. /* this is a member name, don't normalize
  908. */
  909. TOKEN_SET(&val, string, cli_strdup(text));
  910. val.type = TOK_UNNORM_IDENTIFIER;
  911. } else {
  912. switch(current->fsm_state) {
  913. case WaitParameterList:
  914. state->syntax_errors++;
  915. /* fall through */
  916. case Base:
  917. case InsideInitializer:
  918. TOKEN_SET(&val, cstring, scope_use(current, text, leng));
  919. break;
  920. case InsideVar:
  921. case InsideFunctionDecl:
  922. TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state));
  923. current->fsm_state = InsideInitializer;
  924. current->brackets = 0;
  925. break;
  926. case WaitFunctionName:
  927. TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state));
  928. current->fsm_state = WaitParameterList;
  929. break;
  930. }
  931. }
  932. break;
  933. case TOK_PAR_OPEN:
  934. switch(current->fsm_state) {
  935. case WaitFunctionName:
  936. /* fallthrough */
  937. case WaitParameterList:
  938. current->fsm_state = InsideFunctionDecl;
  939. break;
  940. default:
  941. /* noop */
  942. break;
  943. }
  944. break;
  945. case TOK_PAR_CLOSE:
  946. switch(current->fsm_state) {
  947. case WaitFunctionName:
  948. state->syntax_errors++;
  949. break;
  950. case WaitParameterList:
  951. current->fsm_state = Base;
  952. break;
  953. default:
  954. /* noop */
  955. break;
  956. }
  957. break;
  958. case TOK_CURLY_BRACE_OPEN:
  959. switch(current->fsm_state) {
  960. case WaitFunctionName:
  961. /* fallthrough */
  962. case WaitParameterList:
  963. case InsideFunctionDecl:
  964. /* in a syntactically correct
  965. * file, we would already be in
  966. * the Base state when we see a {
  967. */
  968. current->fsm_state = Base;
  969. /* fall-through */
  970. case InsideVar:
  971. case InsideInitializer:
  972. state->syntax_errors++;
  973. /* fall-through */
  974. case Base:
  975. default:
  976. current->blocks++;
  977. break;
  978. }
  979. break;
  980. case TOK_CURLY_BRACE_CLOSE:
  981. if(current->blocks > 0)
  982. current->blocks--;
  983. else
  984. state->syntax_errors++;
  985. if(!current->blocks) {
  986. if(current->parent) {
  987. /* add dummy FUNCTION token to
  988. * mark function end */
  989. TOKEN_SET(&val, cstring, "}");
  990. add_token(state, &val);
  991. TOKEN_SET(&val, scope, NULL);
  992. val.type = TOK_FUNCTION;
  993. state->current = current = current->parent;
  994. } else{
  995. /* extra } */
  996. state->syntax_errors++;
  997. }
  998. }
  999. break;
  1000. case TOK_BRACKET_OPEN:
  1001. current->brackets++;
  1002. break;
  1003. case TOK_BRACKET_CLOSE:
  1004. if(current->brackets > 0)
  1005. current->brackets--;
  1006. else
  1007. state->syntax_errors++;
  1008. break;
  1009. case TOK_COMMA:
  1010. if (current->fsm_state == InsideInitializer && current->brackets == 0 && current->blocks == 0) {
  1011. /* initializer ended only if we
  1012. * encountered a comma, and [] are
  1013. * balanced.
  1014. * This avoids switching state on:
  1015. * var x = [4,y,u];*/
  1016. current->fsm_state = InsideVar;
  1017. }
  1018. break;
  1019. case TOK_SEMICOLON:
  1020. if (current->brackets == 0 && current->blocks == 0) {
  1021. /* avoid switching state on unbalanced []:
  1022. * var x = [test;testi]; */
  1023. current->fsm_state = Base;
  1024. }
  1025. break;
  1026. case TOK_FUNCTION:
  1027. current = scope_new(state);
  1028. current->fsm_state = WaitFunctionName;
  1029. TOKEN_SET(&val, scope, state->current);
  1030. break;
  1031. case TOK_StringLiteral:
  1032. if(state->tokens.cnt > 0 && state->tokens.data[state->tokens.cnt-1].type == TOK_PLUS) {
  1033. /* see if can fold */
  1034. yystype *prev_string = &state->tokens.data[state->tokens.cnt-2];
  1035. if(prev_string->type == TOK_StringLiteral) {
  1036. char *str = TOKEN_GET(prev_string, string);
  1037. size_t str_len = strlen(str);
  1038. text = yyget_text(state->scanner);
  1039. leng = yyget_leng(state->scanner);
  1040. /* delete TOK_PLUS */
  1041. free_token(&state->tokens.data[--state->tokens.cnt]);
  1042. str = cli_realloc(str, str_len + leng + 1);
  1043. if (!str)
  1044. break;
  1045. strncpy(str+str_len, text, leng);
  1046. str[str_len + leng] = '\0';
  1047. TOKEN_SET(prev_string, string, str);
  1048. free(val.val.string);
  1049. memset(&val, 0, sizeof(val));
  1050. val.vtype = vtype_undefined;
  1051. continue;
  1052. }
  1053. }
  1054. break;
  1055. }
  1056. if(val.vtype == vtype_undefined) {
  1057. text = yyget_text(state->scanner);
  1058. TOKEN_SET(&val, string, cli_strdup(text));
  1059. abort();
  1060. }
  1061. add_token(state, &val);
  1062. current->last_token = yv;
  1063. memset(&val, 0, sizeof(val));
  1064. val.vtype = vtype_undefined;
  1065. }
  1066. }
  1067. struct parser_state *cli_js_init(void)
  1068. {
  1069. struct parser_state *state = cli_calloc(1, sizeof(*state));
  1070. if(!state)
  1071. return NULL;
  1072. if(!scope_new(state)) {
  1073. free(state);
  1074. return NULL;
  1075. }
  1076. state->global = state->current;
  1077. if(yylex_init(&state->scanner)) {
  1078. scope_done(state->global);
  1079. free(state);
  1080. return NULL;
  1081. }
  1082. cli_dbgmsg(MODULE "cli_js_init() done\n");
  1083. return state;
  1084. }
  1085. /*-------------- tokenizer ---------------------*/
/*
 * Character classes produced by the ctype[] / id_ctype[] lookup tables.
 * The first seven classes are generic; the punctuation classes take the
 * value of the matching TOK_* constant, and yylex() returns such a class
 * directly as the token type (see CASE_SPECIAL_CHAR).
 */
enum char_class {
    Whitespace,
    Slash,
    Operator,
    DQuote,
    SQuote,
    Digit,
    IdStart,
    BracketOpen = TOK_BRACKET_OPEN,
    BracketClose = TOK_BRACKET_CLOSE,
    Comma = TOK_COMMA,
    CurlyOpen = TOK_CURLY_BRACE_OPEN,
    CurlyClose = TOK_CURLY_BRACE_CLOSE,
    ParOpen = TOK_PAR_OPEN,
    ParClose = TOK_PAR_CLOSE,
    Dot = TOK_DOT,
    SemiColon = TOK_SEMICOLON,
    /* bytes the lexer skips in the Initial state */
    Nop
};
/* Two-letter aliases for enum char_class members, used to keep each
 * 16-entry row of the lookup tables below on a single line. */
#define SL Slash
#define DG Digit
#define DQ DQuote
#define SQ SQuote
#define ID IdStart
#define OP Operator
#define WS Whitespace
#define BO BracketOpen
#define BC BracketClose
#define CM Comma
#define CO CurlyOpen
#define CC CurlyClose
#define PO ParOpen
#define PC ParClose
#define DT Dot
#define SC SemiColon
#define NA Nop
/*
 * Class of each byte value when it is seen at the start of a token
 * (indexed by the unsigned byte; 16 entries per row).
 */
static const enum char_class ctype[256] = {
    NA, NA, NA, NA, NA, NA, NA, NA, NA, WS, WS, WS, NA, WS, NA, NA, /* 0x00: tab, LF, VT and CR are whitespace */
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, /* 0x10 */
    WS, OP, DQ, NA, ID, OP, OP, SQ, PO, PC, OP, OP, CM, OP, DT, SL, /* 0x20: space and punctuation, '$' starts an identifier */
    DG, DG, DG, DG, DG, DG, DG, DG, DG, DG, OP, SC, OP, OP, OP, OP, /* 0x30: digits, then : ; < = > ? */
    NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, /* 0x40: '@', A-O */
    ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, BO, ID, BC, OP, ID, /* 0x50: P-Z, '[', backslash (identifier start), ']', '^', '_' */
    NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, /* 0x60: backquote, a-o */
    ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, CO, OP, CC, OP, NA, /* 0x70: p-z, '{', '|', '}', '~', DEL */
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, /* 0x80 and above: all skipped */
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
};
/*
 * Class of each byte value while parseId() is inside an identifier:
 * ID bytes are appended to the identifier, OP marks the backslash (handled
 * specially by parseId()), and every other class ends the identifier.
 */
static const enum char_class id_ctype[256] = {
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, /* 0x00 */
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, /* 0x10 */
    NA, NA, NA, NA, ID, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, /* 0x20: only '$' continues an identifier */
    ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, NA, NA, NA, NA, NA, /* 0x30: digits */
    NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, /* 0x40: A-O */
    ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, OP, NA, NA, ID, /* 0x50: P-Z, backslash is OP, '_' */
    NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, /* 0x60: a-o */
    ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, NA, NA, NA, NA, /* 0x70: p-z */
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, /* 0x80 and above: never part of an identifier */
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
};
/* Expands to a complete `case C:` arm that stores the literal S as the
 * token's cstring value and returns the character class as the token type.
 * Requires `lvalp` and `cClass` to be in scope where it is expanded. */
#define CASE_SPECIAL_CHAR(C, S) case C: TOKEN_SET(lvalp, cstring, (S)); return cClass;
/* Capacity above which textbuf_clean() shrinks the scanner's text buffer. */
#define BUF_KEEP_SIZE 32768
  1160. static void textbuf_clean(struct text_buffer *buf)
  1161. {
  1162. if(buf->capacity > BUF_KEEP_SIZE) {
  1163. char *data= cli_realloc(buf->data, BUF_KEEP_SIZE);
  1164. if (data)
  1165. buf->data = data;
  1166. buf->capacity = BUF_KEEP_SIZE;
  1167. }
  1168. buf->pos = 0;
  1169. }
  1170. static inline int parseString(YYSTYPE *lvalp, yyscan_t scanner, const char q,
  1171. enum tokenizer_state tostate)
  1172. {
  1173. size_t len;
  1174. /* look for " terminating the string */
  1175. const char *start = &scanner->in[scanner->pos], *end = start;
  1176. do {
  1177. const size_t siz = &scanner->in[scanner->insize] - end;
  1178. end = memchr(end, q, siz);
  1179. if(end && end > start && end[-1] == '\\') {
  1180. ++end;
  1181. continue;
  1182. }
  1183. break;
  1184. } while (1);
  1185. if(end && end >= start)
  1186. len = end - start;
  1187. else
  1188. len = scanner->insize - scanner->pos;
  1189. cli_textbuffer_append_normalize(&scanner->buf, start, len);
  1190. if(end) {
  1191. char *str;
  1192. /* skip over end quote */
  1193. scanner->pos += len + 1;
  1194. textbuffer_putc(&scanner->buf, '\0');
  1195. str = textbuffer_done(scanner);
  1196. if (str) {
  1197. TOKEN_SET(lvalp, string, str);
  1198. } else {
  1199. TOKEN_SET(lvalp, cstring, "");
  1200. }
  1201. scanner->state = Initial;
  1202. assert(lvalp->val.string);
  1203. return TOK_StringLiteral;
  1204. } else {
  1205. scanner->pos += len;
  1206. /* unfinished string */
  1207. scanner->state = tostate;
  1208. return 0;
  1209. }
  1210. }
  1211. static inline int parseDQString(YYSTYPE *lvalp, yyscan_t scanner)
  1212. {
  1213. return parseString(lvalp, scanner, '"', DoubleQString);
  1214. }
  1215. static inline int parseSQString(YYSTYPE *lvalp, yyscan_t scanner)
  1216. {
  1217. return parseString(lvalp, scanner, '\'', SingleQString);
  1218. }
  1219. static inline int parseNumber(YYSTYPE *lvalp, yyscan_t scanner)
  1220. {
  1221. const unsigned char *in = (const unsigned char*)scanner->in;
  1222. int is_float = 0;
  1223. while(scanner->pos < scanner->insize) {
  1224. unsigned char c = in[scanner->pos++];
  1225. if(isdigit(c)) {
  1226. textbuffer_putc(&scanner->buf, c);
  1227. continue;
  1228. }
  1229. if(c =='.' && !is_float) {
  1230. is_float = 1;
  1231. textbuffer_putc(&scanner->buf, '.');
  1232. continue;
  1233. }
  1234. if((c=='e' || c=='E') && is_float) {
  1235. textbuffer_putc(&scanner->buf, c);
  1236. if(scanner->pos < scanner->insize) {
  1237. c = in[scanner->pos++];
  1238. if(c == '+' || c == '-' || isdigit(c)) {
  1239. textbuffer_putc(&scanner->buf, c);
  1240. continue;
  1241. }
  1242. }
  1243. }
  1244. scanner->pos--;
  1245. textbuffer_putc(&scanner->buf, '\0');
  1246. scanner->state = Initial;
  1247. if (!scanner->buf.data)
  1248. return 0;
  1249. if(is_float) {
  1250. TOKEN_SET(lvalp, dval, atof(scanner->buf.data));
  1251. return TOK_NumericFloat;
  1252. } else {
  1253. TOKEN_SET(lvalp, ival, atoi(scanner->buf.data));
  1254. return TOK_NumericInt;
  1255. }
  1256. }
  1257. scanner->state = Number;
  1258. return 0;
  1259. }
  1260. static inline int parseId(YYSTYPE *lvalp, yyscan_t scanner)
  1261. {
  1262. const struct keyword *kw;
  1263. const unsigned char *in = (const unsigned char*)scanner->in;
  1264. scanner->state = Initial;
  1265. while(scanner->pos < scanner->insize) {
  1266. unsigned char c = in[scanner->pos++];
  1267. enum char_class cClass = id_ctype[c];
  1268. switch(cClass) {
  1269. case IdStart:
  1270. textbuffer_putc(&scanner->buf, c);
  1271. break;
  1272. case Operator:
  1273. /* the table contains OP only for \ */
  1274. assert(c == '\\');
  1275. if(scanner->pos < scanner->insize &&
  1276. in[scanner->pos++] == 'u') {
  1277. textbuffer_putc(&scanner->buf, c);
  1278. break;
  1279. }
  1280. if(scanner->pos == scanner->insize) {
  1281. scanner->pos++;
  1282. }
  1283. /* else fallthrough */
  1284. default:
  1285. /* character is no longer part of identifier */
  1286. scanner->state = Initial;
  1287. textbuffer_putc(&scanner->buf, '\0');
  1288. scanner->pos--;
  1289. kw = in_word_set(scanner->buf.data, scanner->buf.pos-1);
  1290. if(kw) {
  1291. /* we got a keyword */
  1292. TOKEN_SET(lvalp, cstring, kw->name);
  1293. return kw->val;
  1294. }
  1295. /* it is not a keyword, just an identifier */
  1296. TOKEN_SET(lvalp, cstring, NULL);
  1297. return TOK_IDENTIFIER_NAME;
  1298. }
  1299. }
  1300. scanner->state = Identifier;
  1301. return 0;
  1302. }
  1303. static int parseOperator(YYSTYPE *lvalp, yyscan_t scanner)
  1304. {
  1305. size_t len = MIN(5, scanner->insize - scanner->pos);
  1306. while(len) {
  1307. const struct operator *kw = in_op_set(&scanner->in[scanner->pos], len);
  1308. if(kw) {
  1309. TOKEN_SET(lvalp, cstring, kw->name);
  1310. scanner->pos += len;
  1311. return kw->val;
  1312. }
  1313. len--;
  1314. }
  1315. /* never reached */
  1316. assert(0);
  1317. scanner->pos++;
  1318. TOKEN_SET(lvalp, cstring, NULL);
  1319. return TOK_ERROR;
  1320. }
  1321. static int yylex_init(yyscan_t *scanner)
  1322. {
  1323. *scanner = cli_calloc(1, sizeof(**scanner));
  1324. return *scanner ? 0 : -1;
  1325. }
  1326. static int yylex_destroy(yyscan_t scanner)
  1327. {
  1328. free(scanner->buf.data);
  1329. free(scanner);
  1330. return 0;
  1331. }
  1332. static int yy_scan_bytes(const char *p, size_t len, yyscan_t scanner)
  1333. {
  1334. scanner->in = p;
  1335. scanner->insize = len;
  1336. scanner->pos = 0;
  1337. scanner->lastpos = -1;
  1338. scanner->last_state = Dummy;
  1339. return 0;
  1340. }
  1341. static const char *yyget_text(yyscan_t scanner)
  1342. {
  1343. return scanner->yytext ? scanner->yytext : scanner->buf.data;
  1344. }
  1345. static int yyget_leng(yyscan_t scanner)
  1346. {
  1347. /* we have a \0 too */
  1348. return scanner->yylen ? scanner->yylen: (scanner->buf.pos > 0 ? scanner->buf.pos - 1 : 0);
  1349. }
  1350. static int yylex(YYSTYPE *lvalp, yyscan_t scanner)
  1351. {
  1352. const size_t len = scanner->insize;
  1353. const unsigned char *in = (const unsigned char*)scanner->in;
  1354. unsigned char lookahead;
  1355. enum char_class cClass;
  1356. scanner->yytext = NULL;
  1357. scanner->yylen = 0;
  1358. if(scanner->pos == scanner->lastpos) {
  1359. if(scanner->last_state == scanner->state) {
  1360. cli_dbgmsg(MODULE "infloop detected, skipping character\n");
  1361. scanner->pos++;
  1362. }
  1363. /* its not necesarely an infloop if it changed
  1364. * state, and it shouldn't infloop between states */
  1365. }
  1366. scanner->lastpos = scanner->pos;
  1367. scanner->last_state = scanner->state;
  1368. while(scanner->pos < scanner->insize) {
  1369. switch(scanner->state) {
  1370. case Initial:
  1371. textbuf_clean(&scanner->buf);
  1372. cClass = ctype[in[scanner->pos++]];
  1373. switch(cClass) {
  1374. case Whitespace:
  1375. /* eat whitespace */
  1376. continue;
  1377. case Slash:
  1378. if(scanner->pos < len) {
  1379. lookahead = in[scanner->pos];
  1380. switch(lookahead) {
  1381. case '*':
  1382. scanner->state = MultilineComment;
  1383. scanner->pos++;
  1384. continue;
  1385. case '/':
  1386. scanner->state = SinglelineComment;
  1387. scanner->pos++;
  1388. continue;
  1389. }
  1390. }
  1391. --scanner->pos;
  1392. return parseOperator(lvalp, scanner);
  1393. case Operator:
  1394. --scanner->pos;
  1395. return parseOperator(lvalp, scanner);
  1396. case DQuote:
  1397. return parseDQString(lvalp, scanner);
  1398. case SQuote:
  1399. return parseSQString(lvalp, scanner);
  1400. case Digit:
  1401. --scanner->pos;
  1402. return parseNumber(lvalp, scanner);
  1403. case IdStart:
  1404. --scanner->pos;
  1405. return parseId(lvalp,scanner);
  1406. CASE_SPECIAL_CHAR(BracketOpen, "[");
  1407. CASE_SPECIAL_CHAR(BracketClose, "]");
  1408. CASE_SPECIAL_CHAR(Comma, ",");
  1409. CASE_SPECIAL_CHAR(CurlyOpen, "{");
  1410. CASE_SPECIAL_CHAR(CurlyClose, "}");
  1411. CASE_SPECIAL_CHAR(ParOpen, "(");
  1412. CASE_SPECIAL_CHAR(ParClose, ")");
  1413. CASE_SPECIAL_CHAR(Dot, ".");
  1414. CASE_SPECIAL_CHAR(SemiColon, ";");
  1415. case Nop:
  1416. continue;
  1417. }
  1418. break;
  1419. case DoubleQString:
  1420. return parseString(lvalp, scanner, '"', DoubleQString);
  1421. case SingleQString:
  1422. return parseString(lvalp, scanner, '\'', SingleQString);
  1423. case Identifier:
  1424. return parseId(lvalp, scanner);
  1425. case MultilineComment:
  1426. while(scanner->pos+1 < scanner->insize) {
  1427. if(in[scanner->pos] == '*' && in[scanner->pos+1] == '/') {
  1428. scanner->state = Initial;
  1429. scanner->pos++;
  1430. break;
  1431. }
  1432. scanner->pos++;
  1433. }
  1434. scanner->pos++;
  1435. break;
  1436. case Number:
  1437. return parseNumber(lvalp, scanner);
  1438. case SinglelineComment:
  1439. while(scanner->pos < scanner->insize) {
  1440. /* htmlnorm converts \n to space, so
  1441. * stop on space too */
  1442. if(in[scanner->pos] == '\n' || in[scanner->pos] == ' ')
  1443. break;
  1444. scanner->pos++;
  1445. }
  1446. scanner->state = Initial;
  1447. break;
  1448. default:
  1449. assert(0 && "Not reached");
  1450. }
  1451. }
  1452. return 0;
  1453. }