PageRenderTime 50ms CodeModel.GetById 13ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/src/parser/tokenizer.c

https://github.com/thouis/pandas
C | 2168 lines | 1625 code | 267 blank | 276 comment | 362 complexity | 54f46a25910a827d0c521e359bbe6f18 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /*
  2. Copyright (c) 2012, Lambda Foundry, Inc., except where noted
  3. Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause
  4. BSD
  5. See LICENSE for the license
  6. */
  7. /*
  8. Low-level ascii-file processing for pandas. Combines some elements from
  9. Python's built-in csv module and Warren Weckesser's textreader project on
  10. GitHub. See Python Software Foundation License and BSD licenses for these.
  11. */
  12. #include "tokenizer.h"
  13. #include <ctype.h>
  14. #include <math.h>
  15. #include <float.h>
  16. #define READ_ERROR_OUT_OF_MEMORY 1
  17. /*
  18. * restore:
  19. * RESTORE_NOT (0):
  20. * Free memory, but leave the file position wherever it
  21. * happened to be.
  22. * RESTORE_INITIAL (1):
  23. * Restore the file position to the location at which
  24. * the file_buffer was created.
  25. * RESTORE_FINAL (2):
  26. * Put the file position at the next byte after the
  27. * data read from the file_buffer.
  28. */
  29. #define RESTORE_NOT 0 /* free memory, leave the file position where it is */
  30. #define RESTORE_INITIAL 1 /* rewind to where the file_buffer was created */
  31. #define RESTORE_FINAL 2 /* position just past the data read from the file_buffer */
  /*
   * realloc() wrapper used by the growable parser buffers.
   *
   * On success the (possibly moved) block is returned and errno is cleared,
   * because on some OS X machines errno ends up set to 12 even though the
   * reallocation succeeded:
   *   http://stackoverflow.com/questions/9560609/
   *   different-realloc-behaviour-in-linux-and-osx
   *
   * On failure the original `buffer` pointer is handed back unchanged, so
   * the caller never receives NULL for a buffer that was non-NULL.
   */
  static void *safe_realloc(void *buffer, size_t size) {
      void *grown = realloc(buffer, size);

      if (grown == NULL) {
          /* allocation failed: the old block is still valid, return it */
          return buffer;
      }

      /* discard the spurious errno value described above */
      errno = 0;
      return grown;
  }
  47. void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) {
  48. // column i, starting at 0
  49. self->words = parser->words;
  50. self->col = i;
  51. self->line_start = parser->line_start + start;
  52. }
  53. coliter_t *coliter_new(parser_t *self, int i) {
  54. // column i, starting at 0
  55. coliter_t *iter = (coliter_t*) malloc(sizeof(coliter_t));
  56. if (NULL == iter) {
  57. return NULL;
  58. }
  59. coliter_setup(iter, self, i, 0);
  60. return iter;
  61. }
  62. /* int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error); */
  63. /* uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error); */
  /* Release `ptr`; a NULL pointer is accepted and ignored. */
  static void free_if_not_null(void *ptr) {
      /* free(NULL) is defined to do nothing, so no explicit guard is needed */
      free(ptr);
  }
  67. /*
  68. Parser / tokenizer
  69. */
  /*
   * Ensure `buffer` can hold at least `length + space` elements of `elsize`
   * bytes each, doubling the capacity (starting from 2) until it fits.
   *
   *   buffer    current allocation (may be NULL when *capacity is 0)
   *   length    number of elements currently in use
   *   capacity  in/out: current capacity in elements; updated only when the
   *             buffer was actually enlarged
   *   space     number of additional elements that must fit
   *   elsize    size of one element in bytes
   *   error     out: 0 on success, -1 when the reallocation failed
   *
   * Returns the (possibly moved) buffer. On failure the original buffer is
   * returned untouched and *capacity keeps its old value, so the caller's
   * bookkeeping stays consistent with the memory it really owns.
   */
  static void *grow_buffer(void *buffer, int length, int *capacity,
                           int space, int elsize, int *error) {
      int cap = *capacity;
      void *grown;

      *error = 0;

      /* Enough room already: nothing to do. */
      if (length + space <= cap) {
          return buffer;
      }

      /* Work out the final capacity first so only one realloc is needed. */
      while (length + space > cap) {
          cap = cap ? cap << 1 : 2;
      }

      /* realloc is called directly so that a failure (NULL) is observable. */
      grown = realloc(buffer, (size_t) elsize * (size_t) cap);
      if (grown == NULL) {
          // TODO: error codes
          *error = -1;
          return buffer;
      }

      /* errno may be left set on OS X even though realloc succeeded */
      errno = 0;

      // sigh, multiple return values
      *capacity = cap;
      return grown;
  }
  87. void parser_set_default_options(parser_t *self) {
  88. self->decimal = '.';
  89. self->sci = 'E';
  90. // For tokenization
  91. self->state = START_RECORD;
  92. self->delimiter = ','; // XXX
  93. self->delim_whitespace = 0;
  94. self->doublequote = 0;
  95. self->quotechar = '"';
  96. self->escapechar = 0;
  97. self->lineterminator = '\0'; /* NUL->standard logic */
  98. self->skipinitialspace = 0;
  99. self->quoting = QUOTE_MINIMAL;
  100. self->allow_embedded_newline = 1;
  101. self->strict = 0;
  102. self->error_bad_lines = 0;
  103. self->warn_bad_lines = 0;
  104. self->commentchar = '#';
  105. self->thousands = '\0';
  106. self->skipset = NULL;
  107. self->skip_footer = 0;
  108. }
  109. int get_parser_memory_footprint(parser_t *self) {
  110. return 0;
  111. }
  112. parser_t* parser_new() {
  113. return (parser_t*) calloc(1, sizeof(parser_t));
  114. }
  115. int parser_clear_data_buffers(parser_t *self) {
  116. free_if_not_null(self->stream);
  117. free_if_not_null(self->words);
  118. free_if_not_null(self->word_starts);
  119. free_if_not_null(self->line_start);
  120. free_if_not_null(self->line_fields);
  121. return 0;
  122. }
  123. int parser_cleanup(parser_t *self) {
  124. if (self->cb_cleanup == NULL) {
  125. return 0;
  126. }
  127. if (self->cb_cleanup(self->source) < 0) {
  128. return -1;
  129. }
  130. if (parser_clear_data_buffers(self) < 0) {
  131. return -1;
  132. }
  133. // XXX where to put this
  134. free_if_not_null(self->error_msg);
  135. free_if_not_null(self->warn_msg);
  136. if (self->skipset != NULL)
  137. kh_destroy_int64((kh_int64_t*) self->skipset);
  138. return 0;
  139. }
  140. int parser_init(parser_t *self) {
  141. int sz;
  142. /*
  143. Initialize data buffers
  144. */
  145. self->stream = NULL;
  146. self->words = NULL;
  147. self->word_starts = NULL;
  148. self->line_start = NULL;
  149. self->line_fields = NULL;
  150. // token stream
  151. self->stream = (char*) malloc(STREAM_INIT_SIZE * sizeof(char));
  152. if (self->stream == NULL) {
  153. return PARSER_OUT_OF_MEMORY;
  154. }
  155. self->stream_cap = STREAM_INIT_SIZE;
  156. self->stream_len = 0;
  157. // word pointers and metadata
  158. sz = STREAM_INIT_SIZE / 10;
  159. sz = sz? sz : 1;
  160. self->words = (char**) malloc(sz * sizeof(char*));
  161. self->word_starts = (int*) malloc(sz * sizeof(int));
  162. self->words_cap = sz;
  163. self->words_len = 0;
  164. // line pointers and metadata
  165. self->line_start = (int*) malloc(sz * sizeof(int));
  166. self->line_fields = (int*) malloc(sz * sizeof(int));
  167. self->lines_cap = sz;
  168. self->lines = 0;
  169. self->file_lines = 0;
  170. if (self->stream == NULL || self->words == NULL ||
  171. self->word_starts == NULL || self->line_start == NULL ||
  172. self->line_fields == NULL) {
  173. parser_cleanup(self);
  174. return PARSER_OUT_OF_MEMORY;
  175. }
  176. /* amount of bytes buffered */
  177. self->datalen = 0;
  178. self->datapos = 0;
  179. self->line_start[0] = 0;
  180. self->line_fields[0] = 0;
  181. self->pword_start = self->stream;
  182. self->word_start = 0;
  183. self->state = START_RECORD;
  184. self->error_msg = NULL;
  185. self->warn_msg = NULL;
  186. self->commentchar = '\0';
  187. return 0;
  188. }
  189. void parser_free(parser_t *self) {
  190. // opposite of parser_init
  191. parser_cleanup(self);
  192. free(self);
  193. }
  194. static int make_stream_space(parser_t *self, size_t nbytes) {
  195. int i, status, cap;
  196. void *orig_ptr;
  197. // Can we fit potentially nbytes tokens (+ null terminators) in the stream?
  198. /* TRACE(("maybe growing buffers\n")); */
  199. /*
  200. TOKEN STREAM
  201. */
  202. orig_ptr = (void *) self->stream;
  203. self->stream = (char*) grow_buffer((void *) self->stream,
  204. self->stream_len,
  205. &self->stream_cap, nbytes * 2,
  206. sizeof(char), &status);
  207. if (status != 0) {
  208. return PARSER_OUT_OF_MEMORY;
  209. }
  210. // realloc sets errno when moving buffer?
  211. if (self->stream != orig_ptr) {
  212. // uff
  213. /* TRACE(("Moving word pointers\n")) */
  214. self->pword_start = self->stream + self->word_start;
  215. for (i = 0; i < self->words_len; ++i)
  216. {
  217. self->words[i] = self->stream + self->word_starts[i];
  218. }
  219. }
  220. /*
  221. WORD VECTORS
  222. */
  223. cap = self->words_cap;
  224. self->words = (char**) grow_buffer((void *) self->words,
  225. self->words_len,
  226. &self->words_cap, nbytes,
  227. sizeof(char*), &status);
  228. if (status != 0) {
  229. return PARSER_OUT_OF_MEMORY;
  230. }
  231. // realloc took place
  232. if (cap != self->words_cap) {
  233. self->word_starts = (int*) safe_realloc((void *) self->word_starts,
  234. sizeof(int) * self->words_cap);
  235. if (self->word_starts == NULL) {
  236. return PARSER_OUT_OF_MEMORY;
  237. }
  238. }
  239. /*
  240. LINE VECTORS
  241. */
  242. /*
  243. printf("Line_start: ");
  244. for (j = 0; j < self->lines + 1; ++j) {
  245. printf("%d ", self->line_fields[j]);
  246. }
  247. printf("\n");
  248. printf("lines_cap: %d\n", self->lines_cap);
  249. */
  250. cap = self->lines_cap;
  251. self->line_start = (int*) grow_buffer((void *) self->line_start,
  252. self->lines + 1,
  253. &self->lines_cap, nbytes,
  254. sizeof(int), &status);
  255. if (status != 0) {
  256. return PARSER_OUT_OF_MEMORY;
  257. }
  258. // realloc took place
  259. if (cap != self->lines_cap) {
  260. self->line_fields = (int*) safe_realloc((void *) self->line_fields,
  261. sizeof(int) * self->lines_cap);
  262. if (self->line_fields == NULL) {
  263. return PARSER_OUT_OF_MEMORY;
  264. }
  265. }
  266. /* TRACE(("finished growing buffers\n")); */
  267. return 0;
  268. }
  269. static int push_char(parser_t *self, char c) {
  270. /* TRACE(("pushing %c \n", c)) */
  271. self->stream[self->stream_len++] = c;
  272. return 0;
  273. }
  274. static int P_INLINE end_field(parser_t *self) {
  275. // XXX cruft
  276. self->numeric_field = 0;
  277. // null terminate token
  278. push_char(self, '\0');
  279. // set pointer and metadata
  280. self->words[self->words_len] = self->pword_start;
  281. TRACE(("Char diff: %d\n", self->pword_start - self->words[0]));
  282. TRACE(("Saw word %s at: %d. Total: %d\n",
  283. self->pword_start, self->word_start, self->words_len + 1))
  284. self->word_starts[self->words_len] = self->word_start;
  285. self->words_len++;
  286. // increment line field count
  287. self->line_fields[self->lines]++;
  288. // New field begin in stream
  289. self->pword_start = self->stream + self->stream_len;
  290. self->word_start = self->stream_len;
  291. return 0;
  292. }
  293. static void append_warning(parser_t *self, const char *msg) {
  294. int ex_length;
  295. int length = strlen(msg);
  296. if (self->warn_msg == NULL) {
  297. self->warn_msg = (char*) malloc(length + 1);
  298. strcpy(self->warn_msg, msg);
  299. } else {
  300. ex_length = strlen(self->warn_msg);
  301. self->warn_msg = (char*) safe_realloc(self->warn_msg,
  302. ex_length + length + 1);
  303. strcpy(self->warn_msg + ex_length, msg);
  304. }
  305. }
  306. static int end_line(parser_t *self) {
  307. int fields;
  308. khiter_t k; /* for hash set detection */
  309. int ex_fields = -1;
  310. char *msg;
  311. fields = self->line_fields[self->lines];
  312. TRACE(("Line end, nfields: %d\n", fields));
  313. if (self->lines > 0) {
  314. ex_fields = self->line_fields[self->lines - 1];
  315. }
  316. if (self->skipset != NULL) {
  317. k = kh_get_int64((kh_int64_t*) self->skipset, self->file_lines);
  318. if (k != ((kh_int64_t*)self->skipset)->n_buckets) {
  319. TRACE(("Skipping row %d\n", self->file_lines));
  320. // increment file line count
  321. self->file_lines++;
  322. // skip the tokens from this bad line
  323. self->line_start[self->lines] += fields;
  324. // reset field count
  325. self->line_fields[self->lines] = 0;
  326. return 0;
  327. }
  328. }
  329. if (!(self->lines <= self->header + 1) && fields > ex_fields) {
  330. // increment file line count
  331. self->file_lines++;
  332. // skip the tokens from this bad line
  333. self->line_start[self->lines] += fields;
  334. // reset field count
  335. self->line_fields[self->lines] = 0;
  336. // file_lines is now the _actual_ file line number (starting at 1)
  337. if (self->error_bad_lines) {
  338. self->error_msg = (char*) malloc(100);
  339. sprintf(self->error_msg, "Expected %d fields in line %d, saw %d\n",
  340. ex_fields, self->file_lines, fields);
  341. TRACE(("Error at line %d, %d fields\n", self->file_lines, fields));
  342. return -1;
  343. } else {
  344. // simply skip bad lines
  345. if (self->warn_bad_lines) {
  346. // pass up error message
  347. msg = (char*) malloc(100);
  348. sprintf(msg, "Skipping line %d: expected %d fields, saw %d\n",
  349. self->file_lines, ex_fields, fields);
  350. append_warning(self, msg);
  351. free(msg);
  352. }
  353. }
  354. }
  355. else {
  356. /* missing trailing delimiters */
  357. if (self->lines >= self->header + 1 && self->lines > 0) {
  358. while (fields < ex_fields){
  359. end_field(self);
  360. fields++;
  361. }
  362. }
  363. // increment both line counts
  364. self->file_lines++;
  365. self->lines++;
  366. // good line, set new start point
  367. self->line_start[self->lines] = (self->line_start[self->lines - 1] +
  368. fields);
  369. TRACE(("new line start: %d\n", self->line_start[self->lines]));
  370. // new line start with 0 fields
  371. self->line_fields[self->lines] = 0;
  372. }
  373. TRACE(("Finished line, at %d\n", self->lines));
  374. return 0;
  375. }
  376. int parser_add_skiprow(parser_t *self, int64_t row) {
  377. khiter_t k;
  378. kh_int64_t *set;
  379. int ret = 0;
  380. if (self->skipset == NULL) {
  381. self->skipset = (void*) kh_init_int64();
  382. }
  383. set = (kh_int64_t*) self->skipset;
  384. k = kh_put_int64(set, row, &ret);
  385. set->keys[k] = row;
  386. return 0;
  387. }
  388. static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
  389. int status;
  390. size_t bytes_read;
  391. void *src = self->source;
  392. status = 0;
  393. self->datapos = 0;
  394. self->data = self->cb_io(self->source, nbytes, &bytes_read, &status);
  395. self->datalen = bytes_read;
  396. if (status != REACHED_EOF && self->data == NULL) {
  397. self->error_msg = (char*) malloc(200);
  398. if (status == CALLING_READ_FAILED) {
  399. sprintf(self->error_msg, ("Calling read(nbytes) on source failed. "
  400. "Try engine='python'."));
  401. } else {
  402. sprintf(self->error_msg, "Unknown error in IO callback");
  403. }
  404. return -1;
  405. }
  406. TRACE(("datalen: %d\n", self->datalen));
  407. return status;
  408. }
  409. /*
  410. Tokenization macros and state machine code
  411. */
  412. // printf("pushing %c\n", c);
  /*
   * The macros below are written for use inside the tokenize_* functions.
   * They expand to several bare statements and read or write these locals
   * of the enclosing function: `self`, `stream` (write cursor into the
   * token stream), `slen` (local copy of self->stream_len), `i`,
   * `line_limit` and `start_lines`. They also jump to the labels
   * `parsingerror` and `linelimit`, which the enclosing function must
   * define. Because they are not wrapped in do { } while (0), they must
   * only be used inside a braced block.
   */
  /* PUSH_CHAR(c): append c at the write cursor and advance it. */
  413. #if defined(VERBOSE)
  414. #define PUSH_CHAR(c) \
  415. printf("Pushing %c, slen now: %d\n", c, slen); \
  416. *stream++ = c; \
  417. slen++;
  418. #else
  419. #define PUSH_CHAR(c) \
  420. *stream++ = c; \
  421. slen++;
  422. #endif
  /*
   * END_FIELD(): publish the local stream length, close the current field
   * with end_field(), then reload the local cursor and length.
   */
  423. // This is a little bit of a hack but works for now
  424. #define END_FIELD() \
  425. self->stream_len = slen; \
  426. if (end_field(self) < 0) { \
  427. goto parsingerror; \
  428. } \
  429. stream = self->stream + self->stream_len; \
  430. slen = self->stream_len;
  /*
   * END_LINE_STATE(STATE): close the current line with end_line(), switch
   * to STATE, and leave through `linelimit` once line_limit lines have
   * been added since the call started.
   */
  431. #define END_LINE_STATE(STATE) \
  432. self->stream_len = slen; \
  433. if (end_line(self) < 0) { \
  434. goto parsingerror; \
  435. } \
  436. self->state = STATE; \
  437. if (line_limit > 0 && self->lines == start_lines + line_limit) { \
  438. goto linelimit; \
  439. \
  440. } \
  441. stream = self->stream + self->stream_len; \
  442. slen = self->stream_len;
  /*
   * END_LINE_AND_FIELD_STATE(STATE): close the current line, then
   * immediately close an (empty) field on the new line, switch to STATE
   * and apply the same line-limit check.
   */
  443. #define END_LINE_AND_FIELD_STATE(STATE) \
  444. self->stream_len = slen; \
  445. if (end_line(self) < 0) { \
  446. goto parsingerror; \
  447. } \
  448. if (end_field(self) < 0) { \
  449. goto parsingerror; \
  450. } \
  451. stream = self->stream + self->stream_len; \
  452. slen = self->stream_len; \
  453. self->state = STATE; \
  454. if (line_limit > 0 && self->lines == start_lines + line_limit) { \
  455. goto linelimit; \
  456. \
  457. }
  /* END_LINE(): close the line and return to the START_RECORD state. */
  458. #define END_LINE() END_LINE_STATE(START_RECORD)
  /* IS_WHITESPACE(c): true for a space or a tab only. */
  459. #define IS_WHITESPACE(c) ((c == ' ' || c == '\t'))
  /* Signature shared by the tokenize_* functions. */
  460. typedef int (*parser_op)(parser_t *self, size_t line_limit);
  /*
   * _TOKEN_CLEANUP(): write the local stream length and the loop index
   * back to the parser before leaving a tokenize_* function.
   */
  461. #define _TOKEN_CLEANUP() \
  462. self->stream_len = slen; \
  463. self->datapos = i; \
  464. TRACE(("datapos: %d, datalen: %d\n", self->datapos, self->datalen));
  465. int tokenize_delimited(parser_t *self, size_t line_limit)
  466. {
  467. int i, slen, start_lines;
  468. char c;
  469. char *stream;
  470. char *buf = self->data + self->datapos;
  471. start_lines = self->lines;
  472. if (make_stream_space(self, self->datalen - self->datapos) < 0) {
  473. self->error_msg = "out of memory";
  474. return -1;
  475. }
  476. stream = self->stream + self->stream_len;
  477. slen = self->stream_len;
  478. TRACE(("%s\n", buf));
  479. for (i = self->datapos; i < self->datalen; ++i)
  480. {
  481. // Next character in file
  482. c = *buf++;
  483. TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
  484. i, c, self->file_lines + 1, self->line_fields[self->lines],
  485. self->state));
  486. switch(self->state) {
  487. case START_RECORD:
  488. // start of record
  489. if (c == '\n') {
  490. // \n\r possible?
  491. END_LINE();
  492. break;
  493. } else if (c == '\r') {
  494. self->state = EAT_CRNL;
  495. break;
  496. }
  497. /* normal character - handle as START_FIELD */
  498. self->state = START_FIELD;
  499. /* fallthru */
  500. case START_FIELD:
  501. /* expecting field */
  502. if (c == '\n') {
  503. END_FIELD();
  504. END_LINE();
  505. /* self->state = START_RECORD; */
  506. } else if (c == '\r') {
  507. END_FIELD();
  508. self->state = EAT_CRNL;
  509. }
  510. else if (c == self->quotechar &&
  511. self->quoting != QUOTE_NONE) {
  512. /* start quoted field */
  513. self->state = IN_QUOTED_FIELD;
  514. }
  515. else if (c == self->escapechar) {
  516. /* possible escaped character */
  517. self->state = ESCAPED_CHAR;
  518. }
  519. else if (c == ' ' && self->skipinitialspace)
  520. /* ignore space at start of field */
  521. ;
  522. else if (c == self->delimiter) {
  523. /* save empty field */
  524. END_FIELD();
  525. }
  526. else if (c == self->commentchar) {
  527. END_FIELD();
  528. self->state = EAT_COMMENT;
  529. }
  530. else {
  531. /* begin new unquoted field */
  532. if (self->quoting == QUOTE_NONNUMERIC)
  533. self->numeric_field = 1;
  534. // TRACE(("pushing %c", c));
  535. PUSH_CHAR(c);
  536. self->state = IN_FIELD;
  537. }
  538. break;
  539. case ESCAPED_CHAR:
  540. /* if (c == '\0') */
  541. /* c = '\n'; */
  542. PUSH_CHAR(c);
  543. self->state = IN_FIELD;
  544. break;
  545. case IN_FIELD:
  546. /* in unquoted field */
  547. if (c == '\n') {
  548. END_FIELD();
  549. END_LINE();
  550. /* self->state = START_RECORD; */
  551. } else if (c == '\r') {
  552. END_FIELD();
  553. self->state = EAT_CRNL;
  554. }
  555. else if (c == self->escapechar) {
  556. /* possible escaped character */
  557. self->state = ESCAPED_CHAR;
  558. }
  559. else if (c == self->delimiter) {
  560. // End of field. End of line not reached yet
  561. END_FIELD();
  562. self->state = START_FIELD;
  563. }
  564. else if (c == self->commentchar) {
  565. END_FIELD();
  566. self->state = EAT_COMMENT;
  567. }
  568. else {
  569. /* normal character - save in field */
  570. PUSH_CHAR(c);
  571. }
  572. break;
  573. case IN_QUOTED_FIELD:
  574. /* in quoted field */
  575. if (c == self->escapechar) {
  576. /* Possible escape character */
  577. self->state = ESCAPE_IN_QUOTED_FIELD;
  578. }
  579. else if (c == self->quotechar &&
  580. self->quoting != QUOTE_NONE) {
  581. if (self->doublequote) {
  582. /* doublequote; " represented by "" */
  583. self->state = QUOTE_IN_QUOTED_FIELD;
  584. }
  585. else {
  586. /* end of quote part of field */
  587. self->state = IN_FIELD;
  588. }
  589. }
  590. else {
  591. /* normal character - save in field */
  592. PUSH_CHAR(c);
  593. }
  594. break;
  595. case ESCAPE_IN_QUOTED_FIELD:
  596. /* if (c == '\0') */
  597. /* c = '\n'; */
  598. PUSH_CHAR(c);
  599. self->state = IN_QUOTED_FIELD;
  600. break;
  601. case QUOTE_IN_QUOTED_FIELD:
  602. /* doublequote - seen a quote in an quoted field */
  603. if (self->quoting != QUOTE_NONE && c == self->quotechar) {
  604. /* save "" as " */
  605. PUSH_CHAR(c);
  606. self->state = IN_QUOTED_FIELD;
  607. }
  608. else if (c == self->delimiter) {
  609. // End of field. End of line not reached yet
  610. END_FIELD();
  611. self->state = START_FIELD;
  612. }
  613. else if (c == '\n') {
  614. END_FIELD();
  615. END_LINE();
  616. /* self->state = START_RECORD; */
  617. }
  618. else if (c == '\r') {
  619. END_FIELD();
  620. self->state = EAT_CRNL;
  621. }
  622. else if (!self->strict) {
  623. PUSH_CHAR(c);
  624. self->state = IN_FIELD;
  625. }
  626. else {
  627. self->error_msg = (char*) malloc(50);
  628. sprintf(self->error_msg, "'%c' expected after '%c'",
  629. self->delimiter, self->quotechar);
  630. goto parsingerror;
  631. }
  632. break;
  633. case EAT_CRNL:
  634. if (c == '\n') {
  635. END_LINE();
  636. /* self->state = START_RECORD; */
  637. } else if (c == self->delimiter){
  638. // Handle \r-delimited files
  639. END_LINE_AND_FIELD_STATE(START_FIELD);
  640. } else {
  641. PUSH_CHAR(c);
  642. END_LINE_STATE(IN_FIELD);
  643. }
  644. break;
  645. case EAT_COMMENT:
  646. if (c == '\n') {
  647. END_LINE();
  648. } else if (c == '\r') {
  649. self->state = EAT_CRNL;
  650. }
  651. break;
  652. default:
  653. break;
  654. }
  655. }
  656. _TOKEN_CLEANUP();
  657. TRACE(("Finished tokenizing input\n"))
  658. return 0;
  659. parsingerror:
  660. i++;
  661. _TOKEN_CLEANUP();
  662. return -1;
  663. linelimit:
  664. i++;
  665. _TOKEN_CLEANUP();
  666. return 0;
  667. }
  668. /* custom line terminator */
  669. int tokenize_delim_customterm(parser_t *self, size_t line_limit)
  670. {
  671. int i, slen, start_lines;
  672. char c;
  673. char *stream;
  674. char *buf = self->data + self->datapos;
  675. start_lines = self->lines;
  676. if (make_stream_space(self, self->datalen - self->datapos) < 0) {
  677. self->error_msg = "out of memory";
  678. return -1;
  679. }
  680. stream = self->stream + self->stream_len;
  681. slen = self->stream_len;
  682. TRACE(("%s\n", buf));
  683. for (i = self->datapos; i < self->datalen; ++i)
  684. {
  685. // Next character in file
  686. c = *buf++;
  687. TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
  688. i, c, self->file_lines + 1, self->line_fields[self->lines],
  689. self->state));
  690. switch(self->state) {
  691. case START_RECORD:
  692. // start of record
  693. if (c == self->lineterminator) {
  694. // \n\r possible?
  695. END_LINE();
  696. break;
  697. }
  698. /* normal character - handle as START_FIELD */
  699. self->state = START_FIELD;
  700. /* fallthru */
  701. case START_FIELD:
  702. /* expecting field */
  703. if (c == self->lineterminator) {
  704. END_FIELD();
  705. END_LINE();
  706. /* self->state = START_RECORD; */
  707. }
  708. else if (c == self->quotechar &&
  709. self->quoting != QUOTE_NONE) {
  710. /* start quoted field */
  711. self->state = IN_QUOTED_FIELD;
  712. }
  713. else if (c == self->escapechar) {
  714. /* possible escaped character */
  715. self->state = ESCAPED_CHAR;
  716. }
  717. else if (c == ' ' && self->skipinitialspace)
  718. /* ignore space at start of field */
  719. ;
  720. else if (c == self->delimiter) {
  721. /* save empty field */
  722. END_FIELD();
  723. }
  724. else if (c == self->commentchar) {
  725. END_FIELD();
  726. self->state = EAT_COMMENT;
  727. }
  728. else {
  729. /* begin new unquoted field */
  730. if (self->quoting == QUOTE_NONNUMERIC)
  731. self->numeric_field = 1;
  732. // TRACE(("pushing %c", c));
  733. PUSH_CHAR(c);
  734. self->state = IN_FIELD;
  735. }
  736. break;
  737. case ESCAPED_CHAR:
  738. /* if (c == '\0') */
  739. /* c = '\n'; */
  740. PUSH_CHAR(c);
  741. self->state = IN_FIELD;
  742. break;
  743. case IN_FIELD:
  744. /* in unquoted field */
  745. if (c == self->lineterminator) {
  746. END_FIELD();
  747. END_LINE();
  748. /* self->state = START_RECORD; */
  749. }
  750. else if (c == self->escapechar) {
  751. /* possible escaped character */
  752. self->state = ESCAPED_CHAR;
  753. }
  754. else if (c == self->delimiter) {
  755. // End of field. End of line not reached yet
  756. END_FIELD();
  757. self->state = START_FIELD;
  758. }
  759. else if (c == self->commentchar) {
  760. END_FIELD();
  761. self->state = EAT_COMMENT;
  762. }
  763. else {
  764. /* normal character - save in field */
  765. PUSH_CHAR(c);
  766. }
  767. break;
  768. case IN_QUOTED_FIELD:
  769. /* in quoted field */
  770. if (c == self->escapechar) {
  771. /* Possible escape character */
  772. self->state = ESCAPE_IN_QUOTED_FIELD;
  773. }
  774. else if (c == self->quotechar &&
  775. self->quoting != QUOTE_NONE) {
  776. if (self->doublequote) {
  777. /* doublequote; " represented by "" */
  778. self->state = QUOTE_IN_QUOTED_FIELD;
  779. }
  780. else {
  781. /* end of quote part of field */
  782. self->state = IN_FIELD;
  783. }
  784. }
  785. else {
  786. /* normal character - save in field */
  787. PUSH_CHAR(c);
  788. }
  789. break;
  790. case ESCAPE_IN_QUOTED_FIELD:
  791. PUSH_CHAR(c);
  792. self->state = IN_QUOTED_FIELD;
  793. break;
  794. case QUOTE_IN_QUOTED_FIELD:
  795. /* doublequote - seen a quote in an quoted field */
  796. if (self->quoting != QUOTE_NONE && c == self->quotechar) {
  797. /* save "" as " */
  798. PUSH_CHAR(c);
  799. self->state = IN_QUOTED_FIELD;
  800. }
  801. else if (c == self->delimiter) {
  802. // End of field. End of line not reached yet
  803. END_FIELD();
  804. self->state = START_FIELD;
  805. }
  806. else if (c == self->lineterminator) {
  807. END_FIELD();
  808. END_LINE();
  809. /* self->state = START_RECORD; */
  810. }
  811. else if (!self->strict) {
  812. PUSH_CHAR(c);
  813. self->state = IN_FIELD;
  814. }
  815. else {
  816. self->error_msg = (char*) malloc(50);
  817. sprintf(self->error_msg, "'%c' expected after '%c'",
  818. self->delimiter, self->quotechar);
  819. goto parsingerror;
  820. }
  821. break;
  822. case EAT_COMMENT:
  823. if (c == self->lineterminator) {
  824. END_LINE();
  825. }
  826. break;
  827. default:
  828. break;
  829. }
  830. }
  831. _TOKEN_CLEANUP();
  832. TRACE(("Finished tokenizing input\n"))
  833. return 0;
  834. parsingerror:
  835. i++;
  836. _TOKEN_CLEANUP();
  837. return -1;
  838. linelimit:
  839. i++;
  840. _TOKEN_CLEANUP();
  841. return 0;
  842. }
  843. int tokenize_whitespace(parser_t *self, size_t line_limit)
  844. {
  845. int i, slen, start_lines;
  846. char c;
  847. char *stream;
  848. char *buf = self->data + self->datapos;
  849. start_lines = self->lines;
  850. if (make_stream_space(self, self->datalen - self->datapos) < 0) {
  851. self->error_msg = "out of memory";
  852. return -1;
  853. }
  854. stream = self->stream + self->stream_len;
  855. slen = self->stream_len;
  856. TRACE(("%s\n", buf));
  857. for (i = self->datapos; i < self->datalen; ++i)
  858. {
  859. // Next character in file
  860. c = *buf++;
  861. TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
  862. i, c, self->file_lines + 1, self->line_fields[self->lines],
  863. self->state));
  864. switch(self->state) {
  865. case EAT_WHITESPACE:
  866. if (!IS_WHITESPACE(c)) {
  867. // END_FIELD();
  868. self->state = START_FIELD;
  869. // Fall through to subsequent state
  870. } else {
  871. // if whitespace char, keep slurping
  872. break;
  873. }
  874. case START_RECORD:
  875. // start of record
  876. if (c == '\n') {
  877. // \n\r possible?
  878. END_LINE();
  879. break;
  880. } else if (c == '\r') {
  881. self->state = EAT_CRNL;
  882. break;
  883. } else if (IS_WHITESPACE(c)) {
  884. END_FIELD();
  885. self->state = EAT_WHITESPACE;
  886. break;
  887. } else {
  888. /* normal character - handle as START_FIELD */
  889. self->state = START_FIELD;
  890. }
  891. /* fallthru */
  892. case START_FIELD:
  893. /* expecting field */
  894. if (c == '\n') {
  895. END_FIELD();
  896. END_LINE();
  897. /* self->state = START_RECORD; */
  898. } else if (c == '\r') {
  899. END_FIELD();
  900. self->state = EAT_CRNL;
  901. }
  902. else if (c == self->quotechar &&
  903. self->quoting != QUOTE_NONE) {
  904. /* start quoted field */
  905. self->state = IN_QUOTED_FIELD;
  906. }
  907. else if (c == self->escapechar) {
  908. /* possible escaped character */
  909. self->state = ESCAPED_CHAR;
  910. }
  911. /* else if (c == ' ' && self->skipinitialspace) */
  912. /* /\* ignore space at start of field *\/ */
  913. /* ; */
  914. else if (IS_WHITESPACE(c)) {
  915. self->state = EAT_WHITESPACE;
  916. }
  917. else if (c == self->commentchar) {
  918. END_FIELD();
  919. self->state = EAT_COMMENT;
  920. }
  921. else {
  922. /* begin new unquoted field */
  923. if (self->quoting == QUOTE_NONNUMERIC)
  924. self->numeric_field = 1;
  925. // TRACE(("pushing %c", c));
  926. PUSH_CHAR(c);
  927. self->state = IN_FIELD;
  928. }
  929. break;
  930. case ESCAPED_CHAR:
  931. /* if (c == '\0') */
  932. /* c = '\n'; */
  933. PUSH_CHAR(c);
  934. self->state = IN_FIELD;
  935. break;
  936. case IN_FIELD:
  937. /* in unquoted field */
  938. if (c == '\n') {
  939. END_FIELD();
  940. END_LINE();
  941. /* self->state = START_RECORD; */
  942. } else if (c == '\r') {
  943. END_FIELD();
  944. self->state = EAT_CRNL;
  945. }
  946. else if (c == self->escapechar) {
  947. /* possible escaped character */
  948. self->state = ESCAPED_CHAR;
  949. }
  950. else if (IS_WHITESPACE(c)) {
  951. // End of field. End of line not reached yet
  952. END_FIELD();
  953. self->state = EAT_WHITESPACE;
  954. }
  955. else if (c == self->commentchar) {
  956. END_FIELD();
  957. self->state = EAT_COMMENT;
  958. }
  959. else {
  960. /* normal character - save in field */
  961. PUSH_CHAR(c);
  962. }
  963. break;
  964. case IN_QUOTED_FIELD:
  965. /* in quoted field */
  966. if (c == self->escapechar) {
  967. /* Possible escape character */
  968. self->state = ESCAPE_IN_QUOTED_FIELD;
  969. }
  970. else if (c == self->quotechar &&
  971. self->quoting != QUOTE_NONE) {
  972. if (self->doublequote) {
  973. /* doublequote; " represented by "" */
  974. self->state = QUOTE_IN_QUOTED_FIELD;
  975. }
  976. else {
  977. /* end of quote part of field */
  978. self->state = IN_FIELD;
  979. }
  980. }
  981. else {
  982. /* normal character - save in field */
  983. PUSH_CHAR(c);
  984. }
  985. break;
  986. case ESCAPE_IN_QUOTED_FIELD:
  987. /* if (c == '\0') */
  988. /* c = '\n'; */
  989. PUSH_CHAR(c);
  990. self->state = IN_QUOTED_FIELD;
  991. break;
  992. case QUOTE_IN_QUOTED_FIELD:
  993. /* doublequote - seen a quote in an quoted field */
  994. if (self->quoting != QUOTE_NONE && c == self->quotechar) {
  995. /* save "" as " */
  996. PUSH_CHAR(c);
  997. self->state = IN_QUOTED_FIELD;
  998. }
  999. else if (IS_WHITESPACE(c)) {
  1000. // End of field. End of line not reached yet
  1001. END_FIELD();
  1002. self->state = EAT_WHITESPACE;
  1003. }
  1004. else if (c == '\n') {
  1005. END_FIELD();
  1006. END_LINE();
  1007. /* self->state = START_RECORD; */
  1008. }
  1009. else if (c == '\r') {
  1010. END_FIELD();
  1011. self->state = EAT_CRNL;
  1012. }
  1013. else if (!self->strict) {
  1014. PUSH_CHAR(c);
  1015. self->state = IN_FIELD;
  1016. }
  1017. else {
  1018. self->error_msg = (char*) malloc(50);
  1019. sprintf(self->error_msg, "'%c' expected after '%c'",
  1020. self->delimiter, self->quotechar);
  1021. goto parsingerror;
  1022. }
  1023. break;
  1024. case EAT_CRNL:
  1025. if (c == '\n') {
  1026. END_LINE();
  1027. /* self->state = START_RECORD; */
  1028. } else if (IS_WHITESPACE(c)){
  1029. // Handle \r-delimited files
  1030. END_LINE_AND_FIELD_STATE(EAT_WHITESPACE);
  1031. } else {
  1032. PUSH_CHAR(c);
  1033. END_LINE_STATE(IN_FIELD);
  1034. }
  1035. break;
  1036. case EAT_COMMENT:
  1037. if (c == '\n') {
  1038. END_LINE();
  1039. } else if (c == '\r') {
  1040. self->state = EAT_CRNL;
  1041. }
  1042. break;
  1043. default:
  1044. break;
  1045. }
  1046. }
  1047. _TOKEN_CLEANUP();
  1048. TRACE(("Finished tokenizing input\n"))
  1049. return 0;
  1050. parsingerror:
  1051. i++;
  1052. _TOKEN_CLEANUP();
  1053. return -1;
  1054. linelimit:
  1055. i++;
  1056. _TOKEN_CLEANUP();
  1057. return 0;
  1058. }
  1059. static int parser_handle_eof(parser_t *self) {
  1060. TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state))
  1061. if (self->datalen == 0 && (self->state != START_RECORD)) {
  1062. // test cases needed here
  1063. // TODO: empty field at end of line
  1064. TRACE(("handling eof\n"));
  1065. if (self->state == IN_FIELD || self->state == START_FIELD) {
  1066. if (end_field(self) < 0)
  1067. return -1;
  1068. } else if (self->state == QUOTE_IN_QUOTED_FIELD) {
  1069. if (end_field(self) < 0)
  1070. return -1;
  1071. } else if (self->state == IN_QUOTED_FIELD) {
  1072. self->error_msg = (char*) malloc(100);
  1073. sprintf(self->error_msg, "EOF inside string starting at line %d",
  1074. self->file_lines);
  1075. return -1;
  1076. }
  1077. if (end_line(self) < 0)
  1078. return -1;
  1079. return 0;
  1080. }
  1081. else if (self->datalen == 0 && (self->state == START_RECORD)) {
  1082. return 0;
  1083. }
  1084. return -1;
  1085. }
  1086. int parser_consume_rows(parser_t *self, size_t nrows) {
  1087. int i, offset, word_deletions, char_count;
  1088. if (nrows > self->lines) {
  1089. nrows = self->lines;
  1090. }
  1091. /* do nothing */
  1092. if (nrows == 0)
  1093. return 0;
  1094. /* cannot guarantee that nrows + 1 has been observed */
  1095. word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1];
  1096. char_count = (self->word_starts[word_deletions - 1] +
  1097. strlen(self->words[word_deletions - 1]) + 1);
  1098. TRACE(("Deleting %d words, %d chars\n", word_deletions, char_count));
  1099. /* move stream, only if something to move */
  1100. if (char_count < self->stream_len) {
  1101. memmove((void*) self->stream, (void*) (self->stream + char_count),
  1102. self->stream_len - char_count);
  1103. }
  1104. /* buffer counts */
  1105. self->stream_len -= char_count;
  1106. /* move token metadata */
  1107. for (i = 0; i < self->words_len - word_deletions; ++i) {
  1108. offset = i + word_deletions;
  1109. self->words[i] = self->words[offset] - char_count;
  1110. self->word_starts[i] = self->word_starts[offset] - char_count;
  1111. }
  1112. self->words_len -= word_deletions;
  1113. /* move current word pointer to stream */
  1114. self->pword_start -= char_count;
  1115. self->word_start -= char_count;
  1116. /*
  1117. printf("Line_start: ");
  1118. for (i = 0; i < self->lines + 1; ++i) {
  1119. printf("%d ", self->line_fields[i]);
  1120. }
  1121. printf("\n");
  1122. */
  1123. /* move line metadata */
  1124. for (i = 0; i < self->lines - nrows + 1; ++i)
  1125. {
  1126. offset = i + nrows;
  1127. self->line_start[i] = self->line_start[offset] - word_deletions;
  1128. /* TRACE(("First word in line %d is now %s\n", i, */
  1129. /* self->words[self->line_start[i]])); */
  1130. self->line_fields[i] = self->line_fields[offset];
  1131. }
  1132. self->lines -= nrows;
  1133. /* self->line_fields[self->lines] = 0; */
  1134. return 0;
  1135. }
  /*
   * Smallest power of two that is >= sz (1 for sz == 0).
   *
   * When no power of two >= sz is representable in size_t, sz itself is
   * returned instead of letting the doubling wrap around to 0, which would
   * otherwise spin forever.
   */
  static size_t _next_pow2(size_t sz) {
      const size_t half_max = ((size_t) -1) / 2;
      size_t result = 1;

      while (result < sz) {
          if (result > half_max) {
              /* doubling would overflow */
              return sz;
          }
          result *= 2;
      }
      return result;
  }
  1141. int parser_trim_buffers(parser_t *self) {
  1142. /*
  1143. Free memory
  1144. */
  1145. size_t new_cap;
  1146. /* trim stream */
  1147. new_cap = _next_pow2(self->stream_len) + 1;
  1148. if (new_cap < self->stream_cap) {
  1149. self->stream = safe_realloc((void*) self->stream, new_cap);
  1150. self->stream_cap = new_cap;
  1151. }
  1152. /* trim words, word_starts */
  1153. new_cap = _next_pow2(self->words_len) + 1;
  1154. if (new_cap < self->words_cap) {
  1155. self->words = (char**) safe_realloc((void*) self->words,
  1156. new_cap * sizeof(char*));
  1157. self->word_starts = (int*) safe_realloc((void*) self->word_starts,
  1158. new_cap * sizeof(int));
  1159. self->words_cap = new_cap;
  1160. }
  1161. /* trim line_start, line_fields */
  1162. new_cap = _next_pow2(self->lines) + 1;
  1163. if (new_cap < self->lines_cap) {
  1164. self->line_start = (int*) safe_realloc((void*) self->line_start,
  1165. new_cap * sizeof(int));
  1166. self->line_fields = (int*) safe_realloc((void*) self->line_fields,
  1167. new_cap * sizeof(int));
  1168. self->lines_cap = new_cap;
  1169. }
  1170. return 0;
  1171. }
  1172. void debug_print_parser(parser_t *self) {
  1173. int j, line;
  1174. char *token;
  1175. for (line = 0; line < self->lines; ++line)
  1176. {
  1177. printf("(Parsed) Line %d: ", line);
  1178. for (j = 0; j < self->line_fields[j]; ++j)
  1179. {
  1180. token = self->words[j + self->line_start[line]];
  1181. printf("%s ", token);
  1182. }
  1183. printf("\n");
  1184. }
  1185. }
  1186. int clear_parsed_lines(parser_t *self, size_t nlines) {
  1187. // TODO. move data up in stream, shift relevant word pointers
  1188. return 0;
  1189. }
  1190. /*
  1191. nrows : number of rows to tokenize (or until reach EOF)
  1192. all : tokenize all the data vs. certain number of rows
  1193. */
  1194. int _tokenize_helper(parser_t *self, size_t nrows, int all) {
  1195. parser_op tokenize_bytes;
  1196. int status = 0;
  1197. int start_lines = self->lines;
  1198. if (self->delim_whitespace) {
  1199. tokenize_bytes = tokenize_whitespace;
  1200. } else if (self->lineterminator == '\0') {
  1201. tokenize_bytes = tokenize_delimited;
  1202. } else {
  1203. tokenize_bytes = tokenize_delim_customterm;
  1204. }
  1205. if (self->state == FINISHED) {
  1206. return 0;
  1207. }
  1208. TRACE(("Asked to tokenize %d rows\n", (int) nrows));
  1209. while (1) {
  1210. if (!all && self->lines - start_lines >= nrows)
  1211. break;
  1212. if (self->datapos == self->datalen) {
  1213. status = parser_buffer_bytes(self, self->chunksize);
  1214. if (status == REACHED_EOF) {
  1215. // close out last line
  1216. status = parser_handle_eof(self);
  1217. self->state = FINISHED;
  1218. break;
  1219. } else if (status != 0) {
  1220. return status;
  1221. }
  1222. }
  1223. TRACE(("Trying to process %d bytes\n", self->datalen - self->datapos));
  1224. /* TRACE(("sourcetype: %c, status: %d\n", self->sourcetype, status)); */
  1225. status = tokenize_bytes(self, nrows);
  1226. /* debug_print_parser(self); */
  1227. if (status < 0) {
  1228. // XXX
  1229. TRACE(("Status %d returned from tokenize_bytes, breaking\n",
  1230. status));
  1231. status = -1;
  1232. break;
  1233. }
  1234. }
  1235. TRACE(("leaving tokenize_helper\n"));
  1236. return status;
  1237. }
  1238. int tokenize_nrows(parser_t *self, size_t nrows) {
  1239. int status = _tokenize_helper(self, nrows, 0);
  1240. return status;
  1241. }
  1242. int tokenize_all_rows(parser_t *self) {
  1243. int status = _tokenize_helper(self, -1, 1);
  1244. return status;
  1245. }
  1246. void test_count_lines(char *fname) {
  1247. clock_t start = clock();
  1248. char *buffer, *tmp;
  1249. size_t bytes, lines = 0;
  1250. int i;
  1251. FILE *fp = fopen(fname, "rb");
  1252. buffer = (char*) malloc(CHUNKSIZE * sizeof(char));
  1253. while(1) {
  1254. tmp = buffer;
  1255. bytes = fread((void *) buffer, sizeof(char), CHUNKSIZE, fp);
  1256. // printf("Read %d bytes\n", bytes);
  1257. if (bytes == 0) {
  1258. break;
  1259. }
  1260. for (i = 0; i < bytes; ++i)
  1261. {
  1262. if (*tmp++ == '\n') {
  1263. lines++;
  1264. }
  1265. }
  1266. }
  1267. printf("Saw %d lines\n", (int) lines);
  1268. free(buffer);
  1269. fclose(fp);
  1270. printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
  1271. }
  1272. // forward declaration
  1273. static double xstrtod(const char *p, char **q, char decimal, char sci, int skip_trailing);
  1274. P_INLINE void lowercase(char *p) {
  1275. for ( ; *p; ++p) *p = tolower(*p);
  1276. }
  1277. P_INLINE void uppercase(char *p) {
  1278. for ( ; *p; ++p) *p = toupper(*p);
  1279. }
  1280. /*
  1281. * `item` must be the nul-terminated string that is to be
  1282. * converted to a double.
  1283. *
  1284. * To be successful, to_double() must use *all* the characters
  1285. * in `item`. E.g. "1.q25" will fail. Leading and trailing
  1286. * spaces are allowed.
  1287. *
  1288. * `sci` is the scientific notation exponent character, usually
  1289. * either 'E' or 'D'. Case is ignored.
  1290. *
  1291. * `decimal` is the decimal point character, usually either
  1292. * '.' or ','.
  1293. *
  1294. */
  1295. int to_double(char *item, double *p_value, char sci, char decimal)
  1296. {
  1297. char *p_end;
  1298. *p_value = xstrtod(item, &p_end, decimal, sci, TRUE);
  1299. return (errno == 0) && (!*p_end);
  1300. }
  1301. int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal)
  1302. {
  1303. char *p_end;
  1304. *p_real = xstrtod(item, &p_end, decimal, sci, FALSE);
  1305. if (*p_end == '\0') {
  1306. *p_imag = 0.0;
  1307. return errno == 0;
  1308. }
  1309. if (*p_end == 'i' || *p_end == 'j') {
  1310. *p_imag = *p_real;
  1311. *p_real = 0.0;
  1312. ++p_end;
  1313. }
  1314. else {
  1315. if (*p_end == '+') {
  1316. ++p_end;
  1317. }
  1318. *p_imag = xstrtod(p_end, &p_end, decimal, sci, FALSE);
  1319. if (errno || ((*p_end != 'i') && (*p_end != 'j'))) {
  1320. return FALSE;
  1321. }
  1322. ++p_end;
  1323. }
  1324. while(*p_end == ' ') {
  1325. ++p_end;
  1326. }
  1327. return *p_end == '\0';
  1328. }
  1329. int P_INLINE to_longlong(char *item, long long *p_value)
  1330. {
  1331. char *p_end;
  1332. // Try integer conversion. We explicitly give the base to be 10. If
  1333. // we used 0, strtoll() would convert '012' to 10, because the leading 0 in
  1334. // '012' signals an octal number in C. For a general purpose reader, that
  1335. // would be a bug, not a feature.
  1336. *p_value = strtoll(item, &p_end, 10);
  1337. // Allow trailing spaces.
  1338. while (isspace(*p_end)) ++p_end;
  1339. return (errno == 0) && (!*p_end);
  1340. }
  1341. int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep)
  1342. {
  1343. int i, pos, status, n = strlen(item), count = 0;
  1344. char *tmp;
  1345. char *p_end;
  1346. for (i = 0; i < n; ++i)
  1347. {
  1348. if (*(item + i) == tsep) {
  1349. count++;
  1350. }
  1351. }
  1352. if (count == 0) {
  1353. return to_longlong(item, p_value);
  1354. }
  1355. tmp = (char*) malloc((n - count + 1) * sizeof(char));
  1356. if (tmp == NULL) {
  1357. return 0;
  1358. }
  1359. pos = 0;
  1360. for (i = 0; i < n; ++i)
  1361. {
  1362. if (item[i] != tsep)
  1363. tmp[pos++] = item[i];
  1364. }
  1365. tmp[pos] = '\0';
  1366. status = to_longlong(tmp, p_value);
  1367. free(tmp);
  1368. return status;
  1369. }
  /*
   * Return 1 when `item`, upper-cased byte by byte, equals `upper`
   * (which must already be upper case), else 0.
   */
  static int boolean_token_matches(const char *item, const char *upper) {
      while (*item != '\0' && *upper != '\0') {
          if (toupper((unsigned char) *item) != (unsigned char) *upper) {
              return 0;
          }
          ++item;
          ++upper;
      }
      /* equal only if both strings ended together */
      return *item == *upper;
  }

  /*
   * Interpret `item` as a boolean, ignoring case.
   *
   * "TRUE" stores 1 in *val, "FALSE" stores 0; both return 0.
   * Anything else returns -1 and leaves *val untouched.
   *
   * The comparison is done in place, so no temporary copy is allocated and
   * the function cannot fail for lack of memory.
   */
  int to_boolean(char *item, uint8_t *val) {
      if (boolean_token_matches(item, "TRUE")) {
          *val = 1;
          return 0;
      }
      if (boolean_token_matches(item, "FALSE")) {
          *val = 0;
          return 0;
      }
      return -1;
  }
  // #define TEST
  #ifdef TEST
  /*
   * Manual smoke test, compiled only when TEST is defined: converts a number
   * containing a thousands separator and prints the outcome.
   */
  int main(int argc, char *argv[])
  {
      long long xi = 0;
      int status;
      char s[] = "123,789";

      (void) argc;
      (void) argv;

      status = to_longlong_thousands(s, &xi, ',');

      printf("s = '%s'\n", s);
      printf("status = %d\n", status);
      /* print the full long long; casting to int would truncate large values */
      printf("x = %lld\n", xi);

      return 0;
  }
  #endif
  1417. // ---------------------------------------------------------------------------
  1418. // Implementation of xstrtod
  1419. //
  1420. // strtod.c
  1421. //
  1422. // Convert string to double
  1423. //
  1424. // Copyright (C) 2002 Michael Ringgaard. All rights reserved.
  1425. //
  1426. // Redistribution and use in source and binary forms, with or without
  1427. // modification, are permitted provided that the following conditions
  1428. // are met:
  1429. //
  1430. // 1. Redistributions of source code must retain the above copyright
  1431. // notice, this list of conditions and the following disclaimer.
  1432. // 2. Redistributions in binary form must reproduce the above copyright
  1433. // notice, this list of conditions and the following disclaimer in the
  1434. // documentation and/or other materials provided with the distribution.
  1435. // 3. Neither the name of the project nor the names of its contributors
  1436. // may be used to endorse or promote products derived from this software
  1437. // without specific prior written permission.
  1438. //
  1439. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  1440. // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  1441. // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  1442. // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
  1443. // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  1444. // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  1445. // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  1446. // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  1447. // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  1448. // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  1449. // SUCH DAMAGE.
  1450. //
  1451. // -----------------------------------------------------------------------
  1452. // Modifications by Warren Weckesser, March 2011:
  1453. // * Rename strtod() to xstrtod().
  1454. // * Added decimal and sci arguments.
  1455. // * Skip trailing spaces.
  1456. // * Commented out the other functions.
  1457. //
  /*
   * Convert the initial portion of `str` to a double.
   *
   *   str           : nul-terminated input; leading whitespace is skipped
   *   endptr        : if non-NULL, receives the position where parsing stopped.
   *                   It is set on every return path; when no digits were
   *                   found it is set to `str`.
   *   decimal       : decimal point character
   *   sci           : exponent character (compared case-insensitively)
   *   skip_trailing : non-zero to also skip whitespace after the number
   *
   * errno is reset to 0 on entry and set to ERANGE when no digits are found,
   * when the decimal exponent falls outside [DBL_MIN_EXP, DBL_MAX_EXP], or
   * when the scaled result overflows to +/-HUGE_VAL.
   */
  static double xstrtod(const char *str, char **endptr, char decimal,
                        char sci, int skip_trailing)
  {
      double number;
      int exponent;
      int negative;
      char *p = (char *) str;
      double p10;
      int n;
      int num_digits;
      int num_decimals;

      errno = 0;

      // Skip leading whitespace (ctype functions need unsigned char values)
      while (isspace((unsigned char) *p)) p++;

      // Handle optional sign
      negative = 0;
      switch (*p)
      {
          case '-': negative = 1; // Fall through to increment position
          case '+': p++;
      }

      number = 0.;
      exponent = 0;
      num_digits = 0;
      num_decimals = 0;

      // Process string of digits
      while (isdigit((unsigned char) *p))
      {
          number = number * 10. + (*p - '0');
          p++;
          num_digits++;
      }

      // Process decimal part
      if (*p == decimal)
      {
          p++;

          while (isdigit((unsigned char) *p))
          {
              number = number * 10. + (*p - '0');
              p++;
              num_digits++;
              num_decimals++;
          }

          exponent -= num_decimals;
      }

      if (num_digits == 0)
      {
          errno = ERANGE;
          // Nothing was converted: report the start of the input
          if (endptr) *endptr = (char *) str;
          return 0.0;
      }

      // Correct for sign
      if (negative) number = -number;

      // Process an exponent string. The *p check keeps a '\0' exponent
      // character from matching the terminator and reading past the string.
      if (*p && toupper((unsigned char) *p) == toupper((unsigned char) sci))
      {
          // Handle optional sign
          negative = 0;
          switch (*++p)
          {
              case '-': negative = 1;   // Fall through to increment pos
              case '+': p++;
          }

          // Process string of digits. Accumulation stops growing once the
          // value is far beyond any representable exponent, so absurdly long
          // digit runs cannot overflow the int; such values are rejected by
          // the range check below anyway.
          n = 0;
          while (isdigit((unsigned char) *p))
          {
              if (n < 1000000)
                  n = n * 10 + (*p - '0');
              p++;
          }

          if (negative)
              exponent -= n;
          else
              exponent += n;
      }

      if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP)
      {
          errno = ERANGE;
          if (endptr) *endptr = p;
          return HUGE_VAL;
      }

      // Scale the result by 10^exponent using repeated squaring
      p10 = 10.;
      n = exponent;
      if (n < 0) n = -n;
      while (n)
      {
          if (n & 1)
          {
              if (exponent < 0)
                  number /= p10;
              else
                  number *= p10;
          }
          n >>= 1;
          p10 *= p10;
      }

      // Overflow can happen in either direction
      if (number == HUGE_VAL || number == -HUGE_VAL) {
          errno = ERANGE;
      }

      if (skip_trailing) {
          // Skip trailing whitespace
          while (isspace((unsigned char) *p)) p++;
      }

      if (endptr) *endptr = p;

      return number;
  }
  1563. /*
  1564. float strtof(const char *str, char **endptr)
  1565. {
  1566. return (float) strtod(str, endptr);
  1567. }
  1568. long double strtold(const char *str, char **endptr)
  1569. {
  1570. return strtod(str, endptr);
  1571. }
  1572. double atof(const char *str)
  1573. {
  1574. return strtod(str, NULL);
  1575. }
  1576. */
  1577. // End of xstrtod code
  1578. // ---------------------------------------------------------------------------
  1579. int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
  1580. int *error, char tsep)
  1581. {
  1582. const char *p = (const char *) p_item;
  1583. int isneg = 0;
  1584. int64_t number = 0;
  1585. int d;
  1586. // Skip leading spaces.
  1587. while (isspace(*p)) {
  1588. ++p;
  1589. }
  1590. // Handle sign.
  1591. if (*p == '-') {
  1592. isneg = 1;
  1593. ++p;
  1594. }
  1595. else if (*p == '+') {
  1596. p++;
  1597. }
  1598. // Check that there is a first digit.
  1599. if (!isdigit(*p)) {
  1600. // Error...
  1601. *error = ERROR_NO_DIGITS;
  1602. return 0;
  1603. }
  1604. if (isneg) {
  1605. // If number is greater than pre_min, at least one more digit
  1606. // can be processed without overflowing.
  1607. int dig_pre_min = -(int_min % 10);
  1608. int64_t pre_min = int_min / 10;
  1609. // Process the digits.
  1610. d = *p;
  1611. if (tsep != '\0') {
  1612. while (1) {
  1613. if (d == tsep) {
  1614. d = *++p;
  1615. continue;
  1616. } else if (!isdigit(d)) {
  1617. break;
  1618. }
  1619. if ((number > pre_min) ||
  1620. ((number == pre_min) && (d - '0' <= dig_pre_min))) {
  1621. number = number * 10 - (d - '0');
  1622. d = *++p;
  1623. }
  1624. else {
  1625. *error = ERROR_OVERFLOW;
  1626. return 0;
  1627. }
  1628. }
  1629. } else {
  1630. while (isdigit(d)) {
  1631. if ((number > pre_min) ||
  1632. ((number == pre_min) && (d - '0' <= dig_pre_min))) {
  1633. number = number * 10 - (d - '0');
  1634. d = *++p;
  1635. }
  1636. else {
  1637. *error = ERROR_OVERFLOW;
  1638. return 0;
  1639. }
  1640. }
  1641. }
  1642. }
  1643. else {
  1644. // If number is less than pre_max, at least one more digit
  1645. // can be processed without overflowing.
  1646. int64_t pre_max = int_max / 10;
  1647. int dig_pre_max = int_max % 10;
  1648. //printf("pre_max = %lld dig_pre_max = %d\n", pre_max, dig_pre_max);
  1649. // Process the digits.
  1650. d = *p;
  1651. if (tsep != '\0') {
  1652. while (1) {
  1653. if (d == tsep) {
  1654. d = *++p;
  1655. continue;
  1656. } else if (!isdigit(d)) {
  1657. break;
  1658. }
  1659. if ((number < pre_max) ||
  1660. ((number == pre_max) && (d - '0' <= dig_pre_max))) {
  1661. number = number * 10 + (d - '0');
  1662. d = *++p;
  1663. }
  1664. else {
  1665. *error = ERROR_OVERFLOW;
  1666. return 0;
  1667. }
  1668. }
  1669. } else {
  1670. while (isdigit(d)) {
  1671. if ((number < pre_max) ||
  1672. ((number == pre_max) && (d - '0' <= dig_pre_max))) {
  1673. number = number * 10 + (d - '0');
  1674. d = *++p;
  1675. }
  1676. else {
  1677. *error = ERROR_OVERFLOW;
  1678. return 0;
  1679. }
  1680. }
  1681. }
  1682. }
  1683. // Skip trailing spaces.
  1684. while (isspace(*p)) {
  1685. ++p;
  1686. }
  1687. // Did we use up all the characters?
  1688. if (*p) {
  1689. *error = ERROR_INVALID_CHARS;
  1690. return 0;
  1691. }
  1692. *error = 0;
  1693. return number;
  1694. }
  1695. uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error)
  1696. {
  1697. int d, dig_pre_max;
  1698. uint64_t pre_max;
  1699. const char *p = (const char *) p_item;
  1700. uint64_t number = 0;
  1701. // Skip leading spaces.
  1702. while (isspace(*p)) {
  1703. ++p;
  1704. }
  1705. // Handle sign.
  1706. if (*p == '-') {
  1707. *error = ERROR_MINUS_SIGN;
  1708. return 0;
  1709. }
  1710. if (*p == '+') {
  1711. p++;
  1712. }
  1713. // Check that there is a first digit.
  1714. if (!isdigit(*p)) {
  1715. // Error...
  1716. *error = ERROR_NO_DIGITS;
  1717. return 0;
  1718. }
  1719. // If number is less than pre_max, at least one more digit
  1720. // can be processed without overflowing.
  1721. pre_max = uint_max / 10;
  1722. dig_pre_max = uint_max % 10;
  1723. // Process the digits.
  1724. d = *p;
  1725. while (isdigit(d)) {
  1726. if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) {
  1727. number = number * 10 + (d - '0');
  1728. d = *++p;
  1729. }
  1730. else {
  1731. *error = ERROR_OVERFLOW;
  1732. return 0;
  1733. }
  1734. }
  1735. // Skip trailing spaces.
  1736. while (isspace(*p)) {
  1737. ++p;
  1738. }
  1739. // Did we use up all the characters?
  1740. if (*p) {
  1741. *error = ERROR_INVALID_CHARS;
  1742. return 0;
  1743. }
  1744. *error = 0;
  1745. return number;
  1746. }