PageRenderTime 58ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 1ms

/hphp/runtime/base/zend-string.cpp

http://github.com/facebook/hiphop-php
C++ | 2593 lines | 2132 code | 186 blank | 275 comment | 387 complexity | 107f089f48b874e6083c112535fa7315 MD5 | raw file
Possible License(s): LGPL-2.1, BSD-2-Clause, BSD-3-Clause, MPL-2.0-no-copyleft-exception, MIT, LGPL-2.0, Apache-2.0

Large files are truncated, but you can click here to view the full file

  1. /*
  2. +----------------------------------------------------------------------+
  3. | HipHop for PHP |
  4. +----------------------------------------------------------------------+
  5. | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
  6. | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
  7. +----------------------------------------------------------------------+
  8. | This source file is subject to version 2.00 of the Zend license, |
  9. | that is bundled with this package in the file LICENSE, and is |
  10. | available through the world-wide-web at the following url: |
  11. | http://www.zend.com/license/2_00.txt. |
  12. | If you did not receive a copy of the Zend license and are unable to |
  13. | obtain it through the world-wide-web, please send a note to |
  14. | license@zend.com so we can mail you a copy immediately. |
  15. +----------------------------------------------------------------------+
  16. */
  17. // NOTE: See also "hphp/zend/zend-string.*".
  18. #include "hphp/runtime/base/zend-string.h"
  19. #include "hphp/runtime/base/zend-printf.h"
  20. #include "hphp/util/lock.h"
  21. #include "hphp/util/overflow.h"
  22. #include "hphp/zend/zend-math.h"
  23. #include <algorithm>
  24. #include <cmath>
  25. #ifndef _MSC_VER
  26. #include <monetary.h>
  27. #endif
  28. #include "hphp/util/bstring.h"
  29. #include "hphp/runtime/base/exceptions.h"
  30. #include "hphp/runtime/base/string-buffer.h"
  31. #include "hphp/runtime/base/runtime-error.h"
  32. #include "hphp/runtime/base/string-util.h"
  33. #include "hphp/runtime/base/builtin-functions.h"
  34. #include <folly/portability/String.h>
  35. #define PHP_QPRINT_MAXL 75
  36. namespace HPHP {
  37. ///////////////////////////////////////////////////////////////////////////////
  38. // helpers
  39. void string_charmask(const char *sinput, int len, char *mask) {
  40. const unsigned char *input = (unsigned char *)sinput;
  41. const unsigned char *end;
  42. unsigned char c;
  43. memset(mask, 0, 256);
  44. for (end = input+len; input < end; input++) {
  45. c=*input;
  46. if ((input+3 < end) && input[1] == '.' && input[2] == '.'
  47. && input[3] >= c) {
  48. memset(mask+c, 1, input[3] - c + 1);
  49. input+=3;
  50. } else if ((input+1 < end) && input[0] == '.' && input[1] == '.') {
  51. /* Error, try to be as helpful as possible:
  52. (a range ending/starting with '.' won't be captured here) */
  53. if (end-len >= input) { /* there was no 'left' char */
  54. raise_invalid_argument_warning
  55. ("charlist: Invalid '..'-range, missing left of '..'");
  56. continue;
  57. }
  58. if (input+2 >= end) { /* there is no 'right' char */
  59. raise_invalid_argument_warning
  60. ("charlist: Invalid '..'-range, missing right of '..'");
  61. continue;
  62. }
  63. if (input[-1] > input[2]) { /* wrong order */
  64. raise_invalid_argument_warning
  65. ("charlist: '..'-range needs to be incrementing");
  66. continue;
  67. }
  68. /* FIXME: better error (a..b..c is the only left possibility?) */
  69. raise_invalid_argument_warning("charlist: Invalid '..'-range");
  70. continue;
  71. } else {
  72. mask[c]=1;
  73. }
  74. }
  75. }
  76. ///////////////////////////////////////////////////////////////////////////////
  77. void string_to_case(String& s, int (*tocase)(int)) {
  78. assertx(!s.isNull());
  79. assertx(tocase);
  80. auto data = s.mutableData();
  81. auto len = s.size();
  82. for (int i = 0; i < len; i++) {
  83. data[i] = tocase(data[i]);
  84. }
  85. }
  86. ///////////////////////////////////////////////////////////////////////////////
  87. #define STR_PAD_LEFT 0
  88. #define STR_PAD_RIGHT 1
  89. #define STR_PAD_BOTH 2
  90. String string_pad(const char *input, int len, int pad_length,
  91. const char *pad_string, int pad_str_len,
  92. int pad_type) {
  93. assertx(input);
  94. int num_pad_chars = pad_length - len;
  95. /* If resulting string turns out to be shorter than input string,
  96. we simply copy the input and return. */
  97. if (pad_length < 0 || num_pad_chars < 0) {
  98. return String(input, len, CopyString);
  99. }
  100. /* Setup the padding string values if specified. */
  101. if (pad_str_len == 0) {
  102. SystemLib::throwRuntimeExceptionObject(
  103. "Invalid argument: pad_string: (empty)");
  104. }
  105. String ret(pad_length, ReserveString);
  106. char *result = ret.mutableData();
  107. /* We need to figure out the left/right padding lengths. */
  108. int left_pad, right_pad;
  109. switch (pad_type) {
  110. case STR_PAD_RIGHT:
  111. left_pad = 0;
  112. right_pad = num_pad_chars;
  113. break;
  114. case STR_PAD_LEFT:
  115. left_pad = num_pad_chars;
  116. right_pad = 0;
  117. break;
  118. case STR_PAD_BOTH:
  119. left_pad = num_pad_chars / 2;
  120. right_pad = num_pad_chars - left_pad;
  121. break;
  122. default:
  123. SystemLib::throwRuntimeExceptionObject(
  124. folly::sformat("Invalid argument: pad_type: {}", pad_type));
  125. }
  126. /* First we pad on the left. */
  127. int result_len = 0;
  128. for (int i = 0; i < left_pad; i++) {
  129. result[result_len++] = pad_string[i % pad_str_len];
  130. }
  131. /* Then we copy the input string. */
  132. memcpy(result + result_len, input, len);
  133. result_len += len;
  134. /* Finally, we pad on the right. */
  135. for (int i = 0; i < right_pad; i++) {
  136. result[result_len++] = pad_string[i % pad_str_len];
  137. }
  138. ret.setSize(result_len);
  139. return ret;
  140. }
  141. ///////////////////////////////////////////////////////////////////////////////
  142. int string_find(const char *input, int len, char ch, int pos,
  143. bool case_sensitive) {
  144. assertx(input);
  145. if (pos < 0 || pos > len) {
  146. return -1;
  147. }
  148. const void *ptr;
  149. if (case_sensitive) {
  150. ptr = memchr(input + pos, ch, len - pos);
  151. } else {
  152. ptr = bstrcasechr(input + pos, ch, len - pos);
  153. }
  154. if (ptr != nullptr) {
  155. return (int)((const char *)ptr - input);
  156. }
  157. return -1;
  158. }
  159. int string_rfind(const char *input, int len, char ch, int pos,
  160. bool case_sensitive) {
  161. assertx(input);
  162. if (pos < -len || pos > len) {
  163. return -1;
  164. }
  165. const void *ptr;
  166. if (case_sensitive) {
  167. if (pos >= 0) {
  168. ptr = memrchr(input + pos, ch, len - pos);
  169. } else {
  170. ptr = memrchr(input, ch, len + pos + 1);
  171. }
  172. } else {
  173. if (pos >= 0) {
  174. ptr = bstrrcasechr(input + pos, ch, len - pos);
  175. } else {
  176. ptr = bstrrcasechr(input, ch, len + pos + 1);
  177. }
  178. }
  179. if (ptr != nullptr) {
  180. return (int)((const char *)ptr - input);
  181. }
  182. return -1;
  183. }
  184. int string_find(const char *input, int len, const char *s, int s_len,
  185. int pos, bool case_sensitive) {
  186. assertx(input);
  187. assertx(s);
  188. if (!s_len || pos < 0 || pos > len) {
  189. return -1;
  190. }
  191. void *ptr;
  192. if (case_sensitive) {
  193. ptr = (void*)string_memnstr(input + pos, s, s_len, input + len);
  194. } else {
  195. ptr = bstrcasestr(input + pos, len - pos, s, s_len);
  196. }
  197. if (ptr != nullptr) {
  198. return (int)((const char *)ptr - input);
  199. }
  200. return -1;
  201. }
  202. int string_rfind(const char *input, int len, const char *s, int s_len,
  203. int pos, bool case_sensitive) {
  204. assertx(input);
  205. assertx(s);
  206. if (!s_len || pos < -len || pos > len) {
  207. return -1;
  208. }
  209. void *ptr;
  210. if (case_sensitive) {
  211. if (pos >= 0) {
  212. ptr = bstrrstr(input + pos, len - pos, s, s_len);
  213. } else {
  214. ptr = bstrrstr(input, len + std::min(pos + s_len, 0), s, s_len);
  215. }
  216. } else {
  217. if (pos >= 0) {
  218. ptr = bstrrcasestr(input + pos, len - pos, s, s_len);
  219. } else {
  220. ptr = bstrrcasestr(input, len + std::min(pos + s_len, 0), s, s_len);
  221. }
  222. }
  223. if (ptr != nullptr) {
  224. return (int)((const char *)ptr - input);
  225. }
  226. return -1;
  227. }
  /**
   * Locates the first occurrence of `needle` inside [haystack, end).
   *
   * Candidate positions are found with memchr() on the needle's first byte;
   * the last byte is compared before the remaining bytes go through memcmp().
   *
   * An empty (or negative-length) needle and a needle longer than the
   * haystack both yield nullptr. These cases are rejected up front so that
   * neither `needle[needle_len - 1]` nor `end - needle_len` can reach outside
   * the supplied buffers.
   *
   * @param haystack    start of the buffer to search
   * @param needle      bytes to look for
   * @param needle_len  number of bytes in `needle`
   * @param end         one past the last byte of the haystack
   * @return pointer to the first match, or nullptr when there is none
   */
  const char *string_memnstr(const char *haystack, const char *needle,
                             int needle_len, const char *end) {
    if (needle_len <= 0 || end - haystack < needle_len) {
      return nullptr;
    }

    const char first = needle[0];
    const char last = needle[needle_len - 1];
    // Last position at which a full needle still fits.
    const char *const last_start = end - needle_len;

    for (const char *p = haystack; p <= last_start; ++p) {
      p = static_cast<const char *>(memchr(p, first, last_start - p + 1));
      if (p == nullptr) {
        return nullptr;
      }
      if (p[needle_len - 1] == last && !memcmp(needle, p, needle_len - 1)) {
        return p;
      }
    }
    return nullptr;
  }
  246. String string_replace(const char *s, int len, int start, int length,
  247. const char *replacement, int len_repl) {
  248. assertx(s);
  249. assertx(replacement);
  250. assertx(len >= 0);
  251. // if "start" position is negative, count start position from the end
  252. // of the string
  253. if (start < 0) {
  254. start = len + start;
  255. if (start < 0) {
  256. start = 0;
  257. }
  258. }
  259. if (start > len) {
  260. start = len;
  261. }
  262. // if "length" position is negative, set it to the length
  263. // needed to stop that many chars from the end of the string
  264. if (length < 0) {
  265. length = (len - start) + length;
  266. if (length < 0) {
  267. length = 0;
  268. }
  269. }
  270. // check if length is too large
  271. if (length > len) {
  272. length = len;
  273. }
  274. // check if the length is too large adjusting for non-zero start
  275. // Write this way instead of start + length > len to avoid overflow
  276. if (length > len - start) {
  277. length = len - start;
  278. }
  279. String retString(len + len_repl - length, ReserveString);
  280. char *ret = retString.mutableData();
  281. int ret_len = 0;
  282. if (start) {
  283. memcpy(ret, s, start);
  284. ret_len += start;
  285. }
  286. if (len_repl) {
  287. memcpy(ret + ret_len, replacement, len_repl);
  288. ret_len += len_repl;
  289. }
  290. len -= (start + length);
  291. if (len) {
  292. memcpy(ret + ret_len, s + start + length, len);
  293. ret_len += len;
  294. }
  295. retString.setSize(ret_len);
  296. return retString;
  297. }
  298. String string_replace(const char *input, int len,
  299. const char *search, int len_search,
  300. const char *replacement, int len_replace,
  301. int &count, bool case_sensitive) {
  302. assertx(input);
  303. assertx(search && len_search);
  304. assertx(len >= 0);
  305. assertx(len_search >= 0);
  306. assertx(len_replace >= 0);
  307. if (len == 0) {
  308. return String();
  309. }
  310. req::vector<int> founds;
  311. founds.reserve(16);
  312. if (len_search == 1) {
  313. for (int pos = string_find(input, len, *search, 0, case_sensitive);
  314. pos >= 0;
  315. pos = string_find(input, len, *search, pos + len_search,
  316. case_sensitive)) {
  317. founds.push_back(pos);
  318. }
  319. } else {
  320. for (int pos = string_find(input, len, search, len_search, 0,
  321. case_sensitive);
  322. pos >= 0;
  323. pos = string_find(input, len, search, len_search,
  324. pos + len_search, case_sensitive)) {
  325. founds.push_back(pos);
  326. }
  327. }
  328. count = founds.size();
  329. if (count == 0) {
  330. return String(); // not found
  331. }
  332. int reserve;
  333. // Make sure the new size of the string wouldn't overflow int32_t. Don't
  334. // bother if the replacement wouldn't make the string longer.
  335. if (len_replace > len_search) {
  336. auto raise = [&] { raise_error("String too large"); };
  337. if (mul_overflow(len_replace - len_search, count)) {
  338. raise();
  339. }
  340. int diff = (len_replace - len_search) * count;
  341. if (add_overflow(len, diff)) {
  342. raise();
  343. }
  344. reserve = len + diff;
  345. } else {
  346. reserve = len + (len_replace - len_search) * count;
  347. }
  348. String retString(reserve, ReserveString);
  349. char *ret = retString.mutableData();
  350. char *p = ret;
  351. int pos = 0; // last position in input that hasn't been copied over yet
  352. int n;
  353. for (unsigned int i = 0; i < founds.size(); i++) {
  354. n = founds[i];
  355. if (n > pos) {
  356. n -= pos;
  357. memcpy(p, input, n);
  358. p += n;
  359. input += n;
  360. pos += n;
  361. }
  362. if (len_replace) {
  363. memcpy(p, replacement, len_replace);
  364. p += len_replace;
  365. }
  366. input += len_search;
  367. pos += len_search;
  368. }
  369. n = len;
  370. if (n > pos) {
  371. n -= pos;
  372. memcpy(p, input, n);
  373. p += n;
  374. }
  375. retString.setSize(p - ret);
  376. return retString;
  377. }
  378. ///////////////////////////////////////////////////////////////////////////////
  379. String string_chunk_split(const char *src, int srclen, const char *end,
  380. int endlen, int chunklen) {
  381. int chunks = srclen / chunklen; // complete chunks!
  382. int restlen = srclen - chunks * chunklen; /* srclen % chunklen */
  383. String ret(
  384. safe_address(
  385. chunks + 1,
  386. endlen,
  387. srclen
  388. ),
  389. ReserveString
  390. );
  391. char *dest = ret.mutableData();
  392. const char *p; char *q;
  393. const char *pMax = src + srclen - chunklen + 1;
  394. for (p = src, q = dest; p < pMax; ) {
  395. memcpy(q, p, chunklen);
  396. q += chunklen;
  397. memcpy(q, end, endlen);
  398. q += endlen;
  399. p += chunklen;
  400. }
  401. if (restlen) {
  402. memcpy(q, p, restlen);
  403. q += restlen;
  404. memcpy(q, end, endlen);
  405. q += endlen;
  406. }
  407. ret.setSize(q - dest);
  408. return ret;
  409. }
  410. ///////////////////////////////////////////////////////////////////////////////
  // Initial capacity (not counting the terminating NUL) of the scratch buffer
  // in which string_strip_tags() collects the tag currently being scanned; the
  // buffer is grown by this same amount whenever it fills up.
  #define PHP_TAG_BUF_SIZE 1023
  412. /**
  413. * Check if tag is in a set of tags
  414. *
  415. * states:
  416. *
  417. * 0 start tag
  418. * 1 first non-whitespace char seen
  419. */
  420. static int string_tag_find(const char *tag, int len, const char *set) {
  421. char c, *n;
  422. const char *t;
  423. int state=0, done=0;
  424. char *norm;
  425. if (len <= 0) {
  426. return 0;
  427. }
  428. norm = (char *)req::malloc_noptrs(len+1);
  429. SCOPE_EXIT { req::free(norm); };
  430. n = norm;
  431. t = tag;
  432. c = tolower(*t);
  433. /*
  434. normalize the tag removing leading and trailing whitespace
  435. and turn any <a whatever...> into just <a> and any </tag>
  436. into <tag>
  437. */
  438. while (!done) {
  439. switch (c) {
  440. case '<':
  441. *(n++) = c;
  442. break;
  443. case '>':
  444. done =1;
  445. break;
  446. default:
  447. if (!isspace((int)c)) {
  448. if (state == 0) {
  449. state=1;
  450. }
  451. if (c != '/') {
  452. *(n++) = c;
  453. }
  454. } else {
  455. if (state == 1)
  456. done=1;
  457. }
  458. break;
  459. }
  460. c = tolower(*(++t));
  461. }
  462. *(n++) = '>';
  463. *n = '\0';
  464. if (strstr(set, norm)) {
  465. done=1;
  466. } else {
  467. done=0;
  468. }
  469. return done;
  470. }
  /**
   * A simple little state-machine to strip out html and php tags
   *
   * State 0 is the output state, State 1 means we are inside a
   * normal html tag and state 2 means we are inside a php tag.
   * State 3 is entered on "<!" and state 4 on "<!--"; state 4 is left
   * again only at a '>' preceded by "--".
   *
   * lc holds the last significant character read and br is a bracket
   * counter.
   *
   * When an allow string is passed in we keep track of the string
   * in state 1 and when the tag is closed check it against the
   * allow string to see if we should allow it.
   * swm: Added ability to strip <?xml tags without assuming it PHP
   * code.
   *
   * @param s                 input text (must be non-null); `len` bytes are
   *                          processed, and the byte after the current one is
   *                          inspected when a '<' is seen
   * @param len               number of bytes of `s` to process
   * @param allow             list of tags to keep (must be non-null); it is
   *                          lower-cased up to its NUL terminator
   * @param allow_len         length of `allow`; 0 disables the allow list
   * @param allow_tag_spaces  when false, a '<' directly followed by whitespace
   *                          is treated as ordinary text
   * @return the text with tags removed, except tags found in `allow`
   */
  String string_strip_tags(const char *s, const int len,
                           const char *allow, const int allow_len,
                           bool allow_tag_spaces) {
    const char *abuf, *p;
    char *rbuf, *tbuf, *tp, *rp, c, lc;
    int br, i=0, depth=0, in_q = 0;
    int state = 0, pos;
    assertx(s);
    assertx(allow);
    // The result starts as a copy of the input and is overwritten in place;
    // rp is the write cursor into it.
    String retString(s, len, CopyString);
    rbuf = retString.mutableData();
    String allowString;
    c = *s;
    lc = '\0';
    p = s;
    rp = rbuf;
    br = 0;
    if (allow_len) {
      assertx(allow);
      // abuf: lower-cased copy of the allow list, later searched by
      // string_tag_find(). tbuf/tp: buffer and write cursor for the tag
      // currently being collected.
      allowString = String(allow_len, ReserveString);
      char *atmp = allowString.mutableData();
      for (const char *tmp = allow; *tmp; tmp++, atmp++) {
        *atmp = tolower((int)*(const unsigned char *)tmp);
      }
      allowString.setSize(allow_len);
      abuf = allowString.data();
      tbuf = (char *)req::malloc_noptrs(PHP_TAG_BUF_SIZE+1);
      tp = tbuf;
    } else {
      abuf = nullptr;
      tbuf = tp = nullptr;
    }
    // Grows the tag buffer by PHP_TAG_BUF_SIZE bytes once the cursor has
    // reached that many bytes, keeping tp at the same offset. Called before
    // every write through tp.
    auto move = [&pos, &tbuf, &tp]() {
      if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
        pos = tp - tbuf;
        tbuf = (char*)req::realloc_noptrs(tbuf,
                                          (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
        tp = tbuf + pos;
      }
    };
    // Main scan: c is the current character, p points at it, i counts the
    // characters consumed so far.
    while (i < len) {
      switch (c) {
        case '\0':
          break;
        case '<':
          if (isspace(*(p + 1)) && !allow_tag_spaces) {
            goto reg_char;
          }
          if (state == 0) {
            lc = '<';
            state = 1;
            if (allow_len) {
              move();
              *(tp++) = '<';
            }
          } else if (state == 1) {
            // A '<' inside a tag: remember it so the matching '>' does not
            // close the outer tag.
            depth++;
          }
          break;
        case '(':
          if (state == 2) {
            // Inside PHP code, count brackets unless a quote is open.
            if (lc != '"' && lc != '\'') {
              lc = '(';
              br++;
            }
          } else if (allow_len && state == 1) {
            move();
            *(tp++) = c;
          } else if (state == 0) {
            *(rp++) = c;
          }
          break;
        case ')':
          if (state == 2) {
            if (lc != '"' && lc != '\'') {
              lc = ')';
              br--;
            }
          } else if (allow_len && state == 1) {
            move();
            *(tp++) = c;
          } else if (state == 0) {
            *(rp++) = c;
          }
          break;
        case '>':
          if (depth) {
            depth--;
            break;
          }
          if (in_q) {
            // A '>' inside a quoted attribute value does not end the tag.
            break;
          }
          switch (state) {
            case 1: /* HTML/XML */
              lc = '>';
              in_q = state = 0;
              if (allow_len) {
                // Terminate the collected tag and emit it only if it is on
                // the allow list.
                move();
                *(tp++) = '>';
                *tp='\0';
                if (string_tag_find(tbuf, tp-tbuf, abuf)) {
                  memcpy(rp, tbuf, tp-tbuf);
                  rp += tp-tbuf;
                }
                tp = tbuf;
              }
              break;
            case 2: /* PHP */
              // Only "?>" outside brackets and double quotes ends PHP mode.
              if (!br && lc != '\"' && *(p-1) == '?') {
                in_q = state = 0;
                tp = tbuf;
              }
              break;
            case 3:
              in_q = state = 0;
              tp = tbuf;
              break;
            case 4: /* JavaScript/CSS/etc... */
              // State 4 ends only at "-->".
              if (p >= s + 2 && *(p-1) == '-' && *(p-2) == '-') {
                in_q = state = 0;
                tp = tbuf;
              }
              break;
            default:
              *(rp++) = c;
              break;
          }
          break;
        case '"':
        case '\'':
          if (state == 4) {
            /* Inside <!-- comment --> */
            break;
          } else if (state == 2 && *(p-1) != '\\') {
            if (lc == c) {
              lc = '\0';
            } else if (lc != '\\') {
              lc = c;
            }
          } else if (state == 0) {
            *(rp++) = c;
          } else if (allow_len && state == 1) {
            move();
            *(tp++) = c;
          }
          // Track the open quote (in_q) while inside any tag: an unescaped
          // quote opens it, and the same quote character closes it.
          if (state && p != s && *(p-1) != '\\' && (!in_q || *p == in_q)) {
            if (in_q) {
              in_q = 0;
            } else {
              in_q = *p;
            }
          }
          break;
        case '!':
          /* JavaScript & Other HTML scripting languages */
          if (state == 1 && *(p-1) == '<') {
            state = 3;
            lc = c;
          } else {
            if (state == 0) {
              *(rp++) = c;
            } else if (allow_len && state == 1) {
              move();
              *(tp++) = c;
            }
          }
          break;
        case '-':
          // "<!--" switches from state 3 to state 4.
          if (state == 3 && p >= s + 2 && *(p-1) == '-' && *(p-2) == '!') {
            state = 4;
          } else {
            goto reg_char;
          }
          break;
        case '?':
          if (state == 1 && *(p-1) == '<') {
            br=0;
            state=2;
            break;
          }
          // Otherwise control continues into the 'E'/'e' case below.
        case 'E':
        case 'e':
          /* !DOCTYPE exception */
          if (state==3 && p > s+6
              && tolower(*(p-1)) == 'p'
              && tolower(*(p-2)) == 'y'
              && tolower(*(p-3)) == 't'
              && tolower(*(p-4)) == 'c'
              && tolower(*(p-5)) == 'o'
              && tolower(*(p-6)) == 'd') {
            state = 1;
            break;
          }
          /* fall-through */
        case 'l':
          /* swm: If we encounter '<?xml' then we shouldn't be in
           * state == 2 (PHP). Switch back to HTML.
           */
          if (state == 2 && p > s+2 && *(p-1) == 'm' && *(p-2) == 'x') {
            state = 1;
            break;
          }
          /* fall-through */
        default:
  reg_char:
          // Ordinary character: copy to the output in state 0, or to the tag
          // buffer while collecting a tag for the allow list.
          if (state == 0) {
            *(rp++) = c;
          } else if (allow_len && state == 1) {
            move();
            *(tp++) = c;
          }
          break;
      }
      c = *(++p);
      i++;
    }
    if (rp < rbuf + len) {
      *rp = '\0';
    }
    if (allow_len) {
      req::free(tbuf);
    }
    retString.setSize(rp - rbuf);
    return retString;
  }
  715. ///////////////////////////////////////////////////////////////////////////////
  /**
   * Converts one hexadecimal digit character to its numeric value.
   *
   * Uses explicit ASCII range checks rather than isdigit(). Callers pass
   * plain `char` values widened to int, which can be negative, and calling a
   * <cctype> classifier with a negative value other than EOF is undefined
   * behaviour.
   *
   * @param c  character code to convert
   * @return 0..15 for '0'-'9', 'A'-'F' and 'a'-'f'; -1 for anything else
   */
  static char string_hex2int(int c) {
    if (c >= '0' && c <= '9') {
      return c - '0';
    }
    if (c >= 'A' && c <= 'F') {
      return c - 'A' + 10;
    }
    if (c >= 'a' && c <= 'f') {
      return c - 'a' + 10;
    }
    return -1;
  }
  728. String string_quoted_printable_encode(const char *input, int len) {
  729. size_t length = len;
  730. const unsigned char *str = (unsigned char*)input;
  731. unsigned long lp = 0;
  732. unsigned char c;
  733. char *d, *buffer;
  734. char *hex = "0123456789ABCDEF";
  735. String ret(
  736. safe_address(
  737. 3,
  738. length + ((safe_address(3, length, 0)/(PHP_QPRINT_MAXL-9)) + 1),
  739. 1),
  740. ReserveString
  741. );
  742. d = buffer = ret.mutableData();
  743. while (length--) {
  744. if (((c = *str++) == '\015') && (*str == '\012') && length > 0) {
  745. *d++ = '\015';
  746. *d++ = *str++;
  747. length--;
  748. lp = 0;
  749. } else {
  750. if (iscntrl (c) || (c == 0x7f) || (c & 0x80) ||
  751. (c == '=') || ((c == ' ') && (*str == '\015'))) {
  752. if ((((lp+= 3) > PHP_QPRINT_MAXL) && (c <= 0x7f))
  753. || ((c > 0x7f) && (c <= 0xdf) && ((lp + 3) > PHP_QPRINT_MAXL))
  754. || ((c > 0xdf) && (c <= 0xef) && ((lp + 6) > PHP_QPRINT_MAXL))
  755. || ((c > 0xef) && (c <= 0xf4) && ((lp + 9) > PHP_QPRINT_MAXL))) {
  756. *d++ = '=';
  757. *d++ = '\015';
  758. *d++ = '\012';
  759. lp = 3;
  760. }
  761. *d++ = '=';
  762. *d++ = hex[c >> 4];
  763. *d++ = hex[c & 0xf];
  764. } else {
  765. if ((++lp) > PHP_QPRINT_MAXL) {
  766. *d++ = '=';
  767. *d++ = '\015';
  768. *d++ = '\012';
  769. lp = 1;
  770. }
  771. *d++ = c;
  772. }
  773. }
  774. }
  775. len = d - buffer;
  776. ret.setSize(len);
  777. return ret;
  778. }
  779. String string_quoted_printable_decode(const char *input, int len, bool is_q) {
  780. assertx(input);
  781. if (len == 0) {
  782. return String();
  783. }
  784. int i = 0, j = 0, k;
  785. const char *str_in = input;
  786. String ret(len, ReserveString);
  787. char *str_out = ret.mutableData();
  788. while (i < len && str_in[i]) {
  789. switch (str_in[i]) {
  790. case '=':
  791. if (i + 2 < len && str_in[i + 1] && str_in[i + 2] &&
  792. isxdigit((int) str_in[i + 1]) && isxdigit((int) str_in[i + 2]))
  793. {
  794. str_out[j++] = (string_hex2int((int) str_in[i + 1]) << 4)
  795. + string_hex2int((int) str_in[i + 2]);
  796. i += 3;
  797. } else /* check for soft line break according to RFC 2045*/ {
  798. k = 1;
  799. while (str_in[i + k] &&
  800. ((str_in[i + k] == 32) || (str_in[i + k] == 9))) {
  801. /* Possibly, skip spaces/tabs at the end of line */
  802. k++;
  803. }
  804. if (!str_in[i + k]) {
  805. /* End of line reached */
  806. i += k;
  807. }
  808. else if ((str_in[i + k] == 13) && (str_in[i + k + 1] == 10)) {
  809. /* CRLF */
  810. i += k + 2;
  811. }
  812. else if ((str_in[i + k] == 13) || (str_in[i + k] == 10)) {
  813. /* CR or LF */
  814. i += k + 1;
  815. }
  816. else {
  817. str_out[j++] = str_in[i++];
  818. }
  819. }
  820. break;
  821. case '_':
  822. if (is_q) {
  823. str_out[j++] = ' ';
  824. i++;
  825. } else {
  826. str_out[j++] = str_in[i++];
  827. }
  828. break;
  829. default:
  830. str_out[j++] = str_in[i++];
  831. }
  832. }
  833. ret.setSize(j);
  834. return ret;
  835. }
  836. Variant string_base_to_numeric(const char *s, int len, int base) {
  837. int64_t num = 0;
  838. double fnum = 0;
  839. int mode = 0;
  840. int64_t cutoff;
  841. int cutlim;
  842. assertx(string_validate_base(base));
  843. cutoff = LONG_MAX / base;
  844. cutlim = LONG_MAX % base;
  845. for (int i = len; i > 0; i--) {
  846. char c = *s++;
  847. /* might not work for EBCDIC */
  848. if (c >= '0' && c <= '9')
  849. c -= '0';
  850. else if (c >= 'A' && c <= 'Z')
  851. c -= 'A' - 10;
  852. else if (c >= 'a' && c <= 'z')
  853. c -= 'a' - 10;
  854. else
  855. continue;
  856. if (c >= base)
  857. continue;
  858. switch (mode) {
  859. case 0: /* Integer */
  860. if (num < cutoff || (num == cutoff && c <= cutlim)) {
  861. num = num * base + c;
  862. break;
  863. } else {
  864. fnum = num;
  865. mode = 1;
  866. }
  867. /* fall-through */
  868. case 1: /* Float */
  869. fnum = fnum * base + c;
  870. }
  871. }
  872. if (mode == 1) {
  873. return fnum;
  874. }
  875. return num;
  876. }
  877. String string_long_to_base(unsigned long value, int base) {
  878. static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
  879. char buf[(sizeof(unsigned long) << 3) + 1];
  880. char *ptr, *end;
  881. assertx(string_validate_base(base));
  882. end = ptr = buf + sizeof(buf) - 1;
  883. do {
  884. *--ptr = digits[value % base];
  885. value /= base;
  886. } while (ptr > buf && value);
  887. return String(ptr, end - ptr, CopyString);
  888. }
  889. String string_numeric_to_base(const Variant& value, int base) {
  890. static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
  891. assertx(string_validate_base(base));
  892. if ((!value.isInteger() && !value.isDouble())) {
  893. return empty_string();
  894. }
  895. if (value.isDouble()) {
  896. double fvalue = floor(value.toDouble()); /* floor it just in case */
  897. char *ptr, *end;
  898. char buf[(sizeof(double) << 3) + 1];
  899. /* Don't try to convert +/- infinity */
  900. if (fvalue == HUGE_VAL || fvalue == -HUGE_VAL) {
  901. raise_warning("Number too large");
  902. return empty_string();
  903. }
  904. end = ptr = buf + sizeof(buf) - 1;
  905. do {
  906. *--ptr = digits[(int) fmod(fvalue, base)];
  907. fvalue /= base;
  908. } while (ptr > buf && fabs(fvalue) >= 1);
  909. return String(ptr, end - ptr, CopyString);
  910. }
  911. return string_long_to_base(value.toInt64(), base);
  912. }
  913. ///////////////////////////////////////////////////////////////////////////////
  914. // uuencode
  // Maps a 6-bit value to its uuencode character: 0 becomes '`', anything
  // else is offset by ' '. The argument is evaluated more than once.
  #define PHP_UU_ENC(c) \
    ((c) ? ((c) & 077) + ' ' : '`')
  // Second output character of a 3-byte group starting at pointer c:
  // low 2 bits of byte 0 combined with the high 4 bits of byte 1.
  #define PHP_UU_ENC_C2(c) \
    PHP_UU_ENC(((*(c) * 16) & 060) | ((*((c) + 1) >> 4) & 017))
  // Third output character of a 3-byte group starting at pointer c:
  // low 4 bits of byte 1 combined with the high 2 bits of byte 2.
  // The argument is fully parenthesized so that any pointer expression works.
  #define PHP_UU_ENC_C3(c) \
    PHP_UU_ENC(((*((c) + 1) * 4) & 074) | ((*((c) + 2) >> 6) & 03))
  // Maps a uuencode character back to its 6-bit value.
  #define PHP_UU_DEC(c) \
    (((c) - ' ') & 077)
  923. String string_uuencode(const char *src, int src_len) {
  924. assertx(src);
  925. assertx(src_len);
  926. int len = 45;
  927. char *p;
  928. const char *s, *e, *ee;
  929. char *dest;
  930. /* encoded length is ~ 38% greater than the original */
  931. String ret((int)ceil(src_len * 1.38) + 45, ReserveString);
  932. p = dest = ret.mutableData();
  933. s = src;
  934. e = src + src_len;
  935. while ((s + 3) < e) {
  936. ee = s + len;
  937. if (ee > e) {
  938. ee = e;
  939. len = ee - s;
  940. if (len % 3) {
  941. ee = s + (int) (floor(len / 3) * 3);
  942. }
  943. }
  944. *p++ = PHP_UU_ENC(len);
  945. while (s < ee) {
  946. *p++ = PHP_UU_ENC(*s >> 2);
  947. *p++ = PHP_UU_ENC_C2(s);
  948. *p++ = PHP_UU_ENC_C3(s);
  949. *p++ = PHP_UU_ENC(*(s + 2) & 077);
  950. s += 3;
  951. }
  952. if (len == 45) {
  953. *p++ = '\n';
  954. }
  955. }
  956. if (s < e) {
  957. if (len == 45) {
  958. *p++ = PHP_UU_ENC(e - s);
  959. len = 0;
  960. }
  961. *p++ = PHP_UU_ENC(*s >> 2);
  962. *p++ = PHP_UU_ENC_C2(s);
  963. *p++ = ((e - s) > 1) ? PHP_UU_ENC_C3(s) : PHP_UU_ENC('\0');
  964. *p++ = ((e - s) > 2) ? PHP_UU_ENC(*(s + 2) & 077) : PHP_UU_ENC('\0');
  965. }
  966. if (len < 45) {
  967. *p++ = '\n';
  968. }
  969. *p++ = PHP_UU_ENC('\0');
  970. *p++ = '\n';
  971. *p = '\0';
  972. ret.setSize(p - dest);
  973. return ret;
  974. }
  /**
   * Decodes uuencoded text.
   *
   * Each input line starts with a character giving the number of decoded
   * bytes on that line, followed by groups of four characters that decode to
   * three bytes each. Decoding stops at a line whose count decodes to 0, at a
   * line shorter than 45 bytes, or at the end of the input.
   *
   * @param src      encoded text
   * @param src_len  number of bytes in `src`
   * @return the decoded bytes, or a null String if a sanity check fails
   */
  String string_uudecode(const char *src, int src_len) {
    int total_len = 0;  // sum of the per-line byte counts seen so far
    int len;            // byte count announced by the current line
    const char *s, *e, *ee;
    char *p, *dest;
    String ret(ceil(src_len * 0.75), ReserveString);
    p = dest = ret.mutableData();
    s = src;
    e = src + src_len;
    while (s < e) {
      // The first character of a line is its decoded length; 0 ends the data.
      if ((len = PHP_UU_DEC(*s++)) <= 0) {
        break;
      }
      /* sanity check */
      if (len > src_len) {
        goto err;
      }
      total_len += len;
      // ee marks the end of this line's encoded payload: 60 characters for a
      // full 45-byte line, otherwise len * 1.33 rounded down.
      ee = s + (len == 45 ? 60 : (int) floor(len * 1.33));
      /* sanity check */
      if (ee > e) {
        goto err;
      }
      // Decode four characters into three bytes at a time.
      while (s < ee) {
        if (s + 4 > e) goto err;
        *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
        *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
        *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
        s += 4;
      }
      // A short line is the last data line.
      if (len < 45) {
        break;
      }
      /* skip \n */
      s++;
    }
    // NOTE(review): '>' binds tighter than '=', so len receives the result of
    // the comparison (0 or 1) rather than total_len; the nested `len > 1` and
    // `len > 2` tests can therefore never be true. Confirm whether
    // `(len = total_len) > (p - dest)` was intended.
    if ((len = total_len > (p - dest))) {
      *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
      if (len > 1) {
        *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
        if (len > 2) {
          *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
        }
      }
    }
    // The reported size is the sum of the announced line lengths, not the
    // number of bytes written through p.
    ret.setSize(total_len);
    return ret;
  err:
    return String();
  }
  1025. ///////////////////////////////////////////////////////////////////////////////
  1026. // base64
  1027. namespace {
  // The base64 alphabet: entry i is the character for 6-bit value i. A
  // trailing NUL is stored after the 64 characters.
  const char base64_table[] = {
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', '\0'
  };
  // Character appended to fill the last group of encoded output.
  const char base64_pad = '=';
  // Inverse of base64_table, indexed by byte value: 0..63 for alphabet
  // characters, -1 for tab, LF, CR and space, and -2 for every other byte.
  // NOTE(review): presumably the decoder skips -1 entries and treats -2 as
  // invalid input; confirm against php_base64_decode.
  const short base64_reverse_table[256] = {
    -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2,
    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
    -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 63,
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2,
    -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2,
    -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2,
    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
  };
  1054. folly::Optional<int> maxEncodedSize(int length) {
  1055. if ((length + 2) < 0 || ((length + 2) / 3) >= (1 << (sizeof(int) * 8 - 2))) {
  1056. return folly::none;
  1057. }
  1058. return ((length + 2) / 3) * 4;
  1059. }
  1060. // outstr must be at least maxEncodedSize(length) bytes
  1061. size_t php_base64_encode(const unsigned char *str, int length,
  1062. unsigned char* outstr) {
  1063. const unsigned char *current = str;
  1064. unsigned char *p = outstr;
  1065. while (length > 2) { /* keep going until we have less than 24 bits */
  1066. *p++ = base64_table[current[0] >> 2];
  1067. *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
  1068. *p++ = base64_table[((current[1] & 0x0f) << 2) + (current[2] >> 6)];
  1069. *p++ = base64_table[current[2] & 0x3f];
  1070. current += 3;
  1071. length -= 3; /* we just handle 3 octets of data */
  1072. }
  1073. /* now deal with the tail end of things */
  1074. if (length != 0) {
  1075. *p++ = base64_table[current[0] >> 2];
  1076. if (length > 1) {
  1077. *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
  1078. *p++ = base64_table[(current[1] & 0x0f) << 2];
  1079. *p++ = base64_pad;
  1080. } else {
  1081. *p++ = base64_table[(current[0] & 0x03) << 4];
  1082. *p++ = base64_pad;
  1083. *p++ = base64_pad;
  1084. }
  1085. }
  1086. return p - outstr;
  1087. }
  1088. // outstr must be at least length bytes
  1089. ssize_t php_base64_decode(const char *str, int length, bool strict,
  1090. unsigned char* outstr) {
  1091. const unsigned char *current = (unsigned char*)str;
  1092. int ch, i = 0, j = 0, k;
  1093. /* this sucks for threaded environments */
  1094. unsigned char* result = outstr;
  1095. /* run through the whole string, converting as we go */
  1096. while ((ch = *current++) != '\0' && length-- > 0) {
  1097. if (ch == base64_pad) {
  1098. if (*current != '=' && ((i % 4) == 1 || (strict && length > 0))) {
  1099. if ((i % 4) != 1) {
  1100. while (isspace(*(++current))) {
  1101. continue;
  1102. }
  1103. if (*current == '\0') {
  1104. continue;
  1105. }
  1106. }
  1107. return -1;
  1108. }
  1109. continue;
  1110. }
  1111. ch = base64_reverse_table[ch];
  1112. if ((!strict && ch < 0) || ch == -1) {
  1113. /* a space or some other separator character, we simply skip over */
  1114. continue;
  1115. } else if (ch == -2) {
  1116. return -1;
  1117. }
  1118. switch(i % 4) {
  1119. case 0:
  1120. result[j] = ch << 2;
  1121. break;
  1122. case 1:
  1123. result[j++] |= ch >> 4;
  1124. result[j] = (ch & 0x0f) << 4;
  1125. break;
  1126. case 2:
  1127. result[j++] |= ch >>2;
  1128. result[j] = (ch & 0x03) << 6;
  1129. break;
  1130. case 3:
  1131. result[j++] |= ch;
  1132. break;
  1133. }
  1134. i++;
  1135. }
  1136. k = j;
  1137. /* mop things up if we ended on a boundary */
  1138. if (ch == base64_pad) {
  1139. switch(i % 4) {
  1140. case 1:
  1141. return -1;
  1142. case 2:
  1143. k++;
  1144. case 3:
  1145. result[k] = 0;
  1146. }
  1147. }
  1148. return j;
  1149. }
  1150. }
  1151. String string_base64_encode(const char* input, int len) {
  1152. if (auto const wantedSize = maxEncodedSize(len)) {
  1153. String ret(*wantedSize, ReserveString);
  1154. auto actualSize = php_base64_encode((unsigned char*)input, len,
  1155. (unsigned char*)ret.mutableData());
  1156. ret.setSize(actualSize);
  1157. return ret;
  1158. }
  1159. return String();
  1160. }
  1161. String string_base64_decode(const char* input, int len, bool strict) {
  1162. String ret(len, ReserveString);
  1163. auto actualSize = php_base64_decode(input, len, strict,
  1164. (unsigned char*)ret.mutableData());
  1165. if (actualSize < 0) return String();
  1166. ret.setSize(actualSize);
  1167. return ret;
  1168. }
  1169. std::string base64_encode(const char* input, int len) {
  1170. if (auto const wantedSize = maxEncodedSize(len)) {
  1171. std::string ret;
  1172. ret.resize(*wantedSize);
  1173. auto actualSize = php_base64_encode((unsigned char*)input, len,
  1174. (unsigned char*)ret.data());
  1175. ret.resize(actualSize);
  1176. return ret;
  1177. }
  1178. return std::string();
  1179. }
  1180. std::string base64_decode(const char* input, int len, bool strict) {
  1181. if (!len) return std::string();
  1182. std::string ret;
  1183. ret.resize(len);
  1184. auto actualSize = php_base64_decode(input, len, strict,
  1185. (unsigned char*)ret.data());
  1186. if (!actualSize) return std::string();
  1187. ret.resize(actualSize);
  1188. return ret;
  1189. }
  1190. ///////////////////////////////////////////////////////////////////////////////
  1191. String string_escape_shell_arg(const char *str) {
  1192. int x, y, l;
  1193. char *cmd;
  1194. y = 0;
  1195. l = strlen(str);
  1196. String ret(safe_address(l, 4, 3), ReserveString); /* worst case */
  1197. cmd = ret.mutableData();
  1198. #ifdef _MSC_VER
  1199. cmd[y++] = '"';
  1200. #else
  1201. cmd[y++] = '\'';
  1202. #endif
  1203. for (x = 0; x < l; x++) {
  1204. switch (str[x]) {
  1205. #ifdef _MSC_VER
  1206. case '"':
  1207. case '%':
  1208. case '!':
  1209. cmd[y++] = ' ';
  1210. break;
  1211. #else
  1212. case '\'':
  1213. cmd[y++] = '\'';
  1214. cmd[y++] = '\\';
  1215. cmd[y++] = '\'';
  1216. #endif
  1217. /* fall-through */
  1218. default:
  1219. cmd[y++] = str[x];
  1220. }
  1221. }
  1222. #ifdef _MSC_VER
  1223. if (y > 0 && '\\' == cmd[y - 1]) {
  1224. int k = 0, n = y - 1;
  1225. for (; n >= 0 && '\\' == cmd[n]; n--, k++);
  1226. if (k % 2) {
  1227. cmd[y++] = '\\';
  1228. }
  1229. }
  1230. cmd[y++] = '"';
  1231. #else
  1232. cmd[y++] = '\'';
  1233. #endif
  1234. ret.setSize(y);
  1235. return ret;
  1236. }
  1237. String string_escape_shell_cmd(const char *str) {
  1238. register int x, y, l;
  1239. char *cmd;
  1240. char *p = nullptr;
  1241. l = strlen(str);
  1242. String ret(safe_address(l, 2, 1), ReserveString);
  1243. cmd = ret.mutableData();
  1244. for (x = 0, y = 0; x < l; x++) {
  1245. switch (str[x]) {
  1246. #ifndef _MSC_VER
  1247. case '"':
  1248. case '\'':
  1249. if (!p && (p = (char *)memchr(str + x + 1, str[x], l - x - 1))) {
  1250. /* noop */
  1251. } else if (p && *p == str[x]) {
  1252. p = nullptr;
  1253. } else {
  1254. cmd[y++] = '\\';
  1255. }
  1256. cmd[y++] = str[x];
  1257. break;
  1258. #else
  1259. /* % is Windows specific for environmental variables, ^%PATH% will
  1260. output PATH while ^%PATH^% will not. escapeshellcmd->val will
  1261. escape all % and !.
  1262. */
  1263. case '%':
  1264. case '!':
  1265. case '"':
  1266. case '\'':
  1267. #endif
  1268. case '#': /* This is character-set independent */
  1269. case '&':
  1270. case ';':
  1271. case '`':
  1272. case '|':
  1273. case '*':
  1274. case '?':
  1275. case '~':
  1276. case '<':
  1277. case '>':
  1278. case '^':
  1279. case '(':
  1280. case ')':
  1281. case '[':
  1282. case ']':
  1283. case '{':
  1284. case '}':
  1285. case '$':
  1286. case '\\':
  1287. case '\x0A': /* excluding these two */
  1288. case '\xFF':
  1289. #ifdef _MSC_VER
  1290. cmd[y++] = '^';
  1291. #else
  1292. cmd[y++] = '\\';
  1293. #endif
  1294. /* fall-through */
  1295. default:
  1296. cmd[y++] = str[x];
  1297. }
  1298. }
  1299. ret.setSize(y);
  1300. return ret;
  1301. }
  1302. ///////////////////////////////////////////////////////////////////////////////
  // Finds the longest run of bytes common to txt1[0, len1) and txt2[0, len2).
  //
  // *max receives the run length (0 when nothing matches). *pos1 / *pos2
  // receive the run's start offsets in txt1 / txt2 and are left untouched
  // when *max is 0. Among equally long runs the first one met wins, scanning
  // txt1 in the outer loop and txt2 in the inner loop.
  static void string_similar_str(const char *txt1, int len1,
                                 const char *txt2, int len2,
                                 int *pos1, int *pos2, int *max) {
    *max = 0;
    for (int i = 0; i < len1; ++i) {
      for (int j = 0; j < len2; ++j) {
        // Extend the match that starts at (i, j) as far as it goes.
        int run = 0;
        while (i + run < len1 && j + run < len2 &&
               txt1[i + run] == txt2[j + run]) {
          ++run;
        }
        if (run > *max) {
          *max = run;
          *pos1 = i;
          *pos2 = j;
        }
      }
    }
  }
  1322. static int string_similar_char(const char *txt1, int len1,
  1323. const char *txt2, int len2) {
  1324. int sum;
  1325. int pos1 = 0, pos2 = 0, max;
  1326. string_similar_str(txt1, len1, txt2, len2, &pos1, &pos2, &max);
  1327. if ((sum = max)) {
  1328. if (pos1 && pos2) {
  1329. sum += string_similar_char(txt1, pos1, txt2, pos2);
  1330. }
  1331. if ((pos1 + max < len1) && (pos2 + max < len2)) {
  1332. sum += string_similar_char(txt1 + pos1 + max, len1 - pos1 - max,
  1333. txt2 + pos2 + max, len2 - pos2 - max);
  1334. }
  1335. }
  1336. return sum;
  1337. }
  1338. int string_similar_text(const char *t1, int len1,
  1339. const char *t2, int len2, double *percent) {
  1340. if (len1 == 0 && len2 == 0) {
  1341. if (percent) *percent = 0.0;
  1342. return 0;
  1343. }
  1344. int sim = string_similar_char(t1, len1, t2, len2);
  1345. if (percent) *percent = sim * 200.0 / (len1 + len2);
  1346. return sim;
  1347. }
  1348. ///////////////////////////////////////////////////////////////////////////////
  1349. #define LEVENSHTEIN_MAX_LENTH 255
  1350. // reference implementation, only optimized for memory usage, not speed
  1351. int string_levenshtein(const char *s1, int l1, const char *s2, int l2,
  1352. int cost_ins, int cost_rep, int cost_del ) {
  1353. int *p1, *p2, *tmp;
  1354. int i1, i2, c0, c1, c2;
  1355. if (l1==0) return l2*cost_ins;
  1356. if (l2==0) return l1*cost_del;
  1357. if ((l1>LEVENSHTEIN_MAX_LENTH)||(l2>LEVENSHTEIN_MAX_LENTH)) {
  1358. raise_warning("levenshtein(): Argument string(s) too long");
  1359. return -1;
  1360. }
  1361. p1 = (int*)req::malloc_noptrs((l2+1) * sizeof(int));
  1362. SCOPE_EXIT { req::free(p1); };
  1363. p2 = (int*)req::malloc_noptrs((l2+1) * sizeof(int));
  1364. SCOPE_EXIT { req::free(p2); };
  1365. for(i2=0;i2<=l2;i2++) {
  1366. p1[i2] = i2*cost_ins;
  1367. }
  1368. for(i1=0;i1<l1;i1++) {
  1369. p2[0]=p1[0]+cost_del;
  1370. for(i2=0;i2<l2;i2++) {
  1371. c0=p1[i2]+((s1[i1]==s2[i2])?0:cost_rep);
  1372. c1=p1[i2+1]+cost_del; if (c1<c0) c0=c1;
  1373. c2=p2[i2]+cost_ins; if (c2<c0) c0=c2;
  1374. p2[i2+1]=c0;
  1375. }
  1376. tmp=p1; p1=p2; p2=tmp;
  1377. }
  1378. c0=p1[l2];
  1379. return c0;
  1380. }
  1381. ///////////////////////////////////////////////////////////////////////////////
  1382. String string_money_format(const char *format, double value) {
  1383. bool check = false;
  1384. const char *p = format;
  1385. while ((p = strchr(p, '%'))) {
  1386. if (*(p + 1) == '%') {
  1387. p += 2;
  1388. } else if (!check) {
  1389. check = true;
  1390. p++;
  1391. } else {
  1392. raise_invalid_argument_warning
  1393. ("format: Only a single %%i or %%n token can be used");
  1394. return String();
  1395. }
  1396. }
  1397. int format_len = strlen(format);
  1398. int str_len = safe_address(format_len, 1, 1024);
  1399. String ret(str_len, ReserveString);
  1400. char *str = ret.mutableData();
  1401. if ((str_len = strfmon(str, str_len, format, value)) < 0) {
  1402. return String();
  1403. }
  1404. ret.setSize(str_len);
  1405. return ret;
  1406. }
  1407. ///////////////////////////////////////////////////////////////////////////////
  1408. String string_number_format(double d, int dec,
  1409. const String& dec_point,
  1410. const String& thousand_sep) {
  1411. char *tmpbuf = nullptr, *resbuf;
  1412. char *s, *t; /* source, target */
  1413. char *dp;
  1414. int integral;
  1415. int tmplen, reslen=0;
  1416. int count=0;
  1417. int is_negative=0;
  1418. if (d < 0) {
  1419. is_negative = 1;
  1420. d = -d;
  1421. }
  1422. if (dec < 0) dec = 0;
  1423. d = php_math_round(d, dec);
  1424. // departure from PHP: we got rid of dependencies on spprintf() here.
  1425. // This actually means 63 bytes for characters + 1 byte for '\0'
  1426. String tmpstr(63, ReserveString);
  1427. tmpbuf = tmpstr.mutableData();
  1428. tmplen = snprintf(tmpbuf, 64, "%.*F", dec, d);
  1429. // From the man page of snprintf, the return value is:
  1430. // The number of characters that would have been written if n had been
  1431. // sufficiently large, not counting the terminating null character.
  1432. if (tmplen < 0) return empty_string();
  1433. if (tmplen < 64 && (tmpbuf == nullptr || !isdigit((int)tmpbuf[0]))) {
  1434. tmpstr.setSize(tmplen);
  1435. return tmpstr;
  1436. }
  1437. if (tmplen >= 64) {
  1438. // Uncommon, asked for more than 64 chars worth of precision
  1439. tmpstr = String(tmplen, ReserveString);
  1440. tmpbuf = tmpstr.mutableData();
  1441. tmplen = snprintf(tmpbuf, tmplen + 1, "%.*F", dec, d);
  1442. if (tmplen < 0) return empty_string();
  1443. if (tmpbuf == nullptr || !isdigit((int)tmpbuf[0])) {
  1444. tmpstr.setSize(tmplen);
  1445. return tmpstr;
  1446. }
  1447. }
  1448. /* find decimal point, if expected */
  1449. if (dec) {
  1450. dp = strpbrk(tmpbuf, ".,");
  1451. } else {
  1452. dp = nullptr;
  1453. }
  1454. /* calculate the length of the return buffer */
  1455. if (dp) {
  1456. integral = dp - tmpbuf;
  1457. } else {
  1458. /* no decimal point was found */
  1459. integral = tmplen;
  1460. }
  1461. /* allow for thousand separators */
  1462. if (!thousand_sep.empty()) {
  1463. if (integral + thousand_sep.size() * ((integral-1) / 3) < integral) {
  1464. /* overflow */
  1465. raise_error("String overflow");
  1466. }
  1467. integral += ((integral-1) / 3) * thousand_sep.size();
  1468. }
  1469. reslen = integral;
  1470. if (dec) {
  1471. reslen += dec;
  1472. if (!dec_point.empty()) {
  1473. if (reslen + dec_point.size() < dec_point.size()) {
  1474. /* overflow */
  1475. raise_error("String overflow");
  1476. }
  1477. reslen += dec_point.size();
  1478. }
  1479. }
  1480. /* add a byte for minus sign */
  1481. if (is_negative) {
  1482. reslen++;
  1483. }
  1484. String resstr(reslen, ReserveString);
  1485. resbuf = resstr.mutableData();
  1486. s = tmpbuf+tmplen-1;
  1487. t = resbuf+reslen-1;
  1488. /* copy the decimal places.
  1489. * Take care, as the sprintf implementation may return less places than
  1490. * we requested due to internal buffer limitations */
  1491. if (dec) {
  1492. int declen = dp ? s - dp : 0;
  1493. int topad = dec > declen ? dec - declen : 0;
  1494. /* pad with '0's */
  1495. while (topad--) {
  1496. *t-- = '0';
  1497. }
  1498. if (dp) {
  1499. s -= declen + 1; /* +1 to skip the point */
  1500. t -= declen;
  1501. /* now copy the chars after the point */
  1502. memcpy(t + 1, dp + 1, declen);
  1503. }
  1504. /* add decimal point */
  1505. if (!dec_point.empty()) {
  1506. memcpy(t + (1 - dec_point.size()), dec_point.data(), dec_point.size());
  1507. t -= dec_point.size();
  1508. }
  1509. }
  1510. /* copy the numbers before the decimal point, adding thousand
  1511. * separator every three digits */
  1512. while(s >= tmpbuf) {
  1513. *t-- = *s--;
  1514. if (thousand_sep && (++count%3)==0 && s>=tmpbuf) {
  1515. memcpy(t + (1 - thousand_sep.size()),
  1516. thousand_sep.data(),
  1517. thousand_sep.size());
  1518. t -= thousand_sep.size();
  1519. }
  1520. }
  1521. /* and a minus sign, if needed */
  1522. if (is_negative) {
  1523. *t-- = '-';
  1524. }
  1525. resstr.setSize(reslen);
  1526. return resstr;
  1527. }
  1528. ///////////////////////////////////////////////////////////////////////////////
  1529. // soundex
  1530. /* Simple soundex algorithm as described by Knuth in TAOCP, vol 3 */
  1531. String string_soundex(const String& str) {
  1532. assertx(!str.empty());
  1533. int _small, code, last;
  1534. String retString(4, ReserveString);
  1535. char* soundex = retString.mutableData();
  1536. static char soundex_table[26] = {
  1537. 0, /* A */
  1538. '1', /* B */
  1539. '2', /* C */
  1540. '3', /* D */
  1541. 0, /* E */
  1542. '1', /* F */
  1543. '2', /* G */
  1544. 0, /* H */
  1545. 0, /* I */
  1546. '2', /* J */
  1547. '2', /* K */
  1548. '4', /* L */
  1549. '5', /* M */
  1550. '5', /* N */
  1551. 0, /* O */
  1552. '1', /* P */
  1553. '2', /* Q */
  1554. '6', /* R */
  1555. '2', /* S */
  1556. '3', /* T */
  1557. 0, /* U */
  1558. '1', /* V */
  1559. 0, /* W */
  1560. '2', /* X */
  1561. 0, /* Y */
  1562. '2' /* Z */
  1563. };
  1564. /* build soundex string */
  1565. last = -1;
  1566. auto p = str.slice().data();
  1567. for (_small = 0; *p && _small < 4; p++) {
  1568. /* convert chars to upper case and strip non-letter chars */
  1569. /* BUG: should also map here accented letters used in non */
  1570. /* English words or names (also found in English text!): */
  1571. /* esstsett, thorn, n-tilde, c-cedilla, s-caron, ... */
  1572. code = toupper((int)(unsigned char)(*p));
  1573. if (code >= 'A' && code <= 'Z') {
  1574. if (_small == 0) {
  1575. /* remember first valid char */
  1576. soundex[_small++] = code;
  1577. last = soundex_table[code - 'A'];
  1578. } else {
  1579. /* ignore sequences of consonants with same soundex */
  1580. /* code in trail, and vowels unless they separate */
  1581. /* consonant letters */
  1582. code = soundex_table[code - 'A'];
  1583. if (code != last) {
  1584. if (code != 0) {
  1585. soundex[_small++] = code;
  1586. }
  1587. last = code;
  1588. }
  1589. }
  1590. }
  1591. }
  1592. /* pad with '0' and terminate with 0 ;-) */
  1593. while (_small < 4) {
  1594. soundex[_small++] = '0';
  1595. }
  1596. retString.setSize(4);
  1597. return retString;
  1598. }
  1599. ///////////////////////////////////////////////////////////////////////////////
  1600. // metaphone
  /**
   * This is now the original code by Michael G Schwern:
   * I've changed it just slightly (use emalloc,
   * get rid of includes etc.)
   * - thies - 13.09.1999
   */
  1607. /*----------------------------- */
  1608. /* this used to be "metaphone.h" */
  1609. /*----------------------------- */
  /* Special encodings */
  #define SH 'X'
  #define TH '0'

  /*----------------------------- */
  /* end of "metaphone.h"          */
  /*----------------------------- */

  /*----------------------------- */
  /* this used to be "metachar.h"  */
  /*----------------------------- */

  /* Metachar.h ... little bits about characters for metaphone */

  /*-- Character encoding array & accessing macros --*/
  /* Stolen directly out of the book... */
  /* One bit-flag set per letter 'A'..'Z':
   *   1 vowel, 2 passed through unchanged, 4 affected by a following H,
   *   8 softens C and G, 16 keeps GH from becoming F */
  char _codes[26] = { 1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0};

  /* Flag lookup behind ENCODE(). The character is converted through
   * unsigned char before it reaches toupper(), so negative char values are
   * never handed to a <ctype.h> function, and the table is indexed only when
   * the upper-cased value lies in 'A'..'Z', so the index stays inside _codes
   * whatever the current locale classifies as a letter. */
  static inline int metaphone_char_flags(int c) {
    const int upper = toupper((unsigned char)c);
    return (upper >= 'A' && upper <= 'Z') ? _codes[upper - 'A'] : 0;
  }

  /* Flags of character c, or 0 when c is not an ASCII letter */
  #define ENCODE(c) (metaphone_char_flags(c))

  #define isvowel(c) (ENCODE(c) & 1) /* AEIOU */

  /* These letters are passed through unchanged */
  #define NOCHANGE(c) (ENCODE(c) & 2) /* FJLMNR */

  /* These form diphthongs when preceding H */
  #define AFFECTH(c) (ENCODE(c) & 4) /* CGPST */

  /* These make C and G soft */
  #define MAKESOFT(c) (ENCODE(c) & 8) /* EIY */

  /* These prevent GH from becoming F */
  #define NOGHTOF(c) (ENCODE(c) & 16) /* BDH */
  1633. /*----------------------------- */
  1634. /* end of "metachar.h" */
  1635. /*----------------------------- */
  /* I suppose I could have been using a character pointer instead of
   * accessing the array directly... */

  /* Every macro below expands to an expression over `word` (the input
   * characters) and `w_idx` (the index of the current character); both must
   * be visible where the macro is used. Macro parameters are parenthesized so
   * that expression arguments such as `k - 1` index as intended. */

  /* Look at the next letter in the word */
  #define Next_Letter ((char)toupper(word[w_idx+1]))
  /* Look at the current letter in the word */
  #define Curr_Letter ((char)toupper(word[w_idx]))
  /* Go N letters back; yields '\0' when that would be before the start. */
  #define Look_Back_Letter(n) \
    (w_idx >= (n) ? (char)toupper(word[w_idx-(n)]) : '\0')
  /* Previous letter. I dunno, should this return null on failure? */
  #define Prev_Letter (Look_Back_Letter(1))
  /* Look two letters down. It makes sure you don't walk off the string. */
  #define After_Next_Letter (Next_Letter != '\0' ? (char)toupper(word[w_idx+2]) \
                                                 : '\0')
  /* Letter (n) positions ahead, clamped to the terminator by Lookahead() */
  #define Look_Ahead_Letter(n) ((char)toupper(Lookahead(word+w_idx, (n))))
  /* Allows us to safely look ahead an arbitrary # of letters */
  /* I probably could have just used strlen... */
  /* Returns the character `how_far` positions past `word`, or the string's
   * terminating '\0' when the string ends before that position. */
  static char Lookahead(unsigned char *word, int how_far) {
    int idx = 0;
    /* Edge forward in the string, stopping early at the terminator */
    while (word[idx] != '\0' && idx < how_far) {
      ++idx;
    }
    /* idx is now either == how_far or at the end of the string */
    return (char)word[idx];
  }
  /* phonize one letter
   * Appends one character to `buffer`, which must be in scope where the
   * macro is used and provide append(); growing the storage is left to
   * that object. */
  #define Phonize(c) { buffer.append(c); }
  /* How long is the phoned word? */
  #define Phone_Len (buffer.size())

  /* Note if a letter is a 'break' in the word.
   * The argument is converted to unsigned char first: handing a negative
   * char value to isalpha() is undefined behavior. */
  #define Isbreak(c) (!isalpha((unsigned char)(c)))
  1671. String string_metaphone(const char *input, int word_len, long max_phonemes,
  1672. int traditional) {
  1673. unsigned char *word = (unsigned char *)input;
  1674. int w_idx = 0; /* point in the phonization we're at. */
  1675. int max_buffer_len = 0; /* maximum length of the destination buffer */
  1676. /*-- Parameter checks --*/
  1677. /* Negative phoneme length is meaningless */
  1678. if (max_phonemes < 0)
  1679. return String();
  1680. /* Empty/null string is meaningless */
  1681. /* Overly paranoid */
  1682. /* always_assert(word != NULL && word[0] != '\0'); */
  1683. if (word == nullptr)
  1684. return String();
  1685. /*-- Allocate memory for our phoned_phrase --*/
  1686. if (max_phonemes == 0) { /* Assume largest possible */
  1687. max_buffer_len = word_len;
  1688. } else {
  1689. max_buffer_len = max_phonemes;
  1690. }
  1691. StringBuffer buffer(max_buffer_len);
  1692. /*-- The first phoneme has to be processed specially. --*/
  1693. /* Find our first letter */
  1694. for (; !isalpha(Curr_Letter); w_idx++) {
  1695. /* On the off chance we were given nothing but crap... */
  1696. if (Curr_Letter == '\0') {
  1697. return buffer.detach(); /* For testing */
  1698. }
  1699. }
  1700. switch (Curr_Letter) {
  1701. /* AE becomes E */
  1702. case 'A':
  1703. if (Next_Letter == 'E') {
  1704. Phonize('E');
  1705. w_idx += 2;
  1706. }
  1707. /* Remember, preserve vowels at the beginning */
  1708. else {
  1709. Phonize('A');
  1710. w_idx++;
  1711. }
  1712. break;
  1713. /* [GKP]N becomes N */
  1714. case 'G':
  1715. case 'K':
  1716. case 'P':
  1717. if (Next_Letter == 'N') {
  1718. Phonize('N');
  1719. w_idx += 2;
  1720. }

Large files files are truncated, but you can click here to view the full file