/hphp/runtime/base/zend-string.cpp
C++ | 2593 lines | 2132 code | 186 blank | 275 comment | 387 complexity | 107f089f48b874e6083c112535fa7315 MD5 | raw file
Possible License(s): LGPL-2.1, BSD-2-Clause, BSD-3-Clause, MPL-2.0-no-copyleft-exception, MIT, LGPL-2.0, Apache-2.0
Large files are truncated, but you can click here to view the full file
- /*
- +----------------------------------------------------------------------+
- | HipHop for PHP |
- +----------------------------------------------------------------------+
- | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
- | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
- +----------------------------------------------------------------------+
- | This source file is subject to version 2.00 of the Zend license, |
- | that is bundled with this package in the file LICENSE, and is |
- | available through the world-wide-web at the following url: |
- | http://www.zend.com/license/2_00.txt. |
- | If you did not receive a copy of the Zend license and are unable to |
- | obtain it through the world-wide-web, please send a note to |
- | license@zend.com so we can mail you a copy immediately. |
- +----------------------------------------------------------------------+
- */
- // NOTE: See also "hphp/zend/zend-string.*".
- #include "hphp/runtime/base/zend-string.h"
- #include "hphp/runtime/base/zend-printf.h"
- #include "hphp/util/lock.h"
- #include "hphp/util/overflow.h"
- #include "hphp/zend/zend-math.h"
- #include <algorithm>
- #include <cmath>
- #ifndef _MSC_VER
- #include <monetary.h>
- #endif
- #include "hphp/util/bstring.h"
- #include "hphp/runtime/base/exceptions.h"
- #include "hphp/runtime/base/string-buffer.h"
- #include "hphp/runtime/base/runtime-error.h"
- #include "hphp/runtime/base/string-util.h"
- #include "hphp/runtime/base/builtin-functions.h"
- #include <folly/portability/String.h>
- #define PHP_QPRINT_MAXL 75
- namespace HPHP {
- ///////////////////////////////////////////////////////////////////////////////
- // helpers
- void string_charmask(const char *sinput, int len, char *mask) {
- const unsigned char *input = (unsigned char *)sinput;
- const unsigned char *end;
- unsigned char c;
- memset(mask, 0, 256);
- for (end = input+len; input < end; input++) {
- c=*input;
- if ((input+3 < end) && input[1] == '.' && input[2] == '.'
- && input[3] >= c) {
- memset(mask+c, 1, input[3] - c + 1);
- input+=3;
- } else if ((input+1 < end) && input[0] == '.' && input[1] == '.') {
- /* Error, try to be as helpful as possible:
- (a range ending/starting with '.' won't be captured here) */
- if (end-len >= input) { /* there was no 'left' char */
- raise_invalid_argument_warning
- ("charlist: Invalid '..'-range, missing left of '..'");
- continue;
- }
- if (input+2 >= end) { /* there is no 'right' char */
- raise_invalid_argument_warning
- ("charlist: Invalid '..'-range, missing right of '..'");
- continue;
- }
- if (input[-1] > input[2]) { /* wrong order */
- raise_invalid_argument_warning
- ("charlist: '..'-range needs to be incrementing");
- continue;
- }
- /* FIXME: better error (a..b..c is the only left possibility?) */
- raise_invalid_argument_warning("charlist: Invalid '..'-range");
- continue;
- } else {
- mask[c]=1;
- }
- }
- }
- ///////////////////////////////////////////////////////////////////////////////
- void string_to_case(String& s, int (*tocase)(int)) {
- assertx(!s.isNull());
- assertx(tocase);
- auto data = s.mutableData();
- auto len = s.size();
- for (int i = 0; i < len; i++) {
- data[i] = tocase(data[i]);
- }
- }
- ///////////////////////////////////////////////////////////////////////////////
- #define STR_PAD_LEFT 0
- #define STR_PAD_RIGHT 1
- #define STR_PAD_BOTH 2
- String string_pad(const char *input, int len, int pad_length,
- const char *pad_string, int pad_str_len,
- int pad_type) {
- assertx(input);
- int num_pad_chars = pad_length - len;
- /* If resulting string turns out to be shorter than input string,
- we simply copy the input and return. */
- if (pad_length < 0 || num_pad_chars < 0) {
- return String(input, len, CopyString);
- }
- /* Setup the padding string values if specified. */
- if (pad_str_len == 0) {
- SystemLib::throwRuntimeExceptionObject(
- "Invalid argument: pad_string: (empty)");
- }
- String ret(pad_length, ReserveString);
- char *result = ret.mutableData();
- /* We need to figure out the left/right padding lengths. */
- int left_pad, right_pad;
- switch (pad_type) {
- case STR_PAD_RIGHT:
- left_pad = 0;
- right_pad = num_pad_chars;
- break;
- case STR_PAD_LEFT:
- left_pad = num_pad_chars;
- right_pad = 0;
- break;
- case STR_PAD_BOTH:
- left_pad = num_pad_chars / 2;
- right_pad = num_pad_chars - left_pad;
- break;
- default:
- SystemLib::throwRuntimeExceptionObject(
- folly::sformat("Invalid argument: pad_type: {}", pad_type));
- }
- /* First we pad on the left. */
- int result_len = 0;
- for (int i = 0; i < left_pad; i++) {
- result[result_len++] = pad_string[i % pad_str_len];
- }
- /* Then we copy the input string. */
- memcpy(result + result_len, input, len);
- result_len += len;
- /* Finally, we pad on the right. */
- for (int i = 0; i < right_pad; i++) {
- result[result_len++] = pad_string[i % pad_str_len];
- }
- ret.setSize(result_len);
- return ret;
- }
- ///////////////////////////////////////////////////////////////////////////////
- int string_find(const char *input, int len, char ch, int pos,
- bool case_sensitive) {
- assertx(input);
- if (pos < 0 || pos > len) {
- return -1;
- }
- const void *ptr;
- if (case_sensitive) {
- ptr = memchr(input + pos, ch, len - pos);
- } else {
- ptr = bstrcasechr(input + pos, ch, len - pos);
- }
- if (ptr != nullptr) {
- return (int)((const char *)ptr - input);
- }
- return -1;
- }
- int string_rfind(const char *input, int len, char ch, int pos,
- bool case_sensitive) {
- assertx(input);
- if (pos < -len || pos > len) {
- return -1;
- }
- const void *ptr;
- if (case_sensitive) {
- if (pos >= 0) {
- ptr = memrchr(input + pos, ch, len - pos);
- } else {
- ptr = memrchr(input, ch, len + pos + 1);
- }
- } else {
- if (pos >= 0) {
- ptr = bstrrcasechr(input + pos, ch, len - pos);
- } else {
- ptr = bstrrcasechr(input, ch, len + pos + 1);
- }
- }
- if (ptr != nullptr) {
- return (int)((const char *)ptr - input);
- }
- return -1;
- }
- int string_find(const char *input, int len, const char *s, int s_len,
- int pos, bool case_sensitive) {
- assertx(input);
- assertx(s);
- if (!s_len || pos < 0 || pos > len) {
- return -1;
- }
- void *ptr;
- if (case_sensitive) {
- ptr = (void*)string_memnstr(input + pos, s, s_len, input + len);
- } else {
- ptr = bstrcasestr(input + pos, len - pos, s, s_len);
- }
- if (ptr != nullptr) {
- return (int)((const char *)ptr - input);
- }
- return -1;
- }
- int string_rfind(const char *input, int len, const char *s, int s_len,
- int pos, bool case_sensitive) {
- assertx(input);
- assertx(s);
- if (!s_len || pos < -len || pos > len) {
- return -1;
- }
- void *ptr;
- if (case_sensitive) {
- if (pos >= 0) {
- ptr = bstrrstr(input + pos, len - pos, s, s_len);
- } else {
- ptr = bstrrstr(input, len + std::min(pos + s_len, 0), s, s_len);
- }
- } else {
- if (pos >= 0) {
- ptr = bstrrcasestr(input + pos, len - pos, s, s_len);
- } else {
- ptr = bstrrcasestr(input, len + std::min(pos + s_len, 0), s, s_len);
- }
- }
- if (ptr != nullptr) {
- return (int)((const char *)ptr - input);
- }
- return -1;
- }
/**
 * Binary-safe substring search.
 *
 * Looks for the `needle_len` bytes at `needle` inside [haystack, end).
 *
 * @param haystack    first byte of the searched range
 * @param needle      bytes to look for
 * @param needle_len  number of bytes in `needle`
 * @param end         one past the last byte of the searched range
 * @return pointer to the first match, or nullptr when there is none.  An
 *         empty (or negative-length) needle and a needle longer than the
 *         range both yield nullptr.
 */
const char *string_memnstr(const char *haystack, const char *needle,
                           int needle_len, const char *end) {
  // Reject degenerate inputs up front: the code below reads
  // needle[needle_len - 1] and computes end - needle_len, neither of which
  // is valid for an empty needle or one longer than the range.
  if (needle_len <= 0 || end - haystack < needle_len) {
    return nullptr;
  }

  const char first = needle[0];
  const char last = needle[needle_len - 1];
  // Last position at which a full match can still start.
  const char *const last_start = end - needle_len;

  for (const char *p = haystack; p <= last_start; ++p) {
    // Jump to the next candidate that starts with the needle's first byte.
    p = static_cast<const char *>(memchr(p, first, last_start - p + 1));
    if (p == nullptr) {
      return nullptr;
    }
    // Cheap last-byte check before comparing the remaining prefix.
    if (p[needle_len - 1] == last && memcmp(needle, p, needle_len - 1) == 0) {
      return p;
    }
  }
  return nullptr;
}
- String string_replace(const char *s, int len, int start, int length,
- const char *replacement, int len_repl) {
- assertx(s);
- assertx(replacement);
- assertx(len >= 0);
- // if "start" position is negative, count start position from the end
- // of the string
- if (start < 0) {
- start = len + start;
- if (start < 0) {
- start = 0;
- }
- }
- if (start > len) {
- start = len;
- }
- // if "length" position is negative, set it to the length
- // needed to stop that many chars from the end of the string
- if (length < 0) {
- length = (len - start) + length;
- if (length < 0) {
- length = 0;
- }
- }
- // check if length is too large
- if (length > len) {
- length = len;
- }
- // check if the length is too large adjusting for non-zero start
- // Write this way instead of start + length > len to avoid overflow
- if (length > len - start) {
- length = len - start;
- }
- String retString(len + len_repl - length, ReserveString);
- char *ret = retString.mutableData();
- int ret_len = 0;
- if (start) {
- memcpy(ret, s, start);
- ret_len += start;
- }
- if (len_repl) {
- memcpy(ret + ret_len, replacement, len_repl);
- ret_len += len_repl;
- }
- len -= (start + length);
- if (len) {
- memcpy(ret + ret_len, s + start + length, len);
- ret_len += len;
- }
- retString.setSize(ret_len);
- return retString;
- }
- String string_replace(const char *input, int len,
- const char *search, int len_search,
- const char *replacement, int len_replace,
- int &count, bool case_sensitive) {
- assertx(input);
- assertx(search && len_search);
- assertx(len >= 0);
- assertx(len_search >= 0);
- assertx(len_replace >= 0);
- if (len == 0) {
- return String();
- }
- req::vector<int> founds;
- founds.reserve(16);
- if (len_search == 1) {
- for (int pos = string_find(input, len, *search, 0, case_sensitive);
- pos >= 0;
- pos = string_find(input, len, *search, pos + len_search,
- case_sensitive)) {
- founds.push_back(pos);
- }
- } else {
- for (int pos = string_find(input, len, search, len_search, 0,
- case_sensitive);
- pos >= 0;
- pos = string_find(input, len, search, len_search,
- pos + len_search, case_sensitive)) {
- founds.push_back(pos);
- }
- }
- count = founds.size();
- if (count == 0) {
- return String(); // not found
- }
- int reserve;
- // Make sure the new size of the string wouldn't overflow int32_t. Don't
- // bother if the replacement wouldn't make the string longer.
- if (len_replace > len_search) {
- auto raise = [&] { raise_error("String too large"); };
- if (mul_overflow(len_replace - len_search, count)) {
- raise();
- }
- int diff = (len_replace - len_search) * count;
- if (add_overflow(len, diff)) {
- raise();
- }
- reserve = len + diff;
- } else {
- reserve = len + (len_replace - len_search) * count;
- }
- String retString(reserve, ReserveString);
- char *ret = retString.mutableData();
- char *p = ret;
- int pos = 0; // last position in input that hasn't been copied over yet
- int n;
- for (unsigned int i = 0; i < founds.size(); i++) {
- n = founds[i];
- if (n > pos) {
- n -= pos;
- memcpy(p, input, n);
- p += n;
- input += n;
- pos += n;
- }
- if (len_replace) {
- memcpy(p, replacement, len_replace);
- p += len_replace;
- }
- input += len_search;
- pos += len_search;
- }
- n = len;
- if (n > pos) {
- n -= pos;
- memcpy(p, input, n);
- p += n;
- }
- retString.setSize(p - ret);
- return retString;
- }
- ///////////////////////////////////////////////////////////////////////////////
- String string_chunk_split(const char *src, int srclen, const char *end,
- int endlen, int chunklen) {
- int chunks = srclen / chunklen; // complete chunks!
- int restlen = srclen - chunks * chunklen; /* srclen % chunklen */
- String ret(
- safe_address(
- chunks + 1,
- endlen,
- srclen
- ),
- ReserveString
- );
- char *dest = ret.mutableData();
- const char *p; char *q;
- const char *pMax = src + srclen - chunklen + 1;
- for (p = src, q = dest; p < pMax; ) {
- memcpy(q, p, chunklen);
- q += chunklen;
- memcpy(q, end, endlen);
- q += endlen;
- p += chunklen;
- }
- if (restlen) {
- memcpy(q, p, restlen);
- q += restlen;
- memcpy(q, end, endlen);
- q += endlen;
- }
- ret.setSize(q - dest);
- return ret;
- }
- ///////////////////////////////////////////////////////////////////////////////
- #define PHP_TAG_BUF_SIZE 1023
- /**
- * Check if tag is in a set of tags
- *
- * states:
- *
- * 0 start tag
- * 1 first non-whitespace char seen
- */
- static int string_tag_find(const char *tag, int len, const char *set) {
- char c, *n;
- const char *t;
- int state=0, done=0;
- char *norm;
- if (len <= 0) {
- return 0;
- }
- norm = (char *)req::malloc_noptrs(len+1);
- SCOPE_EXIT { req::free(norm); };
- n = norm;
- t = tag;
- c = tolower(*t);
- /*
- normalize the tag removing leading and trailing whitespace
- and turn any <a whatever...> into just <a> and any </tag>
- into <tag>
- */
- while (!done) {
- switch (c) {
- case '<':
- *(n++) = c;
- break;
- case '>':
- done =1;
- break;
- default:
- if (!isspace((int)c)) {
- if (state == 0) {
- state=1;
- }
- if (c != '/') {
- *(n++) = c;
- }
- } else {
- if (state == 1)
- done=1;
- }
- break;
- }
- c = tolower(*(++t));
- }
- *(n++) = '>';
- *n = '\0';
- if (strstr(set, norm)) {
- done=1;
- } else {
- done=0;
- }
- return done;
- }
- /**
- * A simple little state-machine to strip out html and php tags
- *
- * State 0 is the output state, State 1 means we are inside a
- * normal html tag and state 2 means we are inside a php tag.
- * State 3 is entered on '!' directly after the '<' that opened a tag,
- * and state 4 is entered from state 3 once the two preceding
- * characters are "!-" (i.e. an "<!--" comment has started).
- *
- * NOTE: in this implementation `state` is a local variable that starts
- * at 0 on every call, so no state is carried across calls.
- *
- * lc holds the last significant character read and br is a bracket
- * counter. depth counts additional '<' characters seen while already
- * inside a tag, and in_q holds the quote character that is currently
- * open inside a tag (0 when none is).
- *
- * When an allow string is passed in we keep track of the string
- * in state 1 and when the tag is closed check it against the
- * allow string to see if we should allow it.
- * swm: Added ability to strip <?xml tags without assuming it PHP
- * code.
- *
- * s, len: the input text.
- * allow, allow_len: list of tags to keep; tag buffering is only done
- * when allow_len is non-zero.
- * allow_tag_spaces: when false, a '<' that is followed by whitespace
- * is treated as an ordinary character.
- *
- * Returns a new string holding the characters that were kept.
- */
- String string_strip_tags(const char *s, const int len,
- const char *allow, const int allow_len,
- bool allow_tag_spaces) {
- const char *abuf, *p;
- char *rbuf, *tbuf, *tp, *rp, c, lc;
- int br, i=0, depth=0, in_q = 0;
- int state = 0, pos;
- assertx(s);
- assertx(allow);
- String retString(s, len, CopyString); // the result starts out as a copy of the input
- rbuf = retString.mutableData();
- String allowString;
- c = *s;
- lc = '\0';
- p = s; // read cursor into the input
- rp = rbuf; // write cursor into the result buffer
- br = 0;
- if (allow_len) {
- assertx(allow);
- allowString = String(allow_len, ReserveString);
- char *atmp = allowString.mutableData();
- for (const char *tmp = allow; *tmp; tmp++, atmp++) { // NOTE(review): stops at the NUL in allow, not at allow_len -- confirm callers keep the two in sync
- *atmp = tolower((int)*(const unsigned char *)tmp);
- }
- allowString.setSize(allow_len);
- abuf = allowString.data();
- tbuf = (char *)req::malloc_noptrs(PHP_TAG_BUF_SIZE+1); // buffer for the tag currently being read; released at the end of the function
- tp = tbuf;
- } else {
- abuf = nullptr;
- tbuf = tp = nullptr;
- }
- auto move = [&pos, &tbuf, &tp]() { // called before each write to tbuf: once PHP_TAG_BUF_SIZE or more bytes are buffered, reallocates with PHP_TAG_BUF_SIZE + 1 bytes of headroom
- if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
- pos = tp - tbuf;
- tbuf = (char*)req::realloc_noptrs(tbuf,
- (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
- tp = tbuf + pos;
- }
- };
- while (i < len) {
- switch (c) {
- case '\0':
- break;
- case '<':
- if (isspace(*(p + 1)) && !allow_tag_spaces) { // '<' followed by whitespace is handled as a regular character
- goto reg_char;
- }
- if (state == 0) {
- lc = '<';
- state = 1;
- if (allow_len) {
- move();
- *(tp++) = '<';
- }
- } else if (state == 1) {
- depth++;
- }
- break;
- case '(':
- if (state == 2) {
- if (lc != '"' && lc != '\'') {
- lc = '(';
- br++;
- }
- } else if (allow_len && state == 1) {
- move();
- *(tp++) = c;
- } else if (state == 0) {
- *(rp++) = c;
- }
- break;
- case ')':
- if (state == 2) {
- if (lc != '"' && lc != '\'') {
- lc = ')';
- br--;
- }
- } else if (allow_len && state == 1) {
- move();
- *(tp++) = c;
- } else if (state == 0) {
- *(rp++) = c;
- }
- break;
- case '>':
- if (depth) { // closes a nested '<' rather than the tag itself
- depth--;
- break;
- }
- if (in_q) { // '>' inside a quoted attribute value does not close the tag
- break;
- }
- switch (state) {
- case 1: /* HTML/XML */
- lc = '>';
- in_q = state = 0;
- if (allow_len) {
- move();
- *(tp++) = '>';
- *tp='\0';
- if (string_tag_find(tbuf, tp-tbuf, abuf)) { // tag is on the allow list: copy it to the output as buffered
- memcpy(rp, tbuf, tp-tbuf);
- rp += tp-tbuf;
- }
- tp = tbuf;
- }
- break;
- case 2: /* PHP */
- if (!br && lc != '\"' && *(p-1) == '?') {
- in_q = state = 0;
- tp = tbuf;
- }
- break;
- case 3: /* after "<!" (entered from the '!' case below) */
- in_q = state = 0;
- tp = tbuf;
- break;
- case 4: /* JavaScript/CSS/etc... */
- if (p >= s + 2 && *(p-1) == '-' && *(p-2) == '-') { // only "-->" ends state 4
- in_q = state = 0;
- tp = tbuf;
- }
- break;
- default:
- *(rp++) = c;
- break;
- }
- break;
- case '"':
- case '\'':
- if (state == 4) {
- /* Inside <!-- comment --> */
- break;
- } else if (state == 2 && *(p-1) != '\\') {
- if (lc == c) {
- lc = '\0';
- } else if (lc != '\\') {
- lc = c;
- }
- } else if (state == 0) {
- *(rp++) = c;
- } else if (allow_len && state == 1) {
- move();
- *(tp++) = c;
- }
- if (state && p != s && *(p-1) != '\\' && (!in_q || *p == in_q)) { // toggle the open-quote marker on an unescaped quote inside a tag
- if (in_q) {
- in_q = 0;
- } else {
- in_q = *p;
- }
- }
- break;
- case '!':
- /* JavaScript & Other HTML scripting languages */
- if (state == 1 && *(p-1) == '<') {
- state = 3;
- lc = c;
- } else {
- if (state == 0) {
- *(rp++) = c;
- } else if (allow_len && state == 1) {
- move();
- *(tp++) = c;
- }
- }
- break;
- case '-':
- if (state == 3 && p >= s + 2 && *(p-1) == '-' && *(p-2) == '!') {
- state = 4;
- } else {
- goto reg_char;
- }
- break;
- case '?':
- if (state == 1 && *(p-1) == '<') {
- br=0;
- state=2;
- break;
- } /* fall-through */
- case 'E':
- case 'e':
- /* !DOCTYPE exception */
- if (state==3 && p > s+6
- && tolower(*(p-1)) == 'p'
- && tolower(*(p-2)) == 'y'
- && tolower(*(p-3)) == 't'
- && tolower(*(p-4)) == 'c'
- && tolower(*(p-5)) == 'o'
- && tolower(*(p-6)) == 'd') {
- state = 1;
- break;
- }
- /* fall-through */
- case 'l':
- /* swm: If we encounter '<?xml' then we shouldn't be in
- * state == 2 (PHP). Switch back to HTML.
- */
- if (state == 2 && p > s+2 && *(p-1) == 'm' && *(p-2) == 'x') {
- state = 1;
- break;
- }
- /* fall-through */
- default:
- reg_char:
- if (state == 0) {
- *(rp++) = c;
- } else if (allow_len && state == 1) {
- move();
- *(tp++) = c;
- }
- break;
- }
- c = *(++p);
- i++;
- }
- if (rp < rbuf + len) { // terminate the result when it is shorter than the input
- *rp = '\0';
- }
- if (allow_len) {
- req::free(tbuf);
- }
- retString.setSize(rp - rbuf);
- return retString;
- }
- ///////////////////////////////////////////////////////////////////////////////
/**
 * Converts one hexadecimal digit character to its numeric value.
 *
 * @return 0..15 for '0'-'9', 'A'-'F' and 'a'-'f'; -1 for anything else
 */
static char string_hex2int(int c) {
  int value = -1;
  if (isdigit(c)) {
    value = c - '0';
  } else if ('A' <= c && c <= 'F') {
    value = 10 + (c - 'A');
  } else if ('a' <= c && c <= 'f') {
    value = 10 + (c - 'a');
  }
  return static_cast<char>(value);
}
- String string_quoted_printable_encode(const char *input, int len) {
- size_t length = len;
- const unsigned char *str = (unsigned char*)input;
- unsigned long lp = 0;
- unsigned char c;
- char *d, *buffer;
- char *hex = "0123456789ABCDEF";
- String ret(
- safe_address(
- 3,
- length + ((safe_address(3, length, 0)/(PHP_QPRINT_MAXL-9)) + 1),
- 1),
- ReserveString
- );
- d = buffer = ret.mutableData();
- while (length--) {
- if (((c = *str++) == '\015') && (*str == '\012') && length > 0) {
- *d++ = '\015';
- *d++ = *str++;
- length--;
- lp = 0;
- } else {
- if (iscntrl (c) || (c == 0x7f) || (c & 0x80) ||
- (c == '=') || ((c == ' ') && (*str == '\015'))) {
- if ((((lp+= 3) > PHP_QPRINT_MAXL) && (c <= 0x7f))
- || ((c > 0x7f) && (c <= 0xdf) && ((lp + 3) > PHP_QPRINT_MAXL))
- || ((c > 0xdf) && (c <= 0xef) && ((lp + 6) > PHP_QPRINT_MAXL))
- || ((c > 0xef) && (c <= 0xf4) && ((lp + 9) > PHP_QPRINT_MAXL))) {
- *d++ = '=';
- *d++ = '\015';
- *d++ = '\012';
- lp = 3;
- }
- *d++ = '=';
- *d++ = hex[c >> 4];
- *d++ = hex[c & 0xf];
- } else {
- if ((++lp) > PHP_QPRINT_MAXL) {
- *d++ = '=';
- *d++ = '\015';
- *d++ = '\012';
- lp = 1;
- }
- *d++ = c;
- }
- }
- }
- len = d - buffer;
- ret.setSize(len);
- return ret;
- }
- String string_quoted_printable_decode(const char *input, int len, bool is_q) {
- assertx(input);
- if (len == 0) {
- return String();
- }
- int i = 0, j = 0, k;
- const char *str_in = input;
- String ret(len, ReserveString);
- char *str_out = ret.mutableData();
- while (i < len && str_in[i]) {
- switch (str_in[i]) {
- case '=':
- if (i + 2 < len && str_in[i + 1] && str_in[i + 2] &&
- isxdigit((int) str_in[i + 1]) && isxdigit((int) str_in[i + 2]))
- {
- str_out[j++] = (string_hex2int((int) str_in[i + 1]) << 4)
- + string_hex2int((int) str_in[i + 2]);
- i += 3;
- } else /* check for soft line break according to RFC 2045*/ {
- k = 1;
- while (str_in[i + k] &&
- ((str_in[i + k] == 32) || (str_in[i + k] == 9))) {
- /* Possibly, skip spaces/tabs at the end of line */
- k++;
- }
- if (!str_in[i + k]) {
- /* End of line reached */
- i += k;
- }
- else if ((str_in[i + k] == 13) && (str_in[i + k + 1] == 10)) {
- /* CRLF */
- i += k + 2;
- }
- else if ((str_in[i + k] == 13) || (str_in[i + k] == 10)) {
- /* CR or LF */
- i += k + 1;
- }
- else {
- str_out[j++] = str_in[i++];
- }
- }
- break;
- case '_':
- if (is_q) {
- str_out[j++] = ' ';
- i++;
- } else {
- str_out[j++] = str_in[i++];
- }
- break;
- default:
- str_out[j++] = str_in[i++];
- }
- }
- ret.setSize(j);
- return ret;
- }
- Variant string_base_to_numeric(const char *s, int len, int base) {
- int64_t num = 0;
- double fnum = 0;
- int mode = 0;
- int64_t cutoff;
- int cutlim;
- assertx(string_validate_base(base));
- cutoff = LONG_MAX / base;
- cutlim = LONG_MAX % base;
- for (int i = len; i > 0; i--) {
- char c = *s++;
- /* might not work for EBCDIC */
- if (c >= '0' && c <= '9')
- c -= '0';
- else if (c >= 'A' && c <= 'Z')
- c -= 'A' - 10;
- else if (c >= 'a' && c <= 'z')
- c -= 'a' - 10;
- else
- continue;
- if (c >= base)
- continue;
- switch (mode) {
- case 0: /* Integer */
- if (num < cutoff || (num == cutoff && c <= cutlim)) {
- num = num * base + c;
- break;
- } else {
- fnum = num;
- mode = 1;
- }
- /* fall-through */
- case 1: /* Float */
- fnum = fnum * base + c;
- }
- }
- if (mode == 1) {
- return fnum;
- }
- return num;
- }
- String string_long_to_base(unsigned long value, int base) {
- static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
- char buf[(sizeof(unsigned long) << 3) + 1];
- char *ptr, *end;
- assertx(string_validate_base(base));
- end = ptr = buf + sizeof(buf) - 1;
- do {
- *--ptr = digits[value % base];
- value /= base;
- } while (ptr > buf && value);
- return String(ptr, end - ptr, CopyString);
- }
- String string_numeric_to_base(const Variant& value, int base) {
- static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
- assertx(string_validate_base(base));
- if ((!value.isInteger() && !value.isDouble())) {
- return empty_string();
- }
- if (value.isDouble()) {
- double fvalue = floor(value.toDouble()); /* floor it just in case */
- char *ptr, *end;
- char buf[(sizeof(double) << 3) + 1];
- /* Don't try to convert +/- infinity */
- if (fvalue == HUGE_VAL || fvalue == -HUGE_VAL) {
- raise_warning("Number too large");
- return empty_string();
- }
- end = ptr = buf + sizeof(buf) - 1;
- do {
- *--ptr = digits[(int) fmod(fvalue, base)];
- fvalue /= base;
- } while (ptr > buf && fabs(fvalue) >= 1);
- return String(ptr, end - ptr, CopyString);
- }
- return string_long_to_base(value.toInt64(), base);
- }
- ///////////////////////////////////////////////////////////////////////////////
- // uuencode
// Maps a 6-bit value to its uuencode character; 0 is written as '`'.
#define PHP_UU_ENC(c) \
  ((c) ? ((c) & 077) + ' ' : '`')

// Second output character of a 3-byte group at `c`: low 2 bits of byte 0
// followed by the high 4 bits of byte 1.
#define PHP_UU_ENC_C2(c) \
  PHP_UU_ENC(((*(c) * 16) & 060) | ((*((c) + 1) >> 4) & 017))

// Third output character of a 3-byte group at `c`: low 4 bits of byte 1
// followed by the high 2 bits of byte 2.  The argument is parenthesised
// everywhere so that any pointer expression can be passed safely.
#define PHP_UU_ENC_C3(c) \
  PHP_UU_ENC(((*((c) + 1) * 4) & 074) | ((*((c) + 2) >> 6) & 03))

// Maps a uuencode character back to its 6-bit value.
#define PHP_UU_DEC(c) \
  (((c) - ' ') & 077)
- String string_uuencode(const char *src, int src_len) {
- assertx(src);
- assertx(src_len);
- int len = 45;
- char *p;
- const char *s, *e, *ee;
- char *dest;
- /* encoded length is ~ 38% greater than the original */
- String ret((int)ceil(src_len * 1.38) + 45, ReserveString);
- p = dest = ret.mutableData();
- s = src;
- e = src + src_len;
- while ((s + 3) < e) {
- ee = s + len;
- if (ee > e) {
- ee = e;
- len = ee - s;
- if (len % 3) {
- ee = s + (int) (floor(len / 3) * 3);
- }
- }
- *p++ = PHP_UU_ENC(len);
- while (s < ee) {
- *p++ = PHP_UU_ENC(*s >> 2);
- *p++ = PHP_UU_ENC_C2(s);
- *p++ = PHP_UU_ENC_C3(s);
- *p++ = PHP_UU_ENC(*(s + 2) & 077);
- s += 3;
- }
- if (len == 45) {
- *p++ = '\n';
- }
- }
- if (s < e) {
- if (len == 45) {
- *p++ = PHP_UU_ENC(e - s);
- len = 0;
- }
- *p++ = PHP_UU_ENC(*s >> 2);
- *p++ = PHP_UU_ENC_C2(s);
- *p++ = ((e - s) > 1) ? PHP_UU_ENC_C3(s) : PHP_UU_ENC('\0');
- *p++ = ((e - s) > 2) ? PHP_UU_ENC(*(s + 2) & 077) : PHP_UU_ENC('\0');
- }
- if (len < 45) {
- *p++ = '\n';
- }
- *p++ = PHP_UU_ENC('\0');
- *p++ = '\n';
- *p = '\0';
- ret.setSize(p - dest);
- return ret;
- }
- String string_uudecode(const char *src, int src_len) {
- int total_len = 0;
- int len;
- const char *s, *e, *ee;
- char *p, *dest;
- String ret(ceil(src_len * 0.75), ReserveString);
- p = dest = ret.mutableData();
- s = src;
- e = src + src_len;
- while (s < e) {
- if ((len = PHP_UU_DEC(*s++)) <= 0) {
- break;
- }
- /* sanity check */
- if (len > src_len) {
- goto err;
- }
- total_len += len;
- ee = s + (len == 45 ? 60 : (int) floor(len * 1.33));
- /* sanity check */
- if (ee > e) {
- goto err;
- }
- while (s < ee) {
- if (s + 4 > e) goto err;
- *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
- *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
- *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
- s += 4;
- }
- if (len < 45) {
- break;
- }
- /* skip \n */
- s++;
- }
- if ((len = total_len > (p - dest))) {
- *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
- if (len > 1) {
- *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
- if (len > 2) {
- *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
- }
- }
- }
- ret.setSize(total_len);
- return ret;
- err:
- return String();
- }
- ///////////////////////////////////////////////////////////////////////////////
- // base64
- namespace {
- const char base64_table[] = { // 6-bit value (0..63) -> base64 alphabet character; a '\0' entry follows the 64 characters
- 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
- 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
- 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
- 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
- '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', '\0'
- };
- const char base64_pad = '='; // padding character used at the end of encoded data
- const short base64_reverse_table[256] = { // byte value -> 6-bit value; -1 marks tab, LF, CR and space, -2 marks every other byte outside the alphabet
- -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2,
- -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
- -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 63,
- 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2,
- -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2,
- -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
- 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2,
- -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
- -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
- -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
- -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
- -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
- -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
- -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
- -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
- };
- folly::Optional<int> maxEncodedSize(int length) {
- if ((length + 2) < 0 || ((length + 2) / 3) >= (1 << (sizeof(int) * 8 - 2))) {
- return folly::none;
- }
- return ((length + 2) / 3) * 4;
- }
- // outstr must be at least maxEncodedSize(length) bytes
- size_t php_base64_encode(const unsigned char *str, int length,
- unsigned char* outstr) {
- const unsigned char *current = str;
- unsigned char *p = outstr;
- while (length > 2) { /* keep going until we have less than 24 bits */
- *p++ = base64_table[current[0] >> 2];
- *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
- *p++ = base64_table[((current[1] & 0x0f) << 2) + (current[2] >> 6)];
- *p++ = base64_table[current[2] & 0x3f];
- current += 3;
- length -= 3; /* we just handle 3 octets of data */
- }
- /* now deal with the tail end of things */
- if (length != 0) {
- *p++ = base64_table[current[0] >> 2];
- if (length > 1) {
- *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
- *p++ = base64_table[(current[1] & 0x0f) << 2];
- *p++ = base64_pad;
- } else {
- *p++ = base64_table[(current[0] & 0x03) << 4];
- *p++ = base64_pad;
- *p++ = base64_pad;
- }
- }
- return p - outstr;
- }
- /* Decodes base64 text from str into outstr.
- * outstr must be at least length bytes.
- * Reading stops at the first NUL byte or once length characters have
- * been consumed. With strict set, a byte outside the base64 alphabet
- * (other than tab, LF, CR, space) makes the decode fail; otherwise such
- * bytes are skipped.
- * Returns the number of bytes written, or -1 when the input is rejected. */
- ssize_t php_base64_decode(const char *str, int length, bool strict,
- unsigned char* outstr) {
- const unsigned char *current = (unsigned char*)str;
- int ch, i = 0, j = 0, k; // i counts accepted alphabet characters, j is the write index into result
- /* output is written directly into the caller's buffer */
- unsigned char* result = outstr;
- /* run through the whole string, converting as we go */
- while ((ch = *current++) != '\0' && length-- > 0) { // note: the next byte is read before the length check
- if (ch == base64_pad) {
- if (*current != '=' && ((i % 4) == 1 || (strict && length > 0))) { // a pad not followed by another pad, either after a lone character or (strict) with input remaining
- if ((i % 4) != 1) {
- while (isspace(*(++current))) { // skip whitespace following the pad
- continue;
- }
- if (*current == '\0') { // only whitespace followed the pad: go back to the outer loop
- continue;
- }
- }
- return -1;
- }
- continue;
- }
- ch = base64_reverse_table[ch];
- if ((!strict && ch < 0) || ch == -1) {
- /* a space or some other separator character, we simply skip over */
- continue;
- } else if (ch == -2) { // only reachable in strict mode: byte is not part of the alphabet
- return -1;
- }
- switch(i % 4) { // each accepted character contributes 6 bits; four of them make three output bytes
- case 0:
- result[j] = ch << 2;
- break;
- case 1:
- result[j++] |= ch >> 4;
- result[j] = (ch & 0x0f) << 4;
- break;
- case 2:
- result[j++] |= ch >>2;
- result[j] = (ch & 0x03) << 6;
- break;
- case 3:
- result[j++] |= ch;
- break;
- }
- i++;
- }
- k = j;
- /* mop things up if we ended on a boundary */
- if (ch == base64_pad) {
- switch(i % 4) {
- case 1:
- return -1;
- case 2:
- k++; /* fall-through */
- case 3:
- result[k] = 0;
- }
- }
- return j;
- }
- }
- String string_base64_encode(const char* input, int len) {
- if (auto const wantedSize = maxEncodedSize(len)) {
- String ret(*wantedSize, ReserveString);
- auto actualSize = php_base64_encode((unsigned char*)input, len,
- (unsigned char*)ret.mutableData());
- ret.setSize(actualSize);
- return ret;
- }
- return String();
- }
- String string_base64_decode(const char* input, int len, bool strict) {
- String ret(len, ReserveString);
- auto actualSize = php_base64_decode(input, len, strict,
- (unsigned char*)ret.mutableData());
- if (actualSize < 0) return String();
- ret.setSize(actualSize);
- return ret;
- }
- std::string base64_encode(const char* input, int len) {
- if (auto const wantedSize = maxEncodedSize(len)) {
- std::string ret;
- ret.resize(*wantedSize);
- auto actualSize = php_base64_encode((unsigned char*)input, len,
- (unsigned char*)ret.data());
- ret.resize(actualSize);
- return ret;
- }
- return std::string();
- }
- std::string base64_decode(const char* input, int len, bool strict) {
- if (!len) return std::string();
- std::string ret;
- ret.resize(len);
- auto actualSize = php_base64_decode(input, len, strict,
- (unsigned char*)ret.data());
- if (!actualSize) return std::string();
- ret.resize(actualSize);
- return ret;
- }
- ///////////////////////////////////////////////////////////////////////////////
- String string_escape_shell_arg(const char *str) {
- int x, y, l;
- char *cmd;
- y = 0;
- l = strlen(str);
- String ret(safe_address(l, 4, 3), ReserveString); /* worst case */
- cmd = ret.mutableData();
- #ifdef _MSC_VER
- cmd[y++] = '"';
- #else
- cmd[y++] = '\'';
- #endif
- for (x = 0; x < l; x++) {
- switch (str[x]) {
- #ifdef _MSC_VER
- case '"':
- case '%':
- case '!':
- cmd[y++] = ' ';
- break;
- #else
- case '\'':
- cmd[y++] = '\'';
- cmd[y++] = '\\';
- cmd[y++] = '\'';
- #endif
- /* fall-through */
- default:
- cmd[y++] = str[x];
- }
- }
- #ifdef _MSC_VER
- if (y > 0 && '\\' == cmd[y - 1]) {
- int k = 0, n = y - 1;
- for (; n >= 0 && '\\' == cmd[n]; n--, k++);
- if (k % 2) {
- cmd[y++] = '\\';
- }
- }
- cmd[y++] = '"';
- #else
- cmd[y++] = '\'';
- #endif
- ret.setSize(y);
- return ret;
- }
- String string_escape_shell_cmd(const char *str) {
- register int x, y, l;
- char *cmd;
- char *p = nullptr;
- l = strlen(str);
- String ret(safe_address(l, 2, 1), ReserveString);
- cmd = ret.mutableData();
- for (x = 0, y = 0; x < l; x++) {
- switch (str[x]) {
- #ifndef _MSC_VER
- case '"':
- case '\'':
- if (!p && (p = (char *)memchr(str + x + 1, str[x], l - x - 1))) {
- /* noop */
- } else if (p && *p == str[x]) {
- p = nullptr;
- } else {
- cmd[y++] = '\\';
- }
- cmd[y++] = str[x];
- break;
- #else
- /* % is Windows specific for environmental variables, ^%PATH% will
- output PATH while ^%PATH^% will not. escapeshellcmd->val will
- escape all % and !.
- */
- case '%':
- case '!':
- case '"':
- case '\'':
- #endif
- case '#': /* This is character-set independent */
- case '&':
- case ';':
- case '`':
- case '|':
- case '*':
- case '?':
- case '~':
- case '<':
- case '>':
- case '^':
- case '(':
- case ')':
- case '[':
- case ']':
- case '{':
- case '}':
- case '$':
- case '\\':
- case '\x0A': /* excluding these two */
- case '\xFF':
- #ifdef _MSC_VER
- cmd[y++] = '^';
- #else
- cmd[y++] = '\\';
- #endif
- /* fall-through */
- default:
- cmd[y++] = str[x];
- }
- }
- ret.setSize(y);
- return ret;
- }
- ///////////////////////////////////////////////////////////////////////////////
/**
 * Find the longest run of bytes common to txt1[0, len1) and txt2[0, len2).
 *
 * On return *max holds the length of that run (0 when nothing matches). When
 * *max is non-zero, *pos1 and *pos2 hold the offsets of the run in txt1 and
 * txt2. When several runs share the maximal length, the first one met while
 * scanning txt1 (outer loop) and then txt2 (inner loop) is reported. *pos1
 * and *pos2 are left untouched if no byte matches.
 */
static void string_similar_str(const char *txt1, int len1,
                               const char *txt2, int len2,
                               int *pos1, int *pos2, int *max) {
  *max = 0;
  for (int i = 0; i < len1; ++i) {
    for (int j = 0; j < len2; ++j) {
      /* length of the common run starting at txt1[i] / txt2[j] */
      int run = 0;
      while (i + run < len1 && j + run < len2 &&
             txt1[i + run] == txt2[j + run]) {
        ++run;
      }
      if (run > *max) {
        *max = run;
        *pos1 = i;
        *pos2 = j;
      }
    }
  }
}
- static int string_similar_char(const char *txt1, int len1,
- const char *txt2, int len2) {
- int sum;
- int pos1 = 0, pos2 = 0, max;
- string_similar_str(txt1, len1, txt2, len2, &pos1, &pos2, &max);
- if ((sum = max)) {
- if (pos1 && pos2) {
- sum += string_similar_char(txt1, pos1, txt2, pos2);
- }
- if ((pos1 + max < len1) && (pos2 + max < len2)) {
- sum += string_similar_char(txt1 + pos1 + max, len1 - pos1 - max,
- txt2 + pos2 + max, len2 - pos2 - max);
- }
- }
- return sum;
- }
- int string_similar_text(const char *t1, int len1,
- const char *t2, int len2, double *percent) {
- if (len1 == 0 && len2 == 0) {
- if (percent) *percent = 0.0;
- return 0;
- }
- int sim = string_similar_char(t1, len1, t2, len2);
- if (percent) *percent = sim * 200.0 / (len1 + len2);
- return sim;
- }
- ///////////////////////////////////////////////////////////////////////////////
- #define LEVENSHTEIN_MAX_LENTH 255
- // reference implementation, only optimized for memory usage, not speed
- int string_levenshtein(const char *s1, int l1, const char *s2, int l2,
- int cost_ins, int cost_rep, int cost_del ) {
- int *p1, *p2, *tmp;
- int i1, i2, c0, c1, c2;
- if (l1==0) return l2*cost_ins;
- if (l2==0) return l1*cost_del;
- if ((l1>LEVENSHTEIN_MAX_LENTH)||(l2>LEVENSHTEIN_MAX_LENTH)) {
- raise_warning("levenshtein(): Argument string(s) too long");
- return -1;
- }
- p1 = (int*)req::malloc_noptrs((l2+1) * sizeof(int));
- SCOPE_EXIT { req::free(p1); };
- p2 = (int*)req::malloc_noptrs((l2+1) * sizeof(int));
- SCOPE_EXIT { req::free(p2); };
- for(i2=0;i2<=l2;i2++) {
- p1[i2] = i2*cost_ins;
- }
- for(i1=0;i1<l1;i1++) {
- p2[0]=p1[0]+cost_del;
- for(i2=0;i2<l2;i2++) {
- c0=p1[i2]+((s1[i1]==s2[i2])?0:cost_rep);
- c1=p1[i2+1]+cost_del; if (c1<c0) c0=c1;
- c2=p2[i2]+cost_ins; if (c2<c0) c0=c2;
- p2[i2+1]=c0;
- }
- tmp=p1; p1=p2; p2=tmp;
- }
- c0=p1[l2];
- return c0;
- }
- ///////////////////////////////////////////////////////////////////////////////
- String string_money_format(const char *format, double value) {
- bool check = false;
- const char *p = format;
- while ((p = strchr(p, '%'))) {
- if (*(p + 1) == '%') {
- p += 2;
- } else if (!check) {
- check = true;
- p++;
- } else {
- raise_invalid_argument_warning
- ("format: Only a single %%i or %%n token can be used");
- return String();
- }
- }
- int format_len = strlen(format);
- int str_len = safe_address(format_len, 1, 1024);
- String ret(str_len, ReserveString);
- char *str = ret.mutableData();
- if ((str_len = strfmon(str, str_len, format, value)) < 0) {
- return String();
- }
- ret.setSize(str_len);
- return ret;
- }
- ///////////////////////////////////////////////////////////////////////////////
- String string_number_format(double d, int dec,
- const String& dec_point,
- const String& thousand_sep) {
- char *tmpbuf = nullptr, *resbuf;
- char *s, *t; /* source, target */
- char *dp;
- int integral;
- int tmplen, reslen=0;
- int count=0;
- int is_negative=0;
- if (d < 0) {
- is_negative = 1;
- d = -d;
- }
- if (dec < 0) dec = 0;
- d = php_math_round(d, dec);
- // departure from PHP: we got rid of dependencies on spprintf() here.
- // This actually means 63 bytes for characters + 1 byte for '\0'
- String tmpstr(63, ReserveString);
- tmpbuf = tmpstr.mutableData();
- tmplen = snprintf(tmpbuf, 64, "%.*F", dec, d);
- // From the man page of snprintf, the return value is:
- // The number of characters that would have been written if n had been
- // sufficiently large, not counting the terminating null character.
- if (tmplen < 0) return empty_string();
- if (tmplen < 64 && (tmpbuf == nullptr || !isdigit((int)tmpbuf[0]))) {
- tmpstr.setSize(tmplen);
- return tmpstr;
- }
- if (tmplen >= 64) {
- // Uncommon, asked for more than 64 chars worth of precision
- tmpstr = String(tmplen, ReserveString);
- tmpbuf = tmpstr.mutableData();
- tmplen = snprintf(tmpbuf, tmplen + 1, "%.*F", dec, d);
- if (tmplen < 0) return empty_string();
- if (tmpbuf == nullptr || !isdigit((int)tmpbuf[0])) {
- tmpstr.setSize(tmplen);
- return tmpstr;
- }
- }
- /* find decimal point, if expected */
- if (dec) {
- dp = strpbrk(tmpbuf, ".,");
- } else {
- dp = nullptr;
- }
- /* calculate the length of the return buffer */
- if (dp) {
- integral = dp - tmpbuf;
- } else {
- /* no decimal point was found */
- integral = tmplen;
- }
- /* allow for thousand separators */
- if (!thousand_sep.empty()) {
- if (integral + thousand_sep.size() * ((integral-1) / 3) < integral) {
- /* overflow */
- raise_error("String overflow");
- }
- integral += ((integral-1) / 3) * thousand_sep.size();
- }
- reslen = integral;
- if (dec) {
- reslen += dec;
- if (!dec_point.empty()) {
- if (reslen + dec_point.size() < dec_point.size()) {
- /* overflow */
- raise_error("String overflow");
- }
- reslen += dec_point.size();
- }
- }
- /* add a byte for minus sign */
- if (is_negative) {
- reslen++;
- }
- String resstr(reslen, ReserveString);
- resbuf = resstr.mutableData();
- s = tmpbuf+tmplen-1;
- t = resbuf+reslen-1;
- /* copy the decimal places.
- * Take care, as the sprintf implementation may return less places than
- * we requested due to internal buffer limitations */
- if (dec) {
- int declen = dp ? s - dp : 0;
- int topad = dec > declen ? dec - declen : 0;
- /* pad with '0's */
- while (topad--) {
- *t-- = '0';
- }
- if (dp) {
- s -= declen + 1; /* +1 to skip the point */
- t -= declen;
- /* now copy the chars after the point */
- memcpy(t + 1, dp + 1, declen);
- }
- /* add decimal point */
- if (!dec_point.empty()) {
- memcpy(t + (1 - dec_point.size()), dec_point.data(), dec_point.size());
- t -= dec_point.size();
- }
- }
- /* copy the numbers before the decimal point, adding thousand
- * separator every three digits */
- while(s >= tmpbuf) {
- *t-- = *s--;
- if (thousand_sep && (++count%3)==0 && s>=tmpbuf) {
- memcpy(t + (1 - thousand_sep.size()),
- thousand_sep.data(),
- thousand_sep.size());
- t -= thousand_sep.size();
- }
- }
- /* and a minus sign, if needed */
- if (is_negative) {
- *t-- = '-';
- }
- resstr.setSize(reslen);
- return resstr;
- }
- ///////////////////////////////////////////////////////////////////////////////
- // soundex
- /* Simple soundex algorithm as described by Knuth in TAOCP, vol 3 */
- String string_soundex(const String& str) {
- assertx(!str.empty());
- int _small, code, last;
- String retString(4, ReserveString);
- char* soundex = retString.mutableData();
- static char soundex_table[26] = {
- 0, /* A */
- '1', /* B */
- '2', /* C */
- '3', /* D */
- 0, /* E */
- '1', /* F */
- '2', /* G */
- 0, /* H */
- 0, /* I */
- '2', /* J */
- '2', /* K */
- '4', /* L */
- '5', /* M */
- '5', /* N */
- 0, /* O */
- '1', /* P */
- '2', /* Q */
- '6', /* R */
- '2', /* S */
- '3', /* T */
- 0, /* U */
- '1', /* V */
- 0, /* W */
- '2', /* X */
- 0, /* Y */
- '2' /* Z */
- };
- /* build soundex string */
- last = -1;
- auto p = str.slice().data();
- for (_small = 0; *p && _small < 4; p++) {
- /* convert chars to upper case and strip non-letter chars */
- /* BUG: should also map here accented letters used in non */
- /* English words or names (also found in English text!): */
- /* esstsett, thorn, n-tilde, c-cedilla, s-caron, ... */
- code = toupper((int)(unsigned char)(*p));
- if (code >= 'A' && code <= 'Z') {
- if (_small == 0) {
- /* remember first valid char */
- soundex[_small++] = code;
- last = soundex_table[code - 'A'];
- } else {
- /* ignore sequences of consonants with same soundex */
- /* code in trail, and vowels unless they separate */
- /* consonant letters */
- code = soundex_table[code - 'A'];
- if (code != last) {
- if (code != 0) {
- soundex[_small++] = code;
- }
- last = code;
- }
- }
- }
- }
- /* pad with '0' and terminate with 0 ;-) */
- while (_small < 4) {
- soundex[_small++] = '0';
- }
- retString.setSize(4);
- return retString;
- }
- ///////////////////////////////////////////////////////////////////////////////
- // metaphone
- /**
- * this is now the original code by Michael G Schwern:
- * i've changed it just a slightly bit (use emalloc,
- * get rid of includes etc)
- * - thies - 13.09.1999
- */
- /*----------------------------- */
- /* this used to be "metaphone.h" */
- /*----------------------------- */
- /* Special encodings */
- #define SH 'X'
- #define TH '0'
- /*----------------------------- */
- /* end of "metaphone.h" */
- /*----------------------------- */
- /*----------------------------- */
- /* this used to be "metachar.h" */
- /*----------------------------- */
- /* Metachar.h ... little bits about characters for metaphone */
- /*-- Character encoding array & accessing macros --*/
- /* Stolen directly out of the book... */
- char _codes[26] = { 1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0};
- #define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0)
- #define isvowel(c) (ENCODE(c) & 1) /* AEIOU */
- /* These letters are passed through unchanged */
- #define NOCHANGE(c) (ENCODE(c) & 2) /* FJMNR */
- /* These form diphthongs when preceding H */
- #define AFFECTH(c) (ENCODE(c) & 4) /* CGPST */
- /* These make C and G soft */
- #define MAKESOFT(c) (ENCODE(c) & 8) /* EIY */
- /* These prevent GH from becoming F */
- #define NOGHTOF(c) (ENCODE(c) & 16) /* BDH */
- /*----------------------------- */
- /* end of "metachar.h" */
- /*----------------------------- */
- /* I suppose I could have been using a character pointer instead of
- * accessing the array directly... */
- /* Look at the next letter in the word */
- #define Next_Letter ((char)toupper(word[w_idx+1]))
- /* Look at the current letter in the word */
- #define Curr_Letter ((char)toupper(word[w_idx]))
- /* Go N letters back. */
- #define Look_Back_Letter(n) (w_idx >= n ? (char)toupper(word[w_idx-n]) : '\0')
- /* Previous letter. I dunno, should this return null on failure? */
- #define Prev_Letter (Look_Back_Letter(1))
- /* Look two letters down. It makes sure you don't walk off the string. */
- #define After_Next_Letter (Next_Letter != '\0' ? (char)toupper(word[w_idx+2]) \
- : '\0')
- #define Look_Ahead_Letter(n) ((char)toupper(Lookahead(word+w_idx, n)))
- /* Allows us to safely look ahead an arbitrary # of letters */
- /* I probably could have just used strlen... */
/* Return the character `how_far` positions ahead of `word`, or the
 * terminating '\0' when the string ends before that position is reached.
 * The scan stops at the first NUL, so it never reads past the terminator. */
static char Lookahead(unsigned char *word, int how_far) {
  int offset = 0;
  /* Edge forward in the string... */
  while (word[offset] != '\0' && offset < how_far) {
    ++offset;
  }
  /* offset is now either == how_far or at the end of the string */
  return (char)word[offset];
}
- /* phonize one letter
- * We don't know the buffer's size in advance. One way to solve this is to just
- * re-allocate the buffer size. We're using an extra of 2 characters (this
- * could be one though; or more too). */
- #define Phonize(c) { buffer.append(c); }
- /* How long is the phoned word? */
- #define Phone_Len (buffer.size())
- /* Note if a letter is a 'break' in the word */
- #define Isbreak(c) (!isalpha(c))
- String string_metaphone(const char *input, int word_len, long max_phonemes,
- int traditional) {
- unsigned char *word = (unsigned char *)input;
- int w_idx = 0; /* point in the phonization we're at. */
- int max_buffer_len = 0; /* maximum length of the destination buffer */
- /*-- Parameter checks --*/
- /* Negative phoneme length is meaningless */
- if (max_phonemes < 0)
- return String();
- /* Empty/null string is meaningless */
- /* Overly paranoid */
- /* always_assert(word != NULL && word[0] != '\0'); */
- if (word == nullptr)
- return String();
- /*-- Allocate memory for our phoned_phrase --*/
- if (max_phonemes == 0) { /* Assume largest possible */
- max_buffer_len = word_len;
- } else {
- max_buffer_len = max_phonemes;
- }
- StringBuffer buffer(max_buffer_len);
- /*-- The first phoneme has to be processed specially. --*/
- /* Find our first letter */
- for (; !isalpha(Curr_Letter); w_idx++) {
- /* On the off chance we were given nothing but crap... */
- if (Curr_Letter == '\0') {
- return buffer.detach(); /* For testing */
- }
- }
- switch (Curr_Letter) {
- /* AE becomes E */
- case 'A':
- if (Next_Letter == 'E') {
- Phonize('E');
- w_idx += 2;
- }
- /* Remember, preserve vowels at the beginning */
- else {
- Phonize('A');
- w_idx++;
- }
- break;
- /* [GKP]N becomes N */
- case 'G':
- case 'K':
- case 'P':
- if (Next_Letter == 'N') {
- Phonize('N');
- w_idx += 2;
- }…
Large files files are truncated, but you can click here to view the full file