zend-string.cpp | searchcode

/hphp/runtime/base/zend-string.cpp

Large files files are truncated, but you can click here to view the full file

/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
   +----------------------------------------------------------------------+
   | This source file is subject to version 2.00 of the Zend license,     |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.zend.com/license/2_00.txt.                                |
   | If you did not receive a copy of the Zend license and are unable to  |
   | obtain it through the world-wide-web, please send a note to          |
   | license@zend.com so we can mail you a copy immediately.              |
   +----------------------------------------------------------------------+
*/

// NOTE: See also "hphp/zend/zend-string.*".

#include "hphp/runtime/base/zend-string.h"
#include "hphp/runtime/base/zend-printf.h"

#include "hphp/util/lock.h"
#include "hphp/util/overflow.h"

#include "hphp/zend/zend-math.h"

#include <algorithm>
#include <cmath>

#ifndef _MSC_VER
#include <monetary.h>
#endif

#include "hphp/util/bstring.h"
#include "hphp/runtime/base/exceptions.h"
#include "hphp/runtime/base/string-buffer.h"
#include "hphp/runtime/base/runtime-error.h"
#include "hphp/runtime/base/string-util.h"
#include "hphp/runtime/base/builtin-functions.h"

#include <folly/portability/String.h>

#define PHP_QPRINT_MAXL 75

namespace HPHP {
///////////////////////////////////////////////////////////////////////////////
// helpers

void string_charmask(const char *sinput, int len, char *mask) {
  const unsigned char *input = (unsigned char *)sinput;
  const unsigned char *end;
  unsigned char c;

  memset(mask, 0, 256);
  for (end = input+len; input < end; input++) {
    c=*input;
    if ((input+3 < end) && input[1] == '.' && input[2] == '.'
        && input[3] >= c) {
      memset(mask+c, 1, input[3] - c + 1);
      input+=3;
    } else if ((input+1 < end) && input[0] == '.' && input[1] == '.') {
      /* Error, try to be as helpful as possible:
         (a range ending/starting with '.' won't be captured here) */
      if (end-len >= input) { /* there was no 'left' char */
        raise_invalid_argument_warning
          ("charlist: Invalid '..'-range, missing left of '..'");
        continue;
      }
      if (input+2 >= end) { /* there is no 'right' char */
        raise_invalid_argument_warning
          ("charlist: Invalid '..'-range, missing right of '..'");
        continue;
      }
      if (input[-1] > input[2]) { /* wrong order */
        raise_invalid_argument_warning
          ("charlist: '..'-range needs to be incrementing");
        continue;
      }
      /* FIXME: better error (a..b..c is the only left possibility?) */
      raise_invalid_argument_warning("charlist: Invalid '..'-range");
      continue;
    } else {
      mask[c]=1;
    }
  }
}

///////////////////////////////////////////////////////////////////////////////

void string_to_case(String& s, int (*tocase)(int)) {
  assertx(!s.isNull());
  assertx(tocase);
  auto data = s.mutableData();
  auto len = s.size();
  for (int i = 0; i < len; i++) {
    data[i] = tocase(data[i]);
  }
}

///////////////////////////////////////////////////////////////////////////////

#define STR_PAD_LEFT            0
#define STR_PAD_RIGHT           1
#define STR_PAD_BOTH            2

String string_pad(const char *input, int len, int pad_length,
                  const char *pad_string, int pad_str_len,
                  int pad_type) {
  assertx(input);
  int num_pad_chars = pad_length - len;

  /* If resulting string turns out to be shorter than input string,
     we simply copy the input and return. */
  if (pad_length < 0 || num_pad_chars < 0) {
    return String(input, len, CopyString);
  }

  /* Setup the padding string values if specified. */
  if (pad_str_len == 0) {
    SystemLib::throwRuntimeExceptionObject(
      "Invalid argument: pad_string: (empty)");
  }

  String ret(pad_length, ReserveString);
  char *result = ret.mutableData();

  /* We need to figure out the left/right padding lengths. */
  int left_pad, right_pad;
  switch (pad_type) {
  case STR_PAD_RIGHT:
    left_pad = 0;
    right_pad = num_pad_chars;
    break;
  case STR_PAD_LEFT:
    left_pad = num_pad_chars;
    right_pad = 0;
    break;
  case STR_PAD_BOTH:
    left_pad = num_pad_chars / 2;
    right_pad = num_pad_chars - left_pad;
    break;
  default:
    SystemLib::throwRuntimeExceptionObject(
      folly::sformat("Invalid argument: pad_type: {}", pad_type));
  }

  /* First we pad on the left. */
  int result_len = 0;
  for (int i = 0; i < left_pad; i++) {
    result[result_len++] = pad_string[i % pad_str_len];
  }

  /* Then we copy the input string. */
  memcpy(result + result_len, input, len);
  result_len += len;

  /* Finally, we pad on the right. */
  for (int i = 0; i < right_pad; i++) {
    result[result_len++] = pad_string[i % pad_str_len];
  }
  ret.setSize(result_len);
  return ret;
}

///////////////////////////////////////////////////////////////////////////////

int string_find(const char *input, int len, char ch, int pos,
                bool case_sensitive) {
  assertx(input);
  if (pos < 0 || pos > len) {
    return -1;
  }
  const void *ptr;
  if (case_sensitive) {
    ptr = memchr(input + pos, ch, len - pos);
  } else {
    ptr = bstrcasechr(input + pos, ch, len - pos);
  }
  if (ptr != nullptr) {
    return (int)((const char *)ptr - input);
  }
  return -1;
}

int string_rfind(const char *input, int len, char ch, int pos,
                 bool case_sensitive) {
  assertx(input);
  if (pos < -len || pos > len) {
    return -1;
  }
  const void *ptr;
  if (case_sensitive) {
    if (pos >= 0) {
      ptr = memrchr(input + pos, ch, len - pos);
    } else {
      ptr = memrchr(input, ch, len + pos + 1);
    }
  } else {
    if (pos >= 0) {
      ptr = bstrrcasechr(input + pos, ch, len - pos);
    } else {
      ptr = bstrrcasechr(input, ch, len + pos + 1);
    }
  }
  if (ptr != nullptr) {
    return (int)((const char *)ptr - input);
  }
  return -1;
}

int string_find(const char *input, int len, const char *s, int s_len,
                int pos, bool case_sensitive) {
  assertx(input);
  assertx(s);
  if (!s_len || pos < 0 || pos > len) {
    return -1;
  }
  void *ptr;
  if (case_sensitive) {
    ptr = (void*)string_memnstr(input + pos, s, s_len, input + len);
  } else {
    ptr = bstrcasestr(input + pos, len - pos, s, s_len);
  }
  if (ptr != nullptr) {
    return (int)((const char *)ptr - input);
  }
  return -1;
}

int string_rfind(const char *input, int len, const char *s, int s_len,
                 int pos, bool case_sensitive) {
  assertx(input);
  assertx(s);
  if (!s_len || pos < -len || pos > len) {
    return -1;
  }
  void *ptr;
  if (case_sensitive) {
    if (pos >= 0) {
      ptr = bstrrstr(input + pos, len - pos, s, s_len);
    } else {
      ptr = bstrrstr(input, len + std::min(pos + s_len, 0), s, s_len);
    }
  } else {
    if (pos >= 0) {
      ptr = bstrrcasestr(input + pos, len - pos, s, s_len);
    } else {
      ptr = bstrrcasestr(input, len + std::min(pos + s_len, 0), s, s_len);
    }
  }
  if (ptr != nullptr) {
    return (int)((const char *)ptr - input);
  }
  return -1;
}

const char *string_memnstr(const char *haystack, const char *needle,
                           int needle_len, const char *end) {
  const char *p = haystack;
  char ne = needle[needle_len-1];

  end -= needle_len;
  while (p <= end) {
    if ((p = (char *)memchr(p, *needle, (end-p+1))) && ne == p[needle_len-1]) {
      if (!memcmp(needle, p, needle_len-1)) {
        return p;
      }
    }
    if (p == nullptr) {
      return nullptr;
    }
    p++;
  }
  return nullptr;
}

String string_replace(const char *s, int len, int start, int length,
                      const char *replacement, int len_repl) {
  assertx(s);
  assertx(replacement);
  assertx(len >= 0);

  // if "start" position is negative, count start position from the end
  // of the string
  if (start < 0) {
    start = len + start;
    if (start < 0) {
      start = 0;
    }
  }
  if (start > len) {
    start = len;
  }
  // if "length" position is negative, set it to the length
  // needed to stop that many chars from the end of the string
  if (length < 0) {
    length = (len - start) + length;
    if (length < 0) {
      length = 0;
    }
  }
  // check if length is too large
  if (length > len) {
    length = len;
  }
  // check if the length is too large adjusting for non-zero start
  // Write this way instead of start + length > len to avoid overflow
  if (length > len - start) {
    length = len - start;
  }

  String retString(len + len_repl - length, ReserveString);
  char *ret = retString.mutableData();

  int ret_len = 0;
  if (start) {
    memcpy(ret, s, start);
    ret_len += start;
  }
  if (len_repl) {
    memcpy(ret + ret_len, replacement, len_repl);
    ret_len += len_repl;
  }
  len -= (start + length);
  if (len) {
    memcpy(ret + ret_len, s + start + length, len);
    ret_len += len;
  }
  retString.setSize(ret_len);
  return retString;
}

String string_replace(const char *input, int len,
                      const char *search, int len_search,
                      const char *replacement, int len_replace,
                      int &count, bool case_sensitive) {
  assertx(input);
  assertx(search && len_search);
  assertx(len >= 0);
  assertx(len_search >= 0);
  assertx(len_replace >= 0);

  if (len == 0) {
    return String();
  }

  req::vector<int> founds;
  founds.reserve(16);
  if (len_search == 1) {
    for (int pos = string_find(input, len, *search, 0, case_sensitive);
         pos >= 0;
         pos = string_find(input, len, *search, pos + len_search,
                           case_sensitive)) {
      founds.push_back(pos);
    }
  } else {
    for (int pos = string_find(input, len, search, len_search, 0,
                               case_sensitive);
         pos >= 0;
         pos = string_find(input, len, search, len_search,
                           pos + len_search, case_sensitive)) {
      founds.push_back(pos);
    }
  }

  count = founds.size();
  if (count == 0) {
    return String(); // not found
  }

  int reserve;

  // Make sure the new size of the string wouldn't overflow int32_t. Don't
  // bother if the replacement wouldn't make the string longer.
  if (len_replace > len_search) {
    auto raise = [&] { raise_error("String too large"); };
    if (mul_overflow(len_replace - len_search, count)) {
      raise();
    }
    int diff = (len_replace - len_search) * count;
    if (add_overflow(len, diff)) {
      raise();
    }
    reserve = len + diff;
  } else {
    reserve = len + (len_replace - len_search) * count;
  }

  String retString(reserve, ReserveString);
  char *ret = retString.mutableData();
  char *p = ret;
  int pos = 0; // last position in input that hasn't been copied over yet
  int n;
  for (unsigned int i = 0; i < founds.size(); i++) {
    n = founds[i];
    if (n > pos) {
      n -= pos;
      memcpy(p, input, n);
      p += n;
      input += n;
      pos += n;
    }
    if (len_replace) {
      memcpy(p, replacement, len_replace);
      p += len_replace;
    }
    input += len_search;
    pos += len_search;
  }
  n = len;
  if (n > pos) {
    n -= pos;
    memcpy(p, input, n);
    p += n;
  }
  retString.setSize(p - ret);
  return retString;
}

///////////////////////////////////////////////////////////////////////////////

String string_chunk_split(const char *src, int srclen, const char *end,
                          int endlen, int chunklen) {
  int chunks = srclen / chunklen; // complete chunks!
  int restlen = srclen - chunks * chunklen; /* srclen % chunklen */

  String ret(
    safe_address(
      chunks + 1,
      endlen,
      srclen
    ),
    ReserveString
  );
  char *dest = ret.mutableData();

  const char *p; char *q;
  const char *pMax = src + srclen - chunklen + 1;
  for (p = src, q = dest; p < pMax; ) {
    memcpy(q, p, chunklen);
    q += chunklen;
    memcpy(q, end, endlen);
    q += endlen;
    p += chunklen;
  }

  if (restlen) {
    memcpy(q, p, restlen);
    q += restlen;
    memcpy(q, end, endlen);
    q += endlen;
  }

  ret.setSize(q - dest);
  return ret;
}

///////////////////////////////////////////////////////////////////////////////

#define PHP_TAG_BUF_SIZE 1023

/**
 * Check if tag is in a set of tags
 *
 * states:
 *
 * 0 start tag
 * 1 first non-whitespace char seen
 */
static int string_tag_find(const char *tag, int len, const char *set) {
  char c, *n;
  const char *t;
  int state=0, done=0;
  char *norm;

  if (len <= 0) {
    return 0;
  }

  norm = (char *)req::malloc_noptrs(len+1);
  SCOPE_EXIT { req::free(norm); };

  n = norm;
  t = tag;
  c = tolower(*t);
  /*
    normalize the tag removing leading and trailing whitespace
    and turn any <a whatever...> into just <a> and any </tag>
    into <tag>
  */
  while (!done) {
    switch (c) {
    case '<':
      *(n++) = c;
      break;
    case '>':
      done =1;
      break;
    default:
      if (!isspace((int)c)) {
        if (state == 0) {
          state=1;
        }
        if (c != '/') {
          *(n++) = c;
        }
      } else {
        if (state == 1)
          done=1;
      }
      break;
    }
    c = tolower(*(++t));
  }
  *(n++) = '>';
  *n = '\0';
  if (strstr(set, norm)) {
    done=1;
  } else {
    done=0;
  }
  return done;
}

/**
 * A simple little state-machine to strip out html and php tags
 *
 * State 0 is the output state, State 1 means we are inside a
 * normal html tag and state 2 means we are inside a php tag.
 *
 * The state variable is passed in to allow a function like fgetss
 * to maintain state across calls to the function.
 *
 * lc holds the last significant character read and br is a bracket
 * counter.
 *
 * When an allow string is passed in we keep track of the string
 * in state 1 and when the tag is closed check it against the
 * allow string to see if we should allow it.

 * swm: Added ability to strip <?xml tags without assuming it PHP
 * code.
 */
String string_strip_tags(const char *s, const int len,
                         const char *allow, const int allow_len,
                         bool allow_tag_spaces) {
  const char *abuf, *p;
  char *rbuf, *tbuf, *tp, *rp, c, lc;

  int br, i=0, depth=0, in_q = 0;
  int state = 0, pos;

  assertx(s);
  assertx(allow);

  String retString(s, len, CopyString);
  rbuf = retString.mutableData();
  String allowString;

  c = *s;
  lc = '\0';
  p = s;
  rp = rbuf;
  br = 0;
  if (allow_len) {
    assertx(allow);

    allowString = String(allow_len, ReserveString);
    char *atmp = allowString.mutableData();
    for (const char *tmp = allow; *tmp; tmp++, atmp++) {
      *atmp = tolower((int)*(const unsigned char *)tmp);
    }
    allowString.setSize(allow_len);
    abuf = allowString.data();

    tbuf = (char *)req::malloc_noptrs(PHP_TAG_BUF_SIZE+1);
    tp = tbuf;
  } else {
    abuf = nullptr;
    tbuf = tp = nullptr;
  }

  auto move = [&pos, &tbuf, &tp]() {
    if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
      pos = tp - tbuf;
      tbuf = (char*)req::realloc_noptrs(tbuf,
                                        (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
      tp = tbuf + pos;
    }
  };

  while (i < len) {
    switch (c) {
    case '\0':
      break;
    case '<':
      if (isspace(*(p + 1)) && !allow_tag_spaces) {
        goto reg_char;
      }
      if (state == 0) {
        lc = '<';
        state = 1;
        if (allow_len) {
          move();
          *(tp++) = '<';
        }
      } else if (state == 1) {
        depth++;
      }
      break;

    case '(':
      if (state == 2) {
        if (lc != '"' && lc != '\'') {
          lc = '(';
          br++;
        }
      } else if (allow_len && state == 1) {
        move();
        *(tp++) = c;
      } else if (state == 0) {
        *(rp++) = c;
      }
      break;

    case ')':
      if (state == 2) {
        if (lc != '"' && lc != '\'') {
          lc = ')';
          br--;
        }
      } else if (allow_len && state == 1) {
        move();
        *(tp++) = c;
      } else if (state == 0) {
        *(rp++) = c;
      }
      break;

    case '>':
      if (depth) {
        depth--;
        break;
      }

      if (in_q) {
        break;
      }

      switch (state) {
      case 1: /* HTML/XML */
        lc = '>';
        in_q = state = 0;
        if (allow_len) {
          move();
          *(tp++) = '>';
          *tp='\0';
          if (string_tag_find(tbuf, tp-tbuf, abuf)) {
            memcpy(rp, tbuf, tp-tbuf);
            rp += tp-tbuf;
          }
          tp = tbuf;
        }
        break;

      case 2: /* PHP */
        if (!br && lc != '\"' && *(p-1) == '?') {
          in_q = state = 0;
          tp = tbuf;
        }
        break;

      case 3:
        in_q = state = 0;
        tp = tbuf;
        break;

      case 4: /* JavaScript/CSS/etc... */
        if (p >= s + 2 && *(p-1) == '-' && *(p-2) == '-') {
          in_q = state = 0;
          tp = tbuf;
        }
        break;

      default:
        *(rp++) = c;
        break;
      }
      break;

    case '"':
    case '\'':
      if (state == 4) {
        /* Inside <!-- comment --> */
        break;
      } else if (state == 2 && *(p-1) != '\\') {
        if (lc == c) {
          lc = '\0';
        } else if (lc != '\\') {
          lc = c;
        }
      } else if (state == 0) {
        *(rp++) = c;
      } else if (allow_len && state == 1) {
        move();
        *(tp++) = c;
      }
      if (state && p != s && *(p-1) != '\\' && (!in_q || *p == in_q)) {
        if (in_q) {
          in_q = 0;
        } else {
          in_q = *p;
        }
      }
      break;

    case '!':
      /* JavaScript & Other HTML scripting languages */
      if (state == 1 && *(p-1) == '<') {
        state = 3;
        lc = c;
      } else {
        if (state == 0) {
          *(rp++) = c;
        } else if (allow_len && state == 1) {
          move();
          *(tp++) = c;
        }
      }
      break;

    case '-':
      if (state == 3 && p >= s + 2 && *(p-1) == '-' && *(p-2) == '!') {
        state = 4;
      } else {
        goto reg_char;
      }
      break;

    case '?':

      if (state == 1 && *(p-1) == '<') {
        br=0;
        state=2;
        break;
      }

    case 'E':
    case 'e':
      /* !DOCTYPE exception */
      if (state==3 && p > s+6
          && tolower(*(p-1)) == 'p'
          && tolower(*(p-2)) == 'y'
          && tolower(*(p-3)) == 't'
          && tolower(*(p-4)) == 'c'
          && tolower(*(p-5)) == 'o'
          && tolower(*(p-6)) == 'd') {
        state = 1;
        break;
      }
      /* fall-through */

    case 'l':

      /* swm: If we encounter '<?xml' then we shouldn't be in
       * state == 2 (PHP). Switch back to HTML.
       */

      if (state == 2 && p > s+2 && *(p-1) == 'm' && *(p-2) == 'x') {
        state = 1;
        break;
      }

      /* fall-through */
    default:
    reg_char:
      if (state == 0) {
        *(rp++) = c;
      } else if (allow_len && state == 1) {
        move();
        *(tp++) = c;
      }
      break;
    }
    c = *(++p);
    i++;
  }
  if (rp < rbuf + len) {
    *rp = '\0';
  }
  if (allow_len) {
    req::free(tbuf);
  }

  retString.setSize(rp - rbuf);
  return retString;
}

///////////////////////////////////////////////////////////////////////////////

static char string_hex2int(int c) {
  if (isdigit(c)) {
    return c - '0';
  }
  if (c >= 'A' && c <= 'F') {
    return c - 'A' + 10;
  }
  if (c >= 'a' && c <= 'f') {
    return c - 'a' + 10;
  }
  return -1;
}

String string_quoted_printable_encode(const char *input, int len) {
  size_t length = len;
  const unsigned char *str = (unsigned char*)input;

  unsigned long lp = 0;
  unsigned char c;
  char *d, *buffer;
  char *hex = "0123456789ABCDEF";

  String ret(
    safe_address(
      3,
      length + ((safe_address(3, length, 0)/(PHP_QPRINT_MAXL-9)) + 1),
      1),
    ReserveString
  );
  d = buffer = ret.mutableData();

  while (length--) {
    if (((c = *str++) == '\015') && (*str == '\012') && length > 0) {
      *d++ = '\015';
      *d++ = *str++;
      length--;
      lp = 0;
    } else {
      if (iscntrl (c) || (c == 0x7f) || (c & 0x80) ||
          (c == '=') || ((c == ' ') && (*str == '\015'))) {
        if ((((lp+= 3) > PHP_QPRINT_MAXL) && (c <= 0x7f))
            || ((c > 0x7f) && (c <= 0xdf) && ((lp + 3) > PHP_QPRINT_MAXL))
            || ((c > 0xdf) && (c <= 0xef) && ((lp + 6) > PHP_QPRINT_MAXL))
            || ((c > 0xef) && (c <= 0xf4) && ((lp + 9) > PHP_QPRINT_MAXL))) {
          *d++ = '=';
          *d++ = '\015';
          *d++ = '\012';
          lp = 3;
        }
        *d++ = '=';
        *d++ = hex[c >> 4];
        *d++ = hex[c & 0xf];
      } else {
        if ((++lp) > PHP_QPRINT_MAXL) {
          *d++ = '=';
          *d++ = '\015';
          *d++ = '\012';
          lp = 1;
        }
        *d++ = c;
      }
    }
  }
  len = d - buffer;

  ret.setSize(len);
  return ret;
}

String string_quoted_printable_decode(const char *input, int len, bool is_q) {
  assertx(input);
  if (len == 0) {
    return String();
  }

  int i = 0, j = 0, k;
  const char *str_in = input;
  String ret(len, ReserveString);
  char *str_out = ret.mutableData();
  while (i < len && str_in[i]) {
    switch (str_in[i]) {
    case '=':
      if (i + 2 < len && str_in[i + 1] && str_in[i + 2] &&
          isxdigit((int) str_in[i + 1]) && isxdigit((int) str_in[i + 2]))
        {
          str_out[j++] = (string_hex2int((int) str_in[i + 1]) << 4)
            + string_hex2int((int) str_in[i + 2]);
          i += 3;
        } else  /* check for soft line break according to RFC 2045*/ {
        k = 1;
        while (str_in[i + k] &&
               ((str_in[i + k] == 32) || (str_in[i + k] == 9))) {
          /* Possibly, skip spaces/tabs at the end of line */
          k++;
        }
        if (!str_in[i + k]) {
          /* End of line reached */
          i += k;
        }
        else if ((str_in[i + k] == 13) && (str_in[i + k + 1] == 10)) {
          /* CRLF */
          i += k + 2;
        }
        else if ((str_in[i + k] == 13) || (str_in[i + k] == 10)) {
          /* CR or LF */
          i += k + 1;
        }
        else {
          str_out[j++] = str_in[i++];
        }
      }
      break;
    case '_':
      if (is_q) {
        str_out[j++] = ' ';
        i++;
      } else {
        str_out[j++] = str_in[i++];
      }
      break;
    default:
      str_out[j++] = str_in[i++];
    }
  }
  ret.setSize(j);
  return ret;
}

Variant string_base_to_numeric(const char *s, int len, int base) {
  int64_t num = 0;
  double fnum = 0;
  int mode = 0;
  int64_t cutoff;
  int cutlim;

  assertx(string_validate_base(base));

  cutoff = LONG_MAX / base;
  cutlim = LONG_MAX % base;

  for (int i = len; i > 0; i--) {
    char c = *s++;

    /* might not work for EBCDIC */
    if (c >= '0' && c <= '9')
      c -= '0';
    else if (c >= 'A' && c <= 'Z')
      c -= 'A' - 10;
    else if (c >= 'a' && c <= 'z')
      c -= 'a' - 10;
    else
      continue;

    if (c >= base)
      continue;

    switch (mode) {
    case 0: /* Integer */
      if (num < cutoff || (num == cutoff && c <= cutlim)) {
        num = num * base + c;
        break;
      } else {
        fnum = num;
        mode = 1;
      }
      /* fall-through */
    case 1: /* Float */
      fnum = fnum * base + c;
    }
  }

  if (mode == 1) {
    return fnum;
  }
  return num;
}

String string_long_to_base(unsigned long value, int base) {
  static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
  char buf[(sizeof(unsigned long) << 3) + 1];
  char *ptr, *end;

  assertx(string_validate_base(base));

  end = ptr = buf + sizeof(buf) - 1;

  do {
    *--ptr = digits[value % base];
    value /= base;
  } while (ptr > buf && value);

  return String(ptr, end - ptr, CopyString);
}

String string_numeric_to_base(const Variant& value, int base) {
  static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";

  assertx(string_validate_base(base));
  if ((!value.isInteger() && !value.isDouble())) {
    return empty_string();
  }

  if (value.isDouble()) {
    double fvalue = floor(value.toDouble()); /* floor it just in case */
    char *ptr, *end;
    char buf[(sizeof(double) << 3) + 1];

    /* Don't try to convert +/- infinity */
    if (fvalue == HUGE_VAL || fvalue == -HUGE_VAL) {
      raise_warning("Number too large");
      return empty_string();
    }

    end = ptr = buf + sizeof(buf) - 1;

    do {
      *--ptr = digits[(int) fmod(fvalue, base)];
      fvalue /= base;
    } while (ptr > buf && fabs(fvalue) >= 1);

    return String(ptr, end - ptr, CopyString);
  }

  return string_long_to_base(value.toInt64(), base);
}

///////////////////////////////////////////////////////////////////////////////
// uuencode

#define PHP_UU_ENC(c) \
  ((c) ? ((c) & 077) + ' ' : '`')
#define PHP_UU_ENC_C2(c) \
  PHP_UU_ENC(((*(c) * 16) & 060) | ((*((c) + 1) >> 4) & 017))
#define PHP_UU_ENC_C3(c) \
  PHP_UU_ENC(((*(c + 1) * 4) & 074) | ((*((c) + 2) >> 6) & 03))
#define PHP_UU_DEC(c) \
  (((c) - ' ') & 077)

String string_uuencode(const char *src, int src_len) {
  assertx(src);
  assertx(src_len);

  int len = 45;
  char *p;
  const char *s, *e, *ee;
  char *dest;

  /* encoded length is ~ 38% greater than the original */
  String ret((int)ceil(src_len * 1.38) + 45, ReserveString);
  p = dest = ret.mutableData();
  s = src;
  e = src + src_len;

  while ((s + 3) < e) {
    ee = s + len;
    if (ee > e) {
      ee = e;
      len = ee - s;
      if (len % 3) {
        ee = s + (int) (floor(len / 3) * 3);
      }
    }
    *p++ = PHP_UU_ENC(len);

    while (s < ee) {
      *p++ = PHP_UU_ENC(*s >> 2);
      *p++ = PHP_UU_ENC_C2(s);
      *p++ = PHP_UU_ENC_C3(s);
      *p++ = PHP_UU_ENC(*(s + 2) & 077);

      s += 3;
    }

    if (len == 45) {
      *p++ = '\n';
    }
  }

  if (s < e) {
    if (len == 45) {
      *p++ = PHP_UU_ENC(e - s);
      len = 0;
    }

    *p++ = PHP_UU_ENC(*s >> 2);
    *p++ = PHP_UU_ENC_C2(s);
    *p++ = ((e - s) > 1) ? PHP_UU_ENC_C3(s) : PHP_UU_ENC('\0');
    *p++ = ((e - s) > 2) ? PHP_UU_ENC(*(s + 2) & 077) : PHP_UU_ENC('\0');
  }

  if (len < 45) {
    *p++ = '\n';
  }

  *p++ = PHP_UU_ENC('\0');
  *p++ = '\n';
  *p = '\0';

  ret.setSize(p - dest);
  return ret;
}

String string_uudecode(const char *src, int src_len) {
  int total_len = 0;
  int len;
  const char *s, *e, *ee;
  char *p, *dest;

  String ret(ceil(src_len * 0.75), ReserveString);
  p = dest = ret.mutableData();
  s = src;
  e = src + src_len;

  while (s < e) {
    if ((len = PHP_UU_DEC(*s++)) <= 0) {
      break;
    }
    /* sanity check */
    if (len > src_len) {
      goto err;
    }

    total_len += len;

    ee = s + (len == 45 ? 60 : (int) floor(len * 1.33));
    /* sanity check */
    if (ee > e) {
      goto err;
    }

    while (s < ee) {
      if (s + 4 > e) goto err;

      *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
      *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
      *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
      s += 4;
    }

    if (len < 45) {
      break;
    }

    /* skip \n */
    s++;
  }

  if ((len = total_len > (p - dest))) {
    *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
    if (len > 1) {
      *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
      if (len > 2) {
        *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
      }
    }
  }

  ret.setSize(total_len);
  return ret;

 err:
  return String();
}

///////////////////////////////////////////////////////////////////////////////
// base64

namespace {

const char base64_table[] = {
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', '\0'
};

const char base64_pad = '=';

const short base64_reverse_table[256] = {
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 63,
  52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2,
  -2,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2,
  -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
  41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
};

folly::Optional<int> maxEncodedSize(int length) {
  if ((length + 2) < 0 || ((length + 2) / 3) >= (1 << (sizeof(int) * 8 - 2))) {
    return folly::none;
  }
  return ((length + 2) / 3) * 4;
}

// outstr must be at least maxEncodedSize(length) bytes
size_t php_base64_encode(const unsigned char *str, int length,
                         unsigned char* outstr) {
  const unsigned char *current = str;
  unsigned char *p = outstr;

  while (length > 2) { /* keep going until we have less than 24 bits */
    *p++ = base64_table[current[0] >> 2];
    *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
    *p++ = base64_table[((current[1] & 0x0f) << 2) + (current[2] >> 6)];
    *p++ = base64_table[current[2] & 0x3f];

    current += 3;
    length -= 3; /* we just handle 3 octets of data */
  }

  /* now deal with the tail end of things */
  if (length != 0) {
    *p++ = base64_table[current[0] >> 2];
    if (length > 1) {
      *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
      *p++ = base64_table[(current[1] & 0x0f) << 2];
      *p++ = base64_pad;
    } else {
      *p++ = base64_table[(current[0] & 0x03) << 4];
      *p++ = base64_pad;
      *p++ = base64_pad;
    }
  }
  return p - outstr;
}

// outstr must be at least length bytes
ssize_t php_base64_decode(const char *str, int length, bool strict,
                         unsigned char* outstr) {
  const unsigned char *current = (unsigned char*)str;
  int ch, i = 0, j = 0, k;
  /* this sucks for threaded environments */

  unsigned char* result = outstr;

  /* run through the whole string, converting as we go */
  while ((ch = *current++) != '\0' && length-- > 0) {
    if (ch == base64_pad) {
      if (*current != '=' && ((i % 4) == 1 || (strict && length > 0))) {
        if ((i % 4) != 1) {
          while (isspace(*(++current))) {
            continue;
          }
          if (*current == '\0') {
            continue;
          }
        }
        return -1;
      }
      continue;
    }

    ch = base64_reverse_table[ch];
    if ((!strict && ch < 0) || ch == -1) {
      /* a space or some other separator character, we simply skip over */
      continue;
    } else if (ch == -2) {
      return -1;
    }

    switch(i % 4) {
    case 0:
      result[j] = ch << 2;
      break;
    case 1:
      result[j++] |= ch >> 4;
      result[j] = (ch & 0x0f) << 4;
      break;
    case 2:
      result[j++] |= ch >>2;
      result[j] = (ch & 0x03) << 6;
      break;
    case 3:
      result[j++] |= ch;
      break;
    }
    i++;
  }

  k = j;
  /* mop things up if we ended on a boundary */
  if (ch == base64_pad) {
    switch(i % 4) {
    case 1:
      return -1;
    case 2:
      k++;
    case 3:
      result[k] = 0;
    }
  }
  return j;
}

}

String string_base64_encode(const char* input, int len) {
  if (auto const wantedSize = maxEncodedSize(len)) {
    String ret(*wantedSize, ReserveString);
    auto actualSize = php_base64_encode((unsigned char*)input, len,
                                        (unsigned char*)ret.mutableData());
    ret.setSize(actualSize);
    return ret;
  }
  return String();
}

String string_base64_decode(const char* input, int len, bool strict) {
  String ret(len, ReserveString);
  auto actualSize = php_base64_decode(input, len, strict,
                                      (unsigned char*)ret.mutableData());
  if (actualSize < 0) return String();

  ret.setSize(actualSize);
  return ret;
}

std::string base64_encode(const char* input, int len) {
  if (auto const wantedSize = maxEncodedSize(len)) {
    std::string ret;
    ret.resize(*wantedSize);
    auto actualSize = php_base64_encode((unsigned char*)input, len,
                                        (unsigned char*)ret.data());
    ret.resize(actualSize);
    return ret;
  }
  return std::string();
}

std::string base64_decode(const char* input, int len, bool strict) {
  if (!len) return std::string();
  std::string ret;
  ret.resize(len);
  auto actualSize = php_base64_decode(input, len, strict,
                                      (unsigned char*)ret.data());
  if (!actualSize) return std::string();

  ret.resize(actualSize);
  return ret;
}

///////////////////////////////////////////////////////////////////////////////

String string_escape_shell_arg(const char *str) {
  int x, y, l;
  char *cmd;

  y = 0;
  l = strlen(str);

  String ret(safe_address(l, 4, 3), ReserveString); /* worst case */
  cmd = ret.mutableData();

#ifdef _MSC_VER
  cmd[y++] = '"';
#else
  cmd[y++] = '\'';
#endif

  for (x = 0; x < l; x++) {
    switch (str[x]) {
#ifdef _MSC_VER
    case '"':
    case '%':
    case '!':
      cmd[y++] = ' ';
      break;
#else
    case '\'':
      cmd[y++] = '\'';
      cmd[y++] = '\\';
      cmd[y++] = '\'';
#endif
      /* fall-through */
    default:
      cmd[y++] = str[x];
    }
  }
#ifdef _MSC_VER
  if (y > 0 && '\\' == cmd[y - 1]) {
    int k = 0, n = y - 1;
    for (; n >= 0 && '\\' == cmd[n]; n--, k++);
    if (k % 2) {
      cmd[y++] = '\\';
    }
  }

  cmd[y++] = '"';
#else
  cmd[y++] = '\'';
#endif
  ret.setSize(y);
  return ret;
}

String string_escape_shell_cmd(const char *str) {
  register int x, y, l;
  char *cmd;
  char *p = nullptr;

  l = strlen(str);
  String ret(safe_address(l, 2, 1), ReserveString);
  cmd = ret.mutableData();

  for (x = 0, y = 0; x < l; x++) {
    switch (str[x]) {
#ifndef _MSC_VER
    case '"':
    case '\'':
      if (!p && (p = (char *)memchr(str + x + 1, str[x], l - x - 1))) {
        /* noop */
      } else if (p && *p == str[x]) {
        p = nullptr;
      } else {
        cmd[y++] = '\\';
      }
      cmd[y++] = str[x];
      break;
#else
    /* % is Windows specific for environmental variables, ^%PATH% will
    output PATH while ^%PATH^% will not. escapeshellcmd->val will
    escape all % and !.
    */
    case '%':
    case '!':
    case '"':
    case '\'':
#endif
    case '#': /* This is character-set independent */
    case '&':
    case ';':
    case '`':
    case '|':
    case '*':
    case '?':
    case '~':
    case '<':
    case '>':
    case '^':
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
    case '$':
    case '\\':
    case '\x0A': /* excluding these two */
    case '\xFF':
#ifdef _MSC_VER
      cmd[y++] = '^';
#else
      cmd[y++] = '\\';
#endif
      /* fall-through */
    default:
      cmd[y++] = str[x];
    }
  }
  ret.setSize(y);
  return ret;
}

///////////////////////////////////////////////////////////////////////////////

static void string_similar_str(const char *txt1, int len1,
                               const char *txt2, int len2,
                               int *pos1, int *pos2, int *max) {
  const char *p, *q;
  const char *end1 = txt1 + len1;
  const char *end2 = txt2 + len2;
  int l;

  *max = 0;
  for (p = txt1; p < end1; p++) {
    for (q = txt2; q < end2; q++) {
      for (l = 0; (p + l < end1) && (q + l < end2) && (p[l] == q[l]); l++);
      if (l > *max) {
        *max = l;
        *pos1 = p - txt1;
        *pos2 = q - txt2;
      }
    }
  }
}

static int string_similar_char(const char *txt1, int len1,
                               const char *txt2, int len2) {
  int sum;
  int pos1 = 0, pos2 = 0, max;

  string_similar_str(txt1, len1, txt2, len2, &pos1, &pos2, &max);
  if ((sum = max)) {
    if (pos1 && pos2) {
      sum += string_similar_char(txt1, pos1, txt2, pos2);
    }
    if ((pos1 + max < len1) && (pos2 + max < len2)) {
      sum += string_similar_char(txt1 + pos1 + max, len1 - pos1 - max,
                                 txt2 + pos2 + max, len2 - pos2 - max);
    }
  }

  return sum;
}

int string_similar_text(const char *t1, int len1,
                        const char *t2, int len2, double *percent) {
  if (len1 == 0 && len2 == 0) {
    if (percent) *percent = 0.0;
    return 0;
  }

  int sim = string_similar_char(t1, len1, t2, len2);
  if (percent) *percent = sim * 200.0 / (len1 + len2);
  return sim;
}

///////////////////////////////////////////////////////////////////////////////

#define LEVENSHTEIN_MAX_LENTH 255

// reference implementation, only optimized for memory usage, not speed
int string_levenshtein(const char *s1, int l1, const char *s2, int l2,
                       int cost_ins, int cost_rep, int cost_del ) {
  int *p1, *p2, *tmp;
  int i1, i2, c0, c1, c2;

  if (l1==0) return l2*cost_ins;
  if (l2==0) return l1*cost_del;

  if ((l1>LEVENSHTEIN_MAX_LENTH)||(l2>LEVENSHTEIN_MAX_LENTH)) {
    raise_warning("levenshtein(): Argument string(s) too long");
    return -1;
  }

  p1 = (int*)req::malloc_noptrs((l2+1) * sizeof(int));
  SCOPE_EXIT { req::free(p1); };
  p2 = (int*)req::malloc_noptrs((l2+1) * sizeof(int));
  SCOPE_EXIT { req::free(p2); };

  for(i2=0;i2<=l2;i2++) {
    p1[i2] = i2*cost_ins;
  }

  for(i1=0;i1<l1;i1++) {
    p2[0]=p1[0]+cost_del;
    for(i2=0;i2<l2;i2++) {
      c0=p1[i2]+((s1[i1]==s2[i2])?0:cost_rep);
      c1=p1[i2+1]+cost_del; if (c1<c0) c0=c1;
      c2=p2[i2]+cost_ins; if (c2<c0) c0=c2;
      p2[i2+1]=c0;
    }
    tmp=p1; p1=p2; p2=tmp;
  }

  c0=p1[l2];
  return c0;
}

///////////////////////////////////////////////////////////////////////////////

String string_money_format(const char *format, double value) {
  bool check = false;
  const char *p = format;
  while ((p = strchr(p, '%'))) {
    if (*(p + 1) == '%') {
      p += 2;
    } else if (!check) {
      check = true;
      p++;
    } else {
      raise_invalid_argument_warning
        ("format: Only a single %%i or %%n token can be used");
      return String();
    }
  }

  int format_len = strlen(format);
  int str_len = safe_address(format_len, 1, 1024);
  String ret(str_len, ReserveString);
  char *str = ret.mutableData();
  if ((str_len = strfmon(str, str_len, format, value)) < 0) {
    return String();
  }
  ret.setSize(str_len);
  return ret;
}

///////////////////////////////////////////////////////////////////////////////

String string_number_format(double d, int dec,
                            const String& dec_point,
                            const String& thousand_sep) {
  char *tmpbuf = nullptr, *resbuf;
  char *s, *t;  /* source, target */
  char *dp;
  int integral;
  int tmplen, reslen=0;
  int count=0;
  int is_negative=0;

  if (d < 0) {
    is_negative = 1;
    d = -d;
  }

  if (dec < 0) dec = 0;
  d = php_math_round(d, dec);

  // departure from PHP: we got rid of dependencies on spprintf() here.
  // This actually means 63 bytes for characters + 1 byte for '\0'
  String tmpstr(63, ReserveString);
  tmpbuf = tmpstr.mutableData();
  tmplen = snprintf(tmpbuf, 64, "%.*F", dec, d);
  // From the man page of snprintf, the return value is:
  // The number of characters that would have been written if n had been
  // sufficiently large, not counting the terminating null character.
  if (tmplen < 0) return empty_string();
  if (tmplen < 64 && (tmpbuf == nullptr || !isdigit((int)tmpbuf[0]))) {
    tmpstr.setSize(tmplen);
    return tmpstr;
  }
  if (tmplen >= 64) {
    // Uncommon, asked for more than 64 chars worth of precision
    tmpstr = String(tmplen, ReserveString);
    tmpbuf = tmpstr.mutableData();
    tmplen = snprintf(tmpbuf, tmplen + 1, "%.*F", dec, d);
    if (tmplen < 0) return empty_string();
    if (tmpbuf == nullptr || !isdigit((int)tmpbuf[0])) {
      tmpstr.setSize(tmplen);
      return tmpstr;
    }
  }

  /* find decimal point, if expected */
  if (dec) {
    dp = strpbrk(tmpbuf, ".,");
  } else {
    dp = nullptr;
  }

  /* calculate the length of the return buffer */
  if (dp) {
    integral = dp - tmpbuf;
  } else {
    /* no decimal point was found */
    integral = tmplen;
  }

  /* allow for thousand separators */
  if (!thousand_sep.empty()) {
    if (integral + thousand_sep.size() * ((integral-1) / 3) < integral) {
      /* overflow */
      raise_error("String overflow");
    }

    integral += ((integral-1) / 3) * thousand_sep.size();
  }

  reslen = integral;

  if (dec) {
    reslen += dec;

    if (!dec_point.empty()) {
      if (reslen + dec_point.size() < dec_point.size()) {
        /* overflow */
        raise_error("String overflow");
      }
      reslen += dec_point.size();
    }
  }

  /* add a byte for minus sign */
  if (is_negative) {
    reslen++;
  }
  String resstr(reslen, ReserveString);
  resbuf = resstr.mutableData();

  s = tmpbuf+tmplen-1;
  t = resbuf+reslen-1;

  /* copy the decimal places.
   * Take care, as the sprintf implementation may return less places than
   * we requested due to internal buffer limitations */
  if (dec) {
    int declen = dp ? s - dp : 0;
    int topad = dec > declen ? dec - declen : 0;

    /* pad with '0's */
    while (topad--) {
      *t-- = '0';
    }

    if (dp) {
      s -= declen + 1; /* +1 to skip the point */
      t -= declen;

      /* now copy the chars after the point */
      memcpy(t + 1, dp + 1, declen);
    }

    /* add decimal point */
    if (!dec_point.empty()) {
      memcpy(t + (1 - dec_point.size()), dec_point.data(), dec_point.size());
      t -= dec_point.size();
    }
  }

  /* copy the numbers before the decimal point, adding thousand
   * separator every three digits */
  while(s >= tmpbuf) {
    *t-- = *s--;
    if (thousand_sep && (++count%3)==0 && s>=tmpbuf) {
      memcpy(t + (1 - thousand_sep.size()),
             thousand_sep.data(),
             thousand_sep.size());
      t -= thousand_sep.size();
    }
  }

  /* and a minus sign, if needed */
  if (is_negative) {
    *t-- = '-';
  }

  resstr.setSize(reslen);
  return resstr;
}

///////////////////////////////////////////////////////////////////////////////
// soundex

/* Simple soundex algorithm as described by Knuth in TAOCP, vol 3 */
String string_soundex(const String& str) {
  assertx(!str.empty());
  int _small, code, last;
  String retString(4, ReserveString);
  char* soundex = retString.mutableData();

  static char soundex_table[26] = {
    0,              /* A */
    '1',            /* B */
    '2',            /* C */
    '3',            /* D */
    0,              /* E */
    '1',            /* F */
    '2',            /* G */
    0,              /* H */
    0,              /* I */
    '2',            /* J */
    '2',            /* K */
    '4',            /* L */
    '5',            /* M */
    '5',            /* N */
    0,              /* O */
    '1',            /* P */
    '2',            /* Q */
    '6',            /* R */
    '2',            /* S */
    '3',            /* T */
    0,              /* U */
    '1',            /* V */
    0,              /* W */
    '2',            /* X */
    0,              /* Y */
    '2'             /* Z */
  };

  /* build soundex string */
  last = -1;
  auto p = str.slice().data();
  for (_small = 0; *p && _small < 4; p++) {
    /* convert chars to upper case and strip non-letter chars */
    /* BUG: should also map here accented letters used in non */
    /* English words or names (also found in English text!): */
    /* esstsett, thorn, n-tilde, c-cedilla, s-caron, ... */
    code = toupper((int)(unsigned char)(*p));
    if (code >= 'A' && code <= 'Z') {
      if (_small == 0) {
        /* remember first valid char */
        soundex[_small++] = code;
        last = soundex_table[code - 'A'];
      } else {
        /* ignore sequences of consonants with same soundex */
        /* code in trail, and vowels unless they separate */
        /* consonant letters */
        code = soundex_table[code - 'A'];
        if (code != last) {
          if (code != 0) {
            soundex[_small++] = code;
          }
          last = code;
        }
      }
    }
  }
  /* pad with '0' and terminate with 0 ;-) */
  while (_small < 4) {
    soundex[_small++] = '0';
  }
  retString.setSize(4);
  return retString;
}

///////////////////////////////////////////////////////////////////////////////
// metaphone

/**
 * this is now the original code by Michael G Schwern:
 * i've changed it just a slightly bit (use emalloc,
 * get rid of includes etc)
 * - thies - 13.09.1999
 */

/*-----------------------------  */
/* this used to be "metaphone.h" */
/*-----------------------------  */

/* Special encodings */
#define  SH   'X'
#define  TH   '0'

/*-----------------------------  */
/* end of "metaphone.h"          */
/*-----------------------------  */

/*----------------------------- */
/* this used to be "metachar.h" */
/*----------------------------- */

/* Metachar.h ... little bits about characters for metaphone */
/*-- Character encoding array & accessing macros --*/
/* Stolen directly out of the book... */
char _codes[26] = { 1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0};

#define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0)

#define isvowel(c)  (ENCODE(c) & 1)    /* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2)    /* FJMNR */

/* These form dipthongs when preceding H */
#define AFFECTH(c)  (ENCODE(c) & 4)    /* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8)    /* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)  (ENCODE(c) & 16)  /* BDH */

/*----------------------------- */
/* end of "metachar.h"          */
/*----------------------------- */

/* I suppose I could have been using a character pointer instead of
 * accesssing the array directly... */

/* Look at the next letter in the word */
#define Next_Letter ((char)toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter ((char)toupper(word[w_idx]))
/* Go N letters back. */
#define Look_Back_Letter(n)  (w_idx >= n ? (char)toupper(word[w_idx-n]) : '\0')
/* Previous letter.  I dunno, should this return null on failure? */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter  (Next_Letter != '\0' ? (char)toupper(word[w_idx+2]) \
                           : '\0')
#define Look_Ahead_Letter(n) ((char)toupper(Lookahead(word+w_idx, n)))

/* Allows us to safely look ahead an arbitrary # of letters */
/* I probably could have just used strlen... */
static char Lookahead(unsigned char *word, int how_far) {
  char letter_ahead = '\0';  /* null by default */
  int idx;
  for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
  /* Edge forward in the string... */

  letter_ahead = (char)word[idx];  /* idx will be either == to how_far or
                                    * at the end of the string
                                    */
  return letter_ahead;
}

/* phonize one letter
 * We don't know the buffers size in advance. On way to solve this is to just
 * re-allocate the buffer size. We're using an extra of 2 characters (this
 * could be one though; or more too). */
#define Phonize(c)  { buffer.append(c); }
/* How long is the phoned word? */
#define Phone_Len  (buffer.size())

/* Note is a letter is a 'break' in the word */
#define Isbreak(c)  (!isalpha(c))

String string_metaphone(const char *input, int word_len, long max_phonemes,
                        int traditional) {
  unsigned char *word = (unsigned char *)input;

  int w_idx = 0;        /* point in the phonization we're at. */
  int max_buffer_len = 0;    /* maximum length of the destination buffer */

  /*-- Parameter checks --*/
  /* Negative phoneme length is meaningless */

  if (max_phonemes < 0)
    return String();

  /* Empty/null string is meaningless */
  /* Overly paranoid */
  /* always_assert(word != NULL && word[0] != '\0'); */

  if (word == nullptr)
    return String();

  /*-- Allocate memory for our phoned_phrase --*/
  if (max_phonemes == 0) {  /* Assume largest possible */
    max_buffer_len = word_len;
  } else {
    max_buffer_len = max_phonemes;
  }
  StringBuffer buffer(max_buffer_len);

  /*-- The first phoneme has to be processed specially. --*/
  /* Find our first letter */
  for (; !isalpha(Curr_Letter); w_idx++) {
    /* On the off chance we were given nothing but crap... */
    if (Curr_Letter == '\0') {
      return buffer.detach();  /* For testing */
    }
  }

  switch (Curr_Letter) {
    /* AE becomes E */
  case 'A':
    if (Next_Letter == 'E') {
      Phonize('E');
      w_idx += 2;
    }
    /* Remember, preserve vowels at the beginning */
    else {
      Phonize('A');
      w_idx++;
    }
    break;
    /* [GKP]N becomes N */
  case 'G':
  case 'K':
  case 'P':
    if (Next_Letter == 'N') {
      Phonize('N');
      w_idx += 2;
    }…
Large files files are truncated, but you can click here to view the full file