/wyszukiwanie/soundex.c
C | 449 lines | 225 code | 63 blank | 161 comment | 135 complexity | 8e0e96fbcc053f7cf53d2a042afc1953 MD5 | raw file
- /*
- * v 1.0d TESTED-OK 20060308
- * -----------------------
- *
- * The following SoundEx function is:
- *
- * (C) Copyright 2002 - 2006, Creativyst, Inc.
- * ALL RIGHTS RESERVED
- *
- * For more information go to:
- * http://www.Creativyst.com
- * or email:
- * Support@Creativyst.com
- *
- * Redistribution and use in source and binary
- * forms, with or without modification, are
- * permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must
- * retain the above copyright notice, this
- * list of conditions and the following
- * disclaimer.
- *
- * 2. Redistributions in binary form must
- * reproduce the above copyright notice,
- * this list of conditions and the
- * following disclaimer in the
- * documentation and/or other materials
- * provided with the distribution.
- *
- * 3. All advertising materials mentioning
- * features or use of this software must
- * display the following acknowledgement:
- * This product includes software developed
- * by Creativyst, Inc.
- *
- * 4. The name of Creativyst, Inc. may not be
- * used to endorse or promote products
- * derived from this software without
- * specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY CREATIVYST CORPORATION
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
- * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
- * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * ------------------
- * ------------------
- * FUNCTION NOTES:
- * 1. To avoid all possibility of overwrites make
- * sure *SoundEx points to a buffer with at least
- * 11 bytes of storage.
- *
- * 2. This function is for 7/8-bit ASCII characters.
- * Modifications are required for UTF16/32, or for
- * anything other than the first 7-bits of utf-8.
- *
- * 3. For those embedded guys who will understand this:
- * This is a true library-grade (i.e. re-usable) function,
- * meaning it has no dependencies on outside functions
- * and requires no non-standard libraries be linked in
- * order for it to work. In this case, since it doesn't
- * even require the standard C library, it is what C99
- * (I think) calls a: strictly conforming freestanding
- * function.
- *
- */
/*
 * SoundEx() - build a SoundEx phonetic code for a single word.
 *
 * Parameters:
 *   SoundEx       Output buffer. Receives the NUL-terminated code.
 *                 Up to 10 code characters plus the terminator are
 *                 written, so it must hold at least 11 bytes.
 *   WordString    NUL-terminated input word. Only the first 31
 *                 characters are examined. Lower-case letters are
 *                 folded to upper-case; every character outside
 *                 A-Z is treated as a space (a separator).
 *   LengthOption  Requested code length. Values are clamped to the
 *                 range 4..10 (0 therefore means 4).
 *   CensusOption  0:     enhanced mode - the multi-letter rewrites
 *                        (PS, PF, DG, GH, KN, GN, MB, PH, TCH,
 *                        MPS/MPT/MPZ) are applied first.
 *                 1:     census mode - length forced to 4, no
 *                        multi-letter rewrites, and H/W after the
 *                        first letter are dropped BEFORE adjacent
 *                        duplicate codes are collapsed.
 *                 other: length forced to 4, no multi-letter
 *                        rewrites, H/W are coded like vowels.
 *
 * Returns:
 *   The length of the code written (4..10), or 0 when no code could
 *   be produced: SoundEx is a null pointer, WordString is a null
 *   pointer, or WordString contains no letters. In the latter two
 *   cases SoundEx is set to the empty string.
 *
 * The function deliberately calls no library routines so that it
 * stays usable in freestanding environments.
 */
int SoundEx(char *SoundEx,
            char *WordString,
            int LengthOption,
            int CensusOption)
{
    enum { InSz = 31 };             /* max input characters examined */
    char WordStr[InSz + 1];         /* working copy, plus terminator */
    int SoundExLen, WSLen, i;
    char FirstLetter, c, *p, *p2;

    /* Nothing can be reported without an output buffer; bail out
     * before the first write through the pointer.
     */
    if(!SoundEx) {
        return(0);
    }
    SoundEx[0] = 0;

    /* Work out the code length: census modes always use 4,
     * everything else is clamped to 4..10.
     */
    if(CensusOption) {
        LengthOption = 4;
    }
    SoundExLen = LengthOption;
    if(SoundExLen > 10) {
        SoundExLen = 10;
    }
    if(SoundExLen < 4) {
        SoundExLen = 4;
    }

    if(!WordString) {
        return(0);
    }

    /* Single bounded pass over the input: copy at most InSz
     * characters, fold to upper-case, turn every non-letter into
     * a space and skip spaces that would lead the word.
     */
    WSLen = 0;
    for(i = 0;i < InSz && WordString[i];i++) {
        c = WordString[i];
        if(c >= 'a' && c <= 'z') {
            c = (char)(c - 0x20);
        }
        if(c < 'A' || c > 'Z') {
            c = ' ';
        }
        if(c == ' ' && !WSLen) {
            continue;
        }
        WordStr[WSLen] = c;
        WSLen++;
    }

    /* Remove trailing spaces. The last character lives at index
     * WSLen - 1; index WSLen is where the terminator goes.
     */
    while(WSLen && WordStr[WSLen - 1] == ' ') {
        WSLen--;
    }
    WordStr[WSLen] = 0;

    if(!WSLen) {
        return(0);
    }

    /* Multi-letter improvements (enhanced mode only).
     * A dropped letter is overwritten with an underscore
     * placeholder, which is squeezed out afterwards.
     * Reading WordStr[i+1] / WordStr[i+2] is safe: the look-ahead
     * stops at the terminator because of short-circuit evaluation.
     */
    if(!CensusOption) {
        if(WordStr[0] == 'P' && (WordStr[1] == 'S' || WordStr[1] == 'F')) {
            WordStr[0] = '_';
        }
        for(i = 0;i < WSLen;i++) {
            if(WordStr[i] == 'D' && WordStr[i+1] == 'G') {
                WordStr[i] = '_';
                i++;
                continue;
            }
            if(WordStr[i] == 'G' && WordStr[i+1] == 'H') {
                WordStr[i] = '_';
                i++;
                continue;
            }
            if(WordStr[i] == 'K' && WordStr[i+1] == 'N') {
                WordStr[i] = '_';
                i++;
                continue;
            }
            if(WordStr[i] == 'G' && WordStr[i+1] == 'N') {
                WordStr[i] = '_';
                i++;
                continue;
            }
            if(WordStr[i] == 'M' && WordStr[i+1] == 'B') {
                WordStr[i+1] = '_';
                i++;
                continue;
            }
            if(WordStr[i] == 'P' && WordStr[i+1] == 'H') {
                WordStr[i] = 'F';
                WordStr[i+1] = '_';
                i++;
                continue;
            }
            if(WordStr[i] == 'T' &&
               WordStr[i+1] == 'C' &&
               WordStr[i+2] == 'H') {
                WordStr[i] = '_';
                i += 2;
                continue;
            }
            if(WordStr[i] == 'M' && WordStr[i+1] == 'P' &&
               (WordStr[i+2] == 'S' ||
                WordStr[i+2] == 'T' ||
                WordStr[i+2] == 'Z')) {
                WordStr[i+1] = '_';
                i++;
            }
        }
    }

    /* Squeeze out the underscore placeholders. */
    for(p = p2 = WordStr;(*p);p++) {
        (*p2) = (*p);
        if((*p2) != '_') {
            p2++;
        }
    }
    (*p2) = 0;

    /* Taken AFTER the multi-letter rewrites, since those can
     * change which letter comes first.
     */
    FirstLetter = WordStr[0];

    /* A leading H or W must not be coded like a vowel, otherwise
     * it would merge with a following vowel code; park a neutral
     * marker in its slot (the slot is overwritten with
     * FirstLetter at the end).
     */
    if(FirstLetter == 'H' || FirstLetter == 'W') {
        WordStr[0] = '-';
    }

    /* Census mode: mark H and W after the first letter so they
     * can be removed before adjacent duplicates are collapsed
     * (unlike real vowels, they do not separate equal codes).
     */
    if(CensusOption == 1) {
        for(p = &WordStr[1];(*p);p++) {
            if((*p) == 'H' || (*p) == 'W') {
                (*p) = '.';
            }
        }
    }

    /* Classic SoundEx letter-to-digit replacement. Spaces and
     * the '-' / '.' markers pass through untouched.
     */
    for(p = WordStr;(*p);p++) {
        switch(*p) {
        case 'A': case 'E': case 'I': case 'O':
        case 'U': case 'Y': case 'H': case 'W':
            (*p) = '0';
            break;
        case 'B': case 'P': case 'F': case 'V':
            (*p) = '1';
            break;
        case 'C': case 'S': case 'G': case 'J':
        case 'K': case 'Q': case 'X': case 'Z':
            (*p) = '2';
            break;
        case 'D': case 'T':
            (*p) = '3';
            break;
        case 'L':
            (*p) = '4';
            break;
        case 'M': case 'N':
            (*p) = '5';
            break;
        case 'R':
            (*p) = '6';
            break;
        default:
            break;
        }
    }

    /* Census mode: squeeze out the H/W markers. */
    if(CensusOption == 1) {
        for(p = p2 = &WordStr[1];(*p);p++) {
            (*p2) = (*p);
            if((*p2) != '.') {
                p2++;
            }
        }
        (*p2) = 0;
    }

    /* Collapse runs of equal adjacent characters to a single one
     * (the first position takes part in the comparison).
     */
    for(p = p2 = WordStr;(*p);p++) {
        (*p2) = (*p);
        if((*p2) != p[1]) {
            p2++;
        }
    }
    (*p2) = 0;

    /* Squeeze out spaces and zeros, leaving slot 0 alone; it is
     * replaced by FirstLetter below.
     */
    for(p = p2 = &WordStr[1];(*p);p++) {
        (*p2) = (*p);
        if((*p2) != ' ' && (*p2) != '0') {
            p2++;
        }
    }
    (*p2) = 0;

    /* Right-pad with '0' up to the requested length, then cut
     * to exactly that length.
     */
    for(WSLen = 0;WordStr[WSLen];WSLen++) {
        ;
    }
    for(i = WSLen;i < SoundExLen;i++) {
        WordStr[i] = '0';
    }
    WordStr[SoundExLen] = 0;

    /* Put the real first letter back in front of the digits. */
    WordStr[0] = FirstLetter;

    /* Copy the finished code to the caller's buffer. */
    for(p2 = SoundEx,p = WordStr;(*p);p++,p2++) {
        (*p2) = (*p);
    }
    (*p2) = 0;

    return(SoundExLen);
}