ext/standard/metaphone.c

/* [<][>][^][v][top][bottom][index][help] */
This source file includes the following definitions:
PHP_FUNCTION
Lookahead
metaphone
   1 /*
   2    +----------------------------------------------------------------------+
   3    | PHP Version 5                                                        |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 1997-2016 The PHP Group                                |
   6    +----------------------------------------------------------------------+
   7    | This source file is subject to version 3.01 of the PHP license,      |
   8    | that is bundled with this package in the file LICENSE, and is        |
   9    | available through the world-wide-web at the following url:           |
  10    | http://www.php.net/license/3_01.txt                                  |
  11    | If you did not receive a copy of the PHP license and are unable to   |
  12    | obtain it through the world-wide-web, please send a note to          |
  13    | license@php.net so we can mail you a copy immediately.               |
  14    +----------------------------------------------------------------------+
  15    | Author: Thies C. Arntzen <thies@thieso.net>                          |
  16    +----------------------------------------------------------------------+
  17 */
  18 
  19 /* $Id$ */
  20 
  21 /*
  22         Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com> 
  23 */
  24 
  25 #include "php.h"
  26 #include "php_metaphone.h"
  27 
  28 static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional);
  29 
  30 /* {{{ proto string metaphone(string text[, int phones])
  31    Break english phrases down into their phonemes */
  32 PHP_FUNCTION(metaphone)
  33 {
  34         char *str;
  35         char *result = 0;
  36         int str_len;
  37         long phones = 0;
  38 
  39         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len,
  40                                                           &phones) == FAILURE) {
  41                 return;
  42         }
  43 
  44         if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) {
  45                 RETVAL_STRING(result, 0);
  46         } else {
  47                 if (result) {
  48                         efree(result);
  49                 }
  50                 RETURN_FALSE;
  51         }
  52 }
  53 /* }}} */
  54 
/*
   What follows is the original code by Michael G Schwern.
   I've changed it only slightly (use emalloc,
   get rid of includes, etc.)
        - thies - 13.09.1999
*/
  61 
  62 /*-----------------------------  */
  63 /* this used to be "metaphone.h" */
  64 /*-----------------------------  */
  65 
  66 /* Special encodings */
  67 #define  SH     'X'
  68 #define  TH             '0'
  69 
  70 /*-----------------------------  */
  71 /* end of "metaphone.h"          */
  72 /*-----------------------------  */
  73 
  74 /*----------------------------- */
  75 /* this used to be "metachar.h" */
  76 /*----------------------------- */
  77 
  78 /* Metachar.h ... little bits about characters for metaphone */
  79 /*-- Character encoding array & accessing macros --*/
  80 /* Stolen directly out of the book... */
  81 char _codes[26] =
  82 {
  83         1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
  84 /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
  85 };
  86 
  87 
  88 #define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0)
  89 
  90 #define isvowel(c)  (ENCODE(c) & 1)             /* AEIOU */
  91 
  92 /* These letters are passed through unchanged */
  93 #define NOCHANGE(c) (ENCODE(c) & 2)             /* FJMNR */
  94 
  95 /* These form dipthongs when preceding H */
  96 #define AFFECTH(c)  (ENCODE(c) & 4)             /* CGPST */
  97 
  98 /* These make C and G soft */
  99 #define MAKESOFT(c) (ENCODE(c) & 8)             /* EIY */
 100 
 101 /* These prevent GH from becoming F */
 102 #define NOGHTOF(c)  (ENCODE(c) & 16)    /* BDH */
 103 
 104 /*----------------------------- */
 105 /* end of "metachar.h"          */
 106 /*----------------------------- */
 107 
/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */
 110 
 111 /* Look at the next letter in the word */
 112 #define Next_Letter (toupper(word[w_idx+1]))
 113 /* Look at the current letter in the word */
 114 #define Curr_Letter (toupper(word[w_idx]))
 115 /* Go N letters back. */
 116 #define Look_Back_Letter(n)     (w_idx >= n ? toupper(word[w_idx-n]) : '\0')
 117 /* Previous letter.  I dunno, should this return null on failure? */
 118 #define Prev_Letter (Look_Back_Letter(1))
 119 /* Look two letters down.  It makes sure you don't walk off the string. */
 120 #define After_Next_Letter       (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
 121                                                                                              : '\0')
 122 #define Look_Ahead_Letter(n) (toupper(Lookahead(word+w_idx, n)))
 123 
 124 
 125 /* Allows us to safely look ahead an arbitrary # of letters */
 126 /* I probably could have just used strlen... */
 127 static char Lookahead(char *word, int how_far)
 128 {
 129         char letter_ahead = '\0';       /* null by default */
 130         int idx;
 131         for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
 132         /* Edge forward in the string... */
 133 
 134         letter_ahead = word[idx];       /* idx will be either == to how_far or
 135                                                                  * at the end of the string
 136                                                                  */
 137         return letter_ahead;
 138 }
 139 
 140 
 141 /* phonize one letter
 142  * We don't know the buffers size in advance. On way to solve this is to just
 143  * re-allocate the buffer size. We're using an extra of 2 characters (this
 144  * could be one though; or more too). */
 145 #define Phonize(c)      { \
 146                                                 if (p_idx >= max_buffer_len) { \
 147                                                         *phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \
 148                                                         max_buffer_len += 2; \
 149                                                 } \
 150                                                 (*phoned_word)[p_idx++] = c; \
 151                                         }
 152 /* Slap a null character on the end of the phoned word */
 153 #define End_Phoned_Word { \
 154                                                         if (p_idx == max_buffer_len) { \
 155                                                                 *phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \
 156                                                         } \
 157                                                         (*phoned_word)[p_idx] = '\0'; \
 158                                                 }
 159 /* How long is the phoned word? */
 160 #define Phone_Len       (p_idx)
 161 
 162 /* Note is a letter is a 'break' in the word */
 163 #define Isbreak(c)  (!isalpha(c))
 164 
 165 /* {{{ metaphone
 166  */
 167 static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional)
 168 {
 169         int w_idx = 0;                          /* point in the phonization we're at. */
 170         int p_idx = 0;                          /* end of the phoned phrase */
 171         int max_buffer_len = 0;         /* maximum length of the destination buffer */
 172 
 173 /*-- Parameter checks --*/
 174         /* Negative phoneme length is meaningless */
 175 
 176         if (max_phonemes < 0)
 177                 return -1;
 178 
 179         /* Empty/null string is meaningless */
 180         /* Overly paranoid */
 181         /* assert(word != NULL && word[0] != '\0'); */
 182 
 183         if (word == NULL)
 184                 return -1;
 185 
 186 /*-- Allocate memory for our phoned_phrase --*/
 187         if (max_phonemes == 0) {        /* Assume largest possible */
 188                 max_buffer_len = word_len;
 189                 *phoned_word = safe_emalloc(sizeof(char), word_len, 1);
 190         } else {
 191                 max_buffer_len = max_phonemes;
 192                 *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1);
 193         }
 194 
 195 
 196 /*-- The first phoneme has to be processed specially. --*/
 197         /* Find our first letter */
 198         for (; !isalpha(Curr_Letter); w_idx++) {
 199                 /* On the off chance we were given nothing but crap... */
 200                 if (Curr_Letter == '\0') {
 201                         End_Phoned_Word
 202                                 return SUCCESS; /* For testing */
 203                 }
 204         }
 205 
 206         switch (Curr_Letter) {
 207                 /* AE becomes E */
 208         case 'A':
 209                 if (Next_Letter == 'E') {
 210                         Phonize('E');
 211                         w_idx += 2;
 212                 }
 213                 /* Remember, preserve vowels at the beginning */
 214                 else {
 215                         Phonize('A');
 216                         w_idx++;
 217                 }
 218                 break;
 219                 /* [GKP]N becomes N */
 220         case 'G':
 221         case 'K':
 222         case 'P':
 223                 if (Next_Letter == 'N') {
 224                         Phonize('N');
 225                         w_idx += 2;
 226                 }
 227                 break;
 228                 /* WH becomes W, 
 229                    WR becomes R 
 230                    W if followed by a vowel */
 231         case 'W':
 232                 if (Next_Letter == 'R') {
 233                         Phonize(Next_Letter);
 234                         w_idx += 2;
 235                 } else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
 236                         Phonize('W');
 237                         w_idx += 2;
 238                 }
 239                 /* else ignore */
 240                 break;
 241                 /* X becomes S */
 242         case 'X':
 243                 Phonize('S');
 244                 w_idx++;
 245                 break;
 246                 /* Vowels are kept */
 247                 /* We did A already
 248                    case 'A':
 249                    case 'a':
 250                  */
 251         case 'E':
 252         case 'I':
 253         case 'O':
 254         case 'U':
 255                 Phonize(Curr_Letter);
 256                 w_idx++;
 257                 break;
 258         default:
 259                 /* do nothing */
 260                 break;
 261         }
 262 
 263 
 264 
 265         /* On to the metaphoning */
 266         for (; Curr_Letter != '\0' &&
 267                  (max_phonemes == 0 || Phone_Len < max_phonemes);
 268                  w_idx++) {
 269                 /* How many letters to skip because an eariler encoding handled     
 270                  * multiple letters */
 271                 unsigned short int skip_letter = 0;
 272 
 273 
 274                 /* THOUGHT:  It would be nice if, rather than having things like...
 275                  * well, SCI.  For SCI you encode the S, then have to remember
 276                  * to skip the C.  So the phonome SCI invades both S and C.  It would
 277                  * be better, IMHO, to skip the C from the S part of the encoding.
 278                  * Hell, I'm trying it.
 279                  */
 280 
 281                 /* Ignore non-alphas */
 282                 if (!isalpha(Curr_Letter))
 283                         continue;
 284 
 285                 /* Drop duplicates, except CC */
 286                 if (Curr_Letter == Prev_Letter &&
 287                         Curr_Letter != 'C')
 288                         continue;
 289 
 290                 switch (Curr_Letter) {
 291                         /* B -> B unless in MB */
 292                 case 'B':
 293                         if (Prev_Letter != 'M')
 294                                 Phonize('B');
 295                         break;
 296                         /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
 297                          * (SCHW is handled in S)
 298                          *  S if -CI-, -CE- or -CY-
 299                          *  dropped if -SCI-, SCE-, -SCY- (handed in S)
 300                          *  else K
 301                          */
 302                 case 'C':
 303                         if (MAKESOFT(Next_Letter)) {    /* C[IEY] */
 304                                 if (After_Next_Letter == 'A' &&
 305                                         Next_Letter == 'I') {   /* CIA */
 306                                         Phonize(SH);
 307                                 }
 308                                 /* SC[IEY] */
 309                                 else if (Prev_Letter == 'S') {
 310                                         /* Dropped */
 311                                 } else {
 312                                         Phonize('S');
 313                                 }
 314                         } else if (Next_Letter == 'H') {
 315                                 if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) {       /* Christ, School */
 316                                         Phonize('K');
 317                                 } else {
 318                                         Phonize(SH);
 319                                 }
 320                                 skip_letter++;
 321                         } else {
 322                                 Phonize('K');
 323                         }
 324                         break;
 325                         /* J if in -DGE-, -DGI- or -DGY-
 326                          * else T
 327                          */
 328                 case 'D':
 329                         if (Next_Letter == 'G' &&
 330                                 MAKESOFT(After_Next_Letter)) {
 331                                 Phonize('J');
 332                                 skip_letter++;
 333                         } else
 334                                 Phonize('T');
 335                         break;
 336                         /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
 337                          * else dropped if -GNED, -GN, 
 338                          * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
 339                          * else J if in -GE-, -GI, -GY and not GG
 340                          * else K
 341                          */
 342                 case 'G':
 343                         if (Next_Letter == 'H') {
 344                                 if (!(NOGHTOF(Look_Back_Letter(3)) ||
 345                                           Look_Back_Letter(4) == 'H')) {
 346                                         Phonize('F');
 347                                         skip_letter++;
 348                                 } else {
 349                                         /* silent */
 350                                 }
 351                         } else if (Next_Letter == 'N') {
 352                                 if (Isbreak(After_Next_Letter) ||
 353                                         (After_Next_Letter == 'E' &&
 354                                          Look_Ahead_Letter(3) == 'D')) {
 355                                         /* dropped */
 356                                 } else
 357                                         Phonize('K');
 358                         } else if (MAKESOFT(Next_Letter) &&
 359                                            Prev_Letter != 'G') {
 360                                 Phonize('J');
 361                         } else {
 362                                 Phonize('K');
 363                         }
 364                         break;
 365                         /* H if before a vowel and not after C,G,P,S,T */
 366                 case 'H':
 367                         if (isvowel(Next_Letter) &&
 368                                 !AFFECTH(Prev_Letter))
 369                                 Phonize('H');
 370                         break;
 371                         /* dropped if after C
 372                          * else K
 373                          */
 374                 case 'K':
 375                         if (Prev_Letter != 'C')
 376                                 Phonize('K');
 377                         break;
 378                         /* F if before H
 379                          * else P
 380                          */
 381                 case 'P':
 382                         if (Next_Letter == 'H') {
 383                                 Phonize('F');
 384                         } else {
 385                                 Phonize('P');
 386                         }
 387                         break;
 388                         /* K
 389                          */
 390                 case 'Q':
 391                         Phonize('K');
 392                         break;
 393                         /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
 394                          * else S
 395                          */
 396                 case 'S':
 397                         if (Next_Letter == 'I' &&
 398                                 (After_Next_Letter == 'O' ||
 399                                  After_Next_Letter == 'A')) {
 400                                 Phonize(SH);
 401                         } else if (Next_Letter == 'H') {
 402                                 Phonize(SH);
 403                                 skip_letter++;
 404                         } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
 405                                 Phonize(SH);
 406                                 skip_letter += 2;
 407                         } else {
 408                                 Phonize('S');
 409                         }
 410                         break;
 411                         /* 'sh' in -TIA- or -TIO-
 412                          * else 'th' before H
 413                          * else T
 414                          */
 415                 case 'T':
 416                         if (Next_Letter == 'I' &&
 417                                 (After_Next_Letter == 'O' ||
 418                                  After_Next_Letter == 'A')) {
 419                                 Phonize(SH);
 420                         } else if (Next_Letter == 'H') {
 421                                 Phonize(TH);
 422                                 skip_letter++;
 423                         } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
 424                                 Phonize('T');
 425                         }
 426                         break;
 427                         /* F */
 428                 case 'V':
 429                         Phonize('F');
 430                         break;
 431                         /* W before a vowel, else dropped */
 432                 case 'W':
 433                         if (isvowel(Next_Letter))
 434                                 Phonize('W');
 435                         break;
 436                         /* KS */
 437                 case 'X':
 438                         Phonize('K');
 439                         Phonize('S');
 440                         break;
 441                         /* Y if followed by a vowel */
 442                 case 'Y':
 443                         if (isvowel(Next_Letter))
 444                                 Phonize('Y');
 445                         break;
 446                         /* S */
 447                 case 'Z':
 448                         Phonize('S');
 449                         break;
 450                         /* No transformation */
 451                 case 'F':
 452                 case 'J':
 453                 case 'L':
 454                 case 'M':
 455                 case 'N':
 456                 case 'R':
 457                         Phonize(Curr_Letter);
 458                         break;
 459                 default:
 460                         /* nothing */
 461                         break;
 462                 }                                               /* END SWITCH */
 463 
 464                 w_idx += skip_letter;
 465         }                                                       /* END FOR */
 466 
 467         End_Phoned_Word;
 468 
 469         return 0;
 470 }                                                               /* END metaphone */
 471 /* }}} */
 472 
 473 /*
 474  * Local variables:
 475  * tab-width: 4
 476  * c-basic-offset: 4
 477  * End:
 478  * vim600: sw=4 ts=4 fdm=marker
 479  * vim<600: sw=4 ts=4
 480  */
/* [<][>][^][v][top][bottom][index][help] */
root/ext/standard/metaphone.c

DEFINITIONS