1 /* $NetBSD: phonetic.c,v 1.3 2021/08/14 16:14:58 christos Exp $ */ 2 3 /* phonetic.c - routines to do phonetic matching */ 4 /* $OpenLDAP$ */ 5 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 6 * 7 * Copyright 1998-2021 The OpenLDAP Foundation. 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted only as authorized by the OpenLDAP 12 * Public License. 13 * 14 * A copy of this license is available in the file LICENSE in the 15 * top-level directory of the distribution or, alternatively, at 16 * <http://www.OpenLDAP.org/license.html>. 17 */ 18 /* Portions Copyright (c) 1995 Regents of the University of Michigan. 19 * All rights reserved. 20 * 21 * Redistribution and use in source and binary forms are permitted 22 * provided that this notice is preserved and that due credit is given 23 * to the University of Michigan at Ann Arbor. The name of the University 24 * may not be used to endorse or promote products derived from this 25 * software without specific prior written permission. This software 26 * is provided ``as is'' without express or implied warranty. 27 */ 28 29 #include <sys/cdefs.h> 30 __RCSID("$NetBSD: phonetic.c,v 1.3 2021/08/14 16:14:58 christos Exp $"); 31 32 #include "portable.h" 33 34 #include <stdio.h> 35 36 #include <ac/ctype.h> 37 #include <ac/string.h> 38 #include <ac/socket.h> 39 #include <ac/time.h> 40 41 #include "slap.h" 42 43 #if !defined(SLAPD_METAPHONE) && !defined(SLAPD_PHONETIC) 44 #define SLAPD_METAPHONE 45 #endif 46 47 #define iswordbreak(x) (!isascii(x) || isspace((unsigned char) (x)) || \ 48 ispunct((unsigned char) (x)) || \ 49 isdigit((unsigned char) (x)) || (x) == '\0') 50 51 #if 0 52 static char * 53 first_word( char *s ) 54 { 55 if ( s == NULL ) { 56 return( NULL ); 57 } 58 59 while ( iswordbreak( *s ) ) { 60 if ( *s == '\0' ) { 61 return( NULL ); 62 } else { 63 s++; 64 } 65 } 66 67 return( s ); 68 } 69 70 static char * 71 next_word( char *s ) 72 { 73 if ( s == NULL ) { 74 return( NULL ); 75 } 76 77 while ( ! iswordbreak( *s ) ) { 78 s++; 79 } 80 81 while ( iswordbreak( *s ) ) { 82 if ( *s == '\0' ) { 83 return( NULL ); 84 } else { 85 s++; 86 } 87 } 88 89 return( s ); 90 } 91 92 static char * 93 word_dup( char *w ) 94 { 95 char *s, *ret; 96 char save; 97 98 for ( s = w; !iswordbreak( *s ); s++ ) 99 ; /* NULL */ 100 save = *s; 101 *s = '\0'; 102 ret = ch_strdup( w ); 103 *s = save; 104 105 return( ret ); 106 } 107 #endif /* 0 */ 108 109 #ifndef MAXPHONEMELEN 110 #define MAXPHONEMELEN 4 111 #endif 112 113 #if defined(SLAPD_PHONETIC) 114 115 /* lifted from isode-8.0 */ 116 char * 117 phonetic( char *s ) 118 { 119 char code, adjacent, ch; 120 char *p; 121 int i; 122 char phoneme[MAXPHONEMELEN + 1]; 123 124 p = s; 125 if ( p == NULL || *p == '\0' ) { 126 return( NULL ); 127 } 128 129 adjacent = '0'; 130 phoneme[0] = TOUPPER((unsigned char)*p); 131 132 phoneme[1] = '\0'; 133 for ( i = 0; i < 99 && (! iswordbreak(*p)); p++ ) { 134 ch = TOUPPER ((unsigned char)*p); 135 136 code = '0'; 137 138 switch (ch) { 139 case 'B': 140 case 'F': 141 case 'P': 142 case 'V': 143 code = (adjacent != '1') ? '1' : '0'; 144 break; 145 case 'S': 146 case 'C': 147 case 'G': 148 case 'J': 149 case 'K': 150 case 'Q': 151 case 'X': 152 case 'Z': 153 code = (adjacent != '2') ? '2' : '0'; 154 break; 155 case 'D': 156 case 'T': 157 code = (adjacent != '3') ? '3' : '0'; 158 break; 159 case 'L': 160 code = (adjacent != '4') ? '4' : '0'; 161 break; 162 case 'M': 163 case 'N': 164 code = (adjacent != '5') ? '5' : '0'; 165 break; 166 case 'R': 167 code = (adjacent != '6') ? '6' : '0'; 168 break; 169 default: 170 adjacent = '0'; 171 } 172 173 if ( i == 0 ) { 174 adjacent = code; 175 i++; 176 } else if ( code != '0' ) { 177 if ( i == MAXPHONEMELEN ) 178 break; 179 adjacent = phoneme[i] = code; 180 i++; 181 } 182 } 183 184 if ( i > 0 ) 185 phoneme[i] = '\0'; 186 187 return( ch_strdup( phoneme ) ); 188 } 189 190 #elif defined(SLAPD_METAPHONE) 191 192 /* 193 * Metaphone was originally developed by Lawrence Philips and 194 * published in the "Computer Language" magazine in 1990. 195 */ 196 /* 197 * Metaphone copied from C Gazette, June/July 1991, pp 56-57, 198 * author Gary A. Parker, with changes by Bernard Tiffany of the 199 * University of Michigan, and more changes by Tim Howes of the 200 * University of Michigan. 201 */ 202 203 /* Character coding array */ 204 static const char vsvfn[26] = { 205 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 206 /* A B C D E F G H I J K L M */ 207 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0}; 208 /* N O P Q R S T U V W X Y Z */ 209 210 /* Macros to access character coding array */ 211 #define vowel(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 1) /* AEIOU */ 212 #define same(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 2) /* FJLMNR */ 213 #define varson(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 4) /* CGPST */ 214 #define frontv(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 8) /* EIY */ 215 #define noghf(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 16) /* BDH */ 216 217 char * 218 phonetic( char *Word ) 219 { 220 char *n, *n_start, *n_end; /* pointers to string */ 221 char *metaph_end; /* pointers to metaph */ 222 char ntrans[40]; /* word with uppercase letters */ 223 int KSflag; /* state flag for X -> KS */ 224 char buf[MAXPHONEMELEN + 2]; 225 char *Metaph; 226 227 /* 228 * Copy Word to internal buffer, dropping non-alphabetic characters 229 * and converting to upper case 230 */ 231 232 for (n = ntrans + 4, n_end = ntrans + 35; !iswordbreak( *Word ) && 233 n < n_end; Word++) { 234 if (isalpha((unsigned char)*Word)) 235 *n++ = TOUPPER((unsigned char)*Word); 236 } 237 Metaph = buf; 238 *Metaph = '\0'; 239 if (n == ntrans + 4) { 240 return( ch_strdup( buf ) ); /* Return if null */ 241 } 242 n_end = n; /* Set n_end to end of string */ 243 244 /* ntrans[0] will always be == 0 */ 245 ntrans[0] = '\0'; 246 ntrans[1] = '\0'; 247 ntrans[2] = '\0'; 248 ntrans[3] = '\0'; 249 *n++ = 0; 250 *n++ = 0; 251 *n++ = 0; 252 *n = 0; /* Pad with nulls */ 253 n = ntrans + 4; /* Assign pointer to start */ 254 255 /* Check for PN, KN, GN, AE, WR, WH, and X at start */ 256 switch (*n) { 257 case 'P': 258 case 'K': 259 case 'G': 260 /* 'PN', 'KN', 'GN' becomes 'N' */ 261 if (*(n + 1) == 'N') 262 *n++ = 0; 263 break; 264 case 'A': 265 /* 'AE' becomes 'E' */ 266 if (*(n + 1) == 'E') 267 *n++ = 0; 268 break; 269 case 'W': 270 /* 'WR' becomes 'R', and 'WH' to 'H' */ 271 if (*(n + 1) == 'R') 272 *n++ = 0; 273 else if (*(n + 1) == 'H') { 274 *(n + 1) = *n; 275 *n++ = 0; 276 } 277 break; 278 case 'X': 279 /* 'X' becomes 'S' */ 280 *n = 'S'; 281 break; 282 } 283 284 /* 285 * Now, loop step through string, stopping at end of string or when 286 * the computed 'metaph' is MAXPHONEMELEN characters long 287 */ 288 289 KSflag = 0; /* state flag for KS translation */ 290 for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n; 291 n <= n_end && Metaph < metaph_end; n++) { 292 if (KSflag) { 293 KSflag = 0; 294 *Metaph++ = 'S'; 295 } else { 296 /* Drop duplicates except for CC */ 297 if (*(n - 1) == *n && *n != 'C') 298 continue; 299 /* Check for F J L M N R or first letter vowel */ 300 if (same(*n) || (n == n_start && vowel(*n))) 301 *Metaph++ = *n; 302 else 303 switch (*n) { 304 case 'B': 305 306 /* 307 * B unless in -MB 308 */ 309 if (n == (n_end - 1) && *(n - 1) != 'M') 310 *Metaph++ = *n; 311 break; 312 case 'C': 313 314 /* 315 * X if in -CIA-, -CH- else S if in 316 * -CI-, -CE-, -CY- else dropped if 317 * in -SCI-, -SCE-, -SCY- else K 318 */ 319 if (*(n - 1) != 'S' || !frontv(*(n + 1))) { 320 if (*(n + 1) == 'I' && *(n + 2) == 'A') 321 *Metaph++ = 'X'; 322 else if (frontv(*(n + 1))) 323 *Metaph++ = 'S'; 324 else if (*(n + 1) == 'H') 325 *Metaph++ = ((n == n_start && !vowel(*(n + 2))) 326 || *(n - 1) == 'S') 327 ? (char) 'K' : (char) 'X'; 328 else 329 *Metaph++ = 'K'; 330 } 331 break; 332 case 'D': 333 334 /* 335 * J if in DGE or DGI or DGY else T 336 */ 337 *Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2))) 338 ? (char) 'J' : (char) 'T'; 339 break; 340 case 'G': 341 342 /* 343 * F if in -GH and not B--GH, D--GH, 344 * -H--GH, -H---GH else dropped if 345 * -GNED, -GN, -DGE-, -DGI-, -DGY- 346 * else J if in -GE-, -GI-, -GY- and 347 * not GG else K 348 */ 349 if ((*(n + 1) != 'J' || vowel(*(n + 2))) && 350 (*(n + 1) != 'N' || ((n + 1) < n_end && 351 (*(n + 2) != 'E' || *(n + 3) != 'D'))) && 352 (*(n - 1) != 'D' || !frontv(*(n + 1)))) 353 *Metaph++ = (frontv(*(n + 1)) && 354 *(n + 2) != 'G') ? (char) 'G' : (char) 'K'; 355 else if (*(n + 1) == 'H' && !noghf(*(n - 3)) && 356 *(n - 4) != 'H') 357 *Metaph++ = 'F'; 358 break; 359 case 'H': 360 361 /* 362 * H if before a vowel and not after 363 * C, G, P, S, T else dropped 364 */ 365 if (!varson(*(n - 1)) && (!vowel(*(n - 1)) || 366 vowel(*(n + 1)))) 367 *Metaph++ = 'H'; 368 break; 369 case 'K': 370 371 /* 372 * dropped if after C else K 373 */ 374 if (*(n - 1) != 'C') 375 *Metaph++ = 'K'; 376 break; 377 case 'P': 378 379 /* 380 * F if before H, else P 381 */ 382 *Metaph++ = *(n + 1) == 'H' ? 383 (char) 'F' : (char) 'P'; 384 break; 385 case 'Q': 386 387 /* 388 * K 389 */ 390 *Metaph++ = 'K'; 391 break; 392 case 'S': 393 394 /* 395 * X in -SH-, -SIO- or -SIA- else S 396 */ 397 *Metaph++ = (*(n + 1) == 'H' || 398 (*(n + 1) == 'I' && (*(n + 2) == 'O' || 399 *(n + 2) == 'A'))) 400 ? (char) 'X' : (char) 'S'; 401 break; 402 case 'T': 403 404 /* 405 * X in -TIA- or -TIO- else 0 (zero) 406 * before H else dropped if in -TCH- 407 * else T 408 */ 409 if (*(n + 1) == 'I' && (*(n + 2) == 'O' || 410 *(n + 2) == 'A')) 411 *Metaph++ = 'X'; 412 else if (*(n + 1) == 'H') 413 *Metaph++ = '0'; 414 else if (*(n + 1) != 'C' || *(n + 2) != 'H') 415 *Metaph++ = 'T'; 416 break; 417 case 'V': 418 419 /* 420 * F 421 */ 422 *Metaph++ = 'F'; 423 break; 424 case 'W': 425 426 /* 427 * W after a vowel, else dropped 428 */ 429 case 'Y': 430 431 /* 432 * Y unless followed by a vowel 433 */ 434 if (vowel(*(n + 1))) 435 *Metaph++ = *n; 436 break; 437 case 'X': 438 439 /* 440 * KS 441 */ 442 if (n == n_start) 443 *Metaph++ = 'S'; 444 else { 445 *Metaph++ = 'K'; /* Insert K, then S */ 446 KSflag = 1; 447 } 448 break; 449 case 'Z': 450 451 /* 452 * S 453 */ 454 *Metaph++ = 'S'; 455 break; 456 } 457 } 458 } 459 460 *Metaph = 0; /* Null terminate */ 461 return( ch_strdup( buf ) ); 462 } 463 464 #endif /* SLAPD_METAPHONE */ 465