1 /* $NetBSD: custom_apropos_tokenizer.c,v 1.4 2021/12/05 08:03:32 wiz Exp $ */ 2 /* 3 ** 2006 September 30 4 ** 5 ** The author disclaims copyright to this source code. In place of 6 ** a legal notice, here is a blessing: 7 ** 8 ** May you do good and not evil. 9 ** May you find forgiveness for yourself and forgive others. 10 ** May you share freely, never taking more than you give. 11 ** 12 ************************************************************************* 13 ** Implementation of the full-text-search tokenizer that implements 14 ** a Porter stemmer. 15 */ 16 17 /* 18 ** The code in this file is only compiled if: 19 ** 20 ** * The FTS3 module is being built as an extension 21 ** (in which case SQLITE_CORE is not defined), or 22 ** 23 ** * The FTS3 module is being built into the core of 24 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). 25 */ 26 27 #include <assert.h> 28 #include <ctype.h> 29 #include <stdlib.h> 30 #include <stdio.h> 31 #include <string.h> 32 33 #include "custom_apropos_tokenizer.h" 34 #include "fts3_tokenizer.h" 35 #include "nostem.c" 36 37 /* 38 * Class derived from sqlite3_tokenizer 39 */ 40 typedef struct custom_apropos_tokenizer { 41 sqlite3_tokenizer base; /* Base class */ 42 } custom_apropos_tokenizer; 43 44 /* 45 * Class derived from sqlite3_tokenizer_cursor 46 */ 47 typedef struct custom_apropos_tokenizer_cursor { 48 sqlite3_tokenizer_cursor base; 49 const char *zInput; /* input we are tokenizing */ 50 size_t nInput; /* size of the input */ 51 size_t iOffset; /* current position in zInput */ 52 size_t iToken; /* index of next token to be returned */ 53 char *zToken; /* storage for current token */ 54 size_t nAllocated; /* space allocated to zToken buffer */ 55 } custom_apropos_tokenizer_cursor; 56 57 /* 58 * Create a new tokenizer instance. 59 */ 60 static int 61 aproposPorterCreate(int argc, const char *const * argv, 62 sqlite3_tokenizer ** ppTokenizer) 63 { 64 custom_apropos_tokenizer *t; 65 t = calloc(1, sizeof(*t)); 66 if (t == NULL) 67 return SQLITE_NOMEM; 68 *ppTokenizer = &t->base; 69 return SQLITE_OK; 70 } 71 72 /* 73 * Destroy a tokenizer 74 */ 75 static int 76 aproposPorterDestroy(sqlite3_tokenizer * pTokenizer) 77 { 78 free(pTokenizer); 79 return SQLITE_OK; 80 } 81 82 /* 83 * Prepare to begin tokenizing a particular string. The input 84 * string to be tokenized is zInput[0..nInput-1]. A cursor 85 * used to incrementally tokenize this string is returned in 86 * *ppCursor. 87 */ 88 static int 89 aproposPorterOpen( 90 sqlite3_tokenizer * pTokenizer, /* The tokenizer */ 91 const char *zInput, int nInput, /* String to be tokenized */ 92 sqlite3_tokenizer_cursor ** ppCursor /* OUT: Tokenization cursor */ 93 ) 94 { 95 custom_apropos_tokenizer_cursor *c; 96 97 c = calloc(1, sizeof(*c)); 98 if (c == NULL) 99 return SQLITE_NOMEM; 100 101 c->zInput = zInput; 102 if (zInput != 0) { 103 if (nInput < 0) 104 c->nInput = strlen(zInput); 105 else 106 c->nInput = nInput; 107 } 108 109 *ppCursor = &c->base; 110 return SQLITE_OK; 111 } 112 113 /* 114 * Close a tokenization cursor previously opened by a call to 115 * aproposPorterOpen() above. 116 */ 117 static int 118 aproposPorterClose(sqlite3_tokenizer_cursor *pCursor) 119 { 120 custom_apropos_tokenizer_cursor *c = (custom_apropos_tokenizer_cursor *) pCursor; 121 free(c->zToken); 122 free(c); 123 return SQLITE_OK; 124 } 125 126 /* 127 * Vowel or consonant 128 */ 129 static const char cType[] = { 130 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 131 1, 1, 1, 2, 1 132 }; 133 134 /* 135 * isConsonant() and isVowel() determine if their first character in 136 * the string they point to is a consonant or a vowel, according 137 * to Porter ruls. 138 * 139 * A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'. 140 * 'Y' is a consonant unless it follows another consonant, 141 * in which case it is a vowel. 142 * 143 * In these routine, the letters are in reverse order. So the 'y' rule 144 * is that 'y' is a consonant unless it is followed by another 145 * consonent. 146 */ 147 static int isVowel(const char*); 148 149 static int 150 isConsonant(const char *z) 151 { 152 int j; 153 char x = *z; 154 if (x == 0) 155 return 0; 156 assert(x >= 'a' && x <= 'z'); 157 j = cType[x - 'a']; 158 if (j < 2) 159 return j; 160 return z[1] == 0 || isVowel(z + 1); 161 } 162 163 static int 164 isVowel(const char *z) 165 { 166 int j; 167 char x = *z; 168 if (x == 0) 169 return 0; 170 assert(x >= 'a' && x <= 'z'); 171 j = cType[x - 'a']; 172 if (j < 2) 173 return 1 - j; 174 return isConsonant(z + 1); 175 } 176 177 /* 178 * Let any sequence of one or more vowels be represented by V and let 179 * C be sequence of one or more consonants. Then every word can be 180 * represented as: 181 * 182 * [C] (VC){m} [V] 183 * 184 * In prose: A word is an optional consonant followed by zero or 185 * vowel-consonant pairs followed by an optional vowel. "m" is the 186 * number of vowel consonant pairs. This routine computes the value 187 * of m for the first i bytes of a word. 188 * 189 * Return true if the m-value for z is 1 or more. In other words, 190 * return true if z contains at least one vowel that is followed 191 * by a consonant. 192 * 193 * In this routine z[] is in reverse order. So we are really looking 194 * for an instance of a consonant followed by a vowel. 195 */ 196 static int 197 m_gt_0(const char *z) 198 { 199 while (isVowel(z)) { 200 z++; 201 } 202 if (*z == 0) 203 return 0; 204 while (isConsonant(z)) { 205 z++; 206 } 207 return *z != 0; 208 } 209 210 /* Like mgt0 above except we are looking for a value of m which is 211 * exactly 1 212 */ 213 static int 214 m_eq_1(const char *z) 215 { 216 while (isVowel(z)) { 217 z++; 218 } 219 if (*z == 0) 220 return 0; 221 while (isConsonant(z)) { 222 z++; 223 } 224 if (*z == 0) 225 return 0; 226 while (isVowel(z)) { 227 z++; 228 } 229 if (*z == 0) 230 return 1; 231 while (isConsonant(z)) { 232 z++; 233 } 234 return *z == 0; 235 } 236 237 /* Like mgt0 above except we are looking for a value of m>1 instead 238 * or m>0 239 */ 240 static int 241 m_gt_1(const char *z) 242 { 243 while (isVowel(z)) { 244 z++; 245 } 246 if (*z == 0) 247 return 0; 248 while (isConsonant(z)) { 249 z++; 250 } 251 if (*z == 0) 252 return 0; 253 while (isVowel(z)) { 254 z++; 255 } 256 if (*z == 0) 257 return 0; 258 while (isConsonant(z)) { 259 z++; 260 } 261 return *z != 0; 262 } 263 264 /* 265 * Return TRUE if there is a vowel anywhere within z[0..n-1] 266 */ 267 static int 268 hasVowel(const char *z) 269 { 270 while (isConsonant(z)) { 271 z++; 272 } 273 return *z != 0; 274 } 275 276 /* 277 * Return TRUE if the word ends in a double consonant. 278 * 279 * The text is reversed here. So we are really looking at 280 * the first two characters of z[]. 281 */ 282 static int 283 doubleConsonant(const char *z) 284 { 285 return isConsonant(z) && z[0] == z[1]; 286 } 287 288 /* 289 * Return TRUE if the word ends with three letters which 290 * are consonant-vowel-consonent and where the final consonant 291 * is not 'w', 'x', or 'y'. 292 * 293 * The word is reversed here. So we are really checking the 294 * first three letters and the first one cannot be in [wxy]. 295 */ 296 static int 297 star_oh(const char *z) 298 { 299 return isConsonant(z) && 300 z[0] != 'w' && z[0] != 'x' && z[0] != 'y' && 301 isVowel(z + 1) && 302 isConsonant(z + 2); 303 } 304 305 /* 306 * If the word ends with zFrom and xCond() is true for the stem 307 * of the word that precedes the zFrom ending, then change the 308 * ending to zTo. 309 * 310 * The input word *pz and zFrom are both in reverse order. zTo 311 * is in normal order. 312 * 313 * Return TRUE if zFrom matches. Return FALSE if zFrom does not 314 * match. Not that TRUE is returned even if xCond() fails and 315 * no substitution occurs. 316 */ 317 static int 318 stem( 319 char **pz, /* The word being stemmed (Reversed) */ 320 const char *zFrom, /* If the ending matches this... (Reversed) */ 321 const char *zTo, /* ... change the ending to this (not reversed) */ 322 int (*xCond) (const char *) /* Condition that must be true */ 323 ) 324 { 325 char *z = *pz; 326 while (*zFrom && *zFrom == *z) { 327 z++; 328 zFrom++; 329 } 330 if (*zFrom != 0) 331 return 0; 332 if (xCond && !xCond(z)) 333 return 1; 334 while (*zTo) { 335 *(--z) = *(zTo++); 336 } 337 *pz = z; 338 return 1; 339 } 340 341 /* 342 * This is the fallback stemmer used when the porter stemmer is 343 * inappropriate. The input word is copied into the output with 344 * US-ASCII case folding. If the input word is too long (more 345 * than 20 bytes if it contains no digits or more than 6 bytes if 346 * it contains digits) then word is truncated to 20 or 6 bytes 347 * by taking 10 or 3 bytes from the beginning and end. 348 */ 349 static void 350 copy_stemmer(const char *zIn, size_t nIn, char *zOut, size_t *pnOut) 351 { 352 size_t i, mx, j; 353 int hasDigit = 0; 354 for (i = 0; i < nIn; i++) { 355 char c = zIn[i]; 356 if (c >= 'A' && c <= 'Z') { 357 zOut[i] = c - 'A' + 'a'; 358 } else { 359 if (c >= '0' && c <= '9') 360 hasDigit = 1; 361 zOut[i] = c; 362 } 363 } 364 mx = hasDigit ? 3 : 10; 365 if (nIn > mx * 2) { 366 for (j = mx, i = nIn - mx; i < nIn; i++, j++) { 367 zOut[j] = zOut[i]; 368 } 369 i = j; 370 } 371 zOut[i] = 0; 372 *pnOut = i; 373 } 374 375 376 /* 377 * Stem the input word zIn[0..nIn-1]. Store the output in zOut. 378 * zOut is at least big enough to hold nIn bytes. Write the actual 379 * size of the output word (exclusive of the '\0' terminator) into *pnOut. 380 * 381 * Any upper-case characters in the US-ASCII character set ([A-Z]) 382 * are converted to lower case. Upper-case UTF characters are 383 * unchanged. 384 * 385 * Words that are longer than about 20 bytes are stemmed by retaining 386 * a few bytes from the beginning and the end of the word. If the 387 * word contains digits, 3 bytes are taken from the beginning and 388 * 3 bytes from the end. For long words without digits, 10 bytes 389 * are taken from each end. US-ASCII case folding still applies. 390 * 391 * If the input word contains not digits but does characters not 392 * in [a-zA-Z] then no stemming is attempted and this routine just 393 * copies the input into the input into the output with US-ASCII 394 * case folding. 395 * 396 * Stemming never increases the length of the word. So there is 397 * no chance of overflowing the zOut buffer. 398 */ 399 static void 400 porter_stemmer(const char *zIn, size_t nIn, char *zOut, size_t *pnOut) 401 { 402 size_t i, j; 403 char zReverse[28]; 404 char *z, *z2; 405 if (nIn < 3 || nIn >= sizeof(zReverse) - 7) { 406 /* The word is too big or too small for the porter stemmer. 407 * Fallback to the copy stemmer 408 */ 409 copy_stemmer(zIn, nIn, zOut, pnOut); 410 return; 411 } 412 413 for (i = 0, j = sizeof(zReverse) - 6; i < nIn; i++, j--) { 414 char c = zIn[i]; 415 if (c >= 'A' && c <= 'Z') { 416 zReverse[j] = c + 'a' - 'A'; 417 } else if (c >= 'a' && c <= 'z') { 418 zReverse[j] = c; 419 } else { 420 /* The use of a character not in [a-zA-Z] means that 421 * we fallback * to the copy stemmer 422 */ 423 copy_stemmer(zIn, nIn, zOut, pnOut); 424 return; 425 } 426 } 427 memset(&zReverse[sizeof(zReverse) - 5], 0, 5); 428 z = &zReverse[j + 1]; 429 430 431 /* Step 1a */ 432 if (z[0] == 's') { 433 if ( 434 !stem(&z, "sess", "ss", 0) && 435 !stem(&z, "sei", "i", 0) && 436 !stem(&z, "ss", "ss", 0) 437 ) { 438 z++; 439 } 440 } 441 /* Step 1b */ 442 z2 = z; 443 if (stem(&z, "dee", "ee", m_gt_0)) { 444 /* Do nothing. The work was all in the test */ 445 } else if ( 446 (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel)) 447 && z != z2 448 ) { 449 if (stem(&z, "ta", "ate", 0) || 450 stem(&z, "lb", "ble", 0) || 451 stem(&z, "zi", "ize", 0)) { 452 /* Do nothing. The work was all in the test */ 453 } else if (doubleConsonant(z) && (*z != 'l' && *z != 's' && *z != 'z')) { 454 z++; 455 } else if (m_eq_1(z) && star_oh(z)) { 456 *(--z) = 'e'; 457 } 458 } 459 /* Step 1c */ 460 if (z[0] == 'y' && hasVowel(z + 1)) { 461 z[0] = 'i'; 462 } 463 /* Step 2 */ 464 switch (z[1]) { 465 case 'a': 466 if (!stem(&z, "lanoita", "ate", m_gt_0)) { 467 stem(&z, "lanoit", "tion", m_gt_0); 468 } 469 break; 470 case 'c': 471 if (!stem(&z, "icne", "ence", m_gt_0)) { 472 stem(&z, "icna", "ance", m_gt_0); 473 } 474 break; 475 case 'e': 476 stem(&z, "rezi", "ize", m_gt_0); 477 break; 478 case 'g': 479 stem(&z, "igol", "log", m_gt_0); 480 break; 481 case 'l': 482 if (!stem(&z, "ilb", "ble", m_gt_0) 483 && !stem(&z, "illa", "al", m_gt_0) 484 && !stem(&z, "iltne", "ent", m_gt_0) 485 && !stem(&z, "ile", "e", m_gt_0) 486 ) { 487 stem(&z, "ilsuo", "ous", m_gt_0); 488 } 489 break; 490 case 'o': 491 if (!stem(&z, "noitazi", "ize", m_gt_0) 492 && !stem(&z, "noita", "ate", m_gt_0) 493 ) { 494 stem(&z, "rota", "ate", m_gt_0); 495 } 496 break; 497 case 's': 498 if (!stem(&z, "msila", "al", m_gt_0) 499 && !stem(&z, "ssenevi", "ive", m_gt_0) 500 && !stem(&z, "ssenluf", "ful", m_gt_0) 501 ) { 502 stem(&z, "ssensuo", "ous", m_gt_0); 503 } 504 break; 505 case 't': 506 if (!stem(&z, "itila", "al", m_gt_0) 507 && !stem(&z, "itivi", "ive", m_gt_0) 508 ) { 509 stem(&z, "itilib", "ble", m_gt_0); 510 } 511 break; 512 } 513 514 /* Step 3 */ 515 switch (z[0]) { 516 case 'e': 517 if (!stem(&z, "etaci", "ic", m_gt_0) 518 && !stem(&z, "evita", "", m_gt_0) 519 ) { 520 stem(&z, "ezila", "al", m_gt_0); 521 } 522 break; 523 case 'i': 524 stem(&z, "itici", "ic", m_gt_0); 525 break; 526 case 'l': 527 if (!stem(&z, "laci", "ic", m_gt_0)) { 528 stem(&z, "luf", "", m_gt_0); 529 } 530 break; 531 case 's': 532 stem(&z, "ssen", "", m_gt_0); 533 break; 534 } 535 536 /* Step 4 */ 537 switch (z[1]) { 538 case 'a': 539 if (z[0] == 'l' && m_gt_1(z + 2)) { 540 z += 2; 541 } 542 break; 543 case 'c': 544 if (z[0] == 'e' && z[2] == 'n' && (z[3] == 'a' || z[3] == 'e') && m_gt_1(z + 4)) { 545 z += 4; 546 } 547 break; 548 case 'e': 549 if (z[0] == 'r' && m_gt_1(z + 2)) { 550 z += 2; 551 } 552 break; 553 case 'i': 554 if (z[0] == 'c' && m_gt_1(z + 2)) { 555 z += 2; 556 } 557 break; 558 case 'l': 559 if (z[0] == 'e' && z[2] == 'b' && (z[3] == 'a' || z[3] == 'i') && m_gt_1(z + 4)) { 560 z += 4; 561 } 562 break; 563 case 'n': 564 if (z[0] == 't') { 565 if (z[2] == 'a') { 566 if (m_gt_1(z + 3)) { 567 z += 3; 568 } 569 } else if (z[2] == 'e') { 570 if (!stem(&z, "tneme", "", m_gt_1) 571 && !stem(&z, "tnem", "", m_gt_1) 572 ) { 573 stem(&z, "tne", "", m_gt_1); 574 } 575 } 576 } 577 break; 578 case 'o': 579 if (z[0] == 'u') { 580 if (m_gt_1(z + 2)) { 581 z += 2; 582 } 583 } else if (z[3] == 's' || z[3] == 't') { 584 stem(&z, "noi", "", m_gt_1); 585 } 586 break; 587 case 's': 588 if (z[0] == 'm' && z[2] == 'i' && m_gt_1(z + 3)) { 589 z += 3; 590 } 591 break; 592 case 't': 593 if (!stem(&z, "eta", "", m_gt_1)) { 594 stem(&z, "iti", "", m_gt_1); 595 } 596 break; 597 case 'u': 598 if (z[0] == 's' && z[2] == 'o' && m_gt_1(z + 3)) { 599 z += 3; 600 } 601 break; 602 case 'v': 603 case 'z': 604 if (z[0] == 'e' && z[2] == 'i' && m_gt_1(z + 3)) { 605 z += 3; 606 } 607 break; 608 } 609 610 /* Step 5a */ 611 if (z[0] == 'e') { 612 if (m_gt_1(z + 1)) { 613 z++; 614 } else if (m_eq_1(z + 1) && !star_oh(z + 1)) { 615 z++; 616 } 617 } 618 /* Step 5b */ 619 if (m_gt_1(z) && z[0] == 'l' && z[1] == 'l') { 620 z++; 621 } 622 /* z[] is now the stemmed word in reverse order. Flip it back 623 * around into forward order and return. 624 */ 625 *pnOut = i = strlen(z); 626 zOut[i] = 0; 627 while (*z) { 628 zOut[--i] = *(z++); 629 } 630 } 631 632 /* 633 * Based on whether the input word is in the nostem list or not 634 * call porter stemmer to stem it, or call copy_stemmer to keep it 635 * as it is (copy_stemmer converts simply converts it to lower case) 636 * Returns SQLITE_OK if stemming is successful, an error code for 637 * any errors 638 */ 639 static int 640 do_stem(const char *zIn, size_t nIn, char *zOut, size_t *pnOut) 641 { 642 /* Before looking up the word in the hash table, convert it to lower-case */ 643 char *dupword = malloc(nIn); 644 if (dupword == NULL) 645 return SQLITE_NOMEM; 646 647 for (size_t i = 0; i < nIn; i++) 648 dupword[i] = tolower((unsigned char) zIn[i]); 649 650 size_t idx = nostem_hash(dupword, nIn); 651 if (strncmp(nostem[idx], dupword, nIn) == 0 && nostem[idx][nIn] == 0) 652 copy_stemmer(zIn, nIn, zOut, pnOut); 653 else 654 porter_stemmer(zIn, nIn, zOut, pnOut); 655 656 free(dupword); 657 return SQLITE_OK; 658 } 659 660 661 /* 662 * Characters that can be part of a token. We assume any character 663 * whose value is greater than 0x80 (any UTF character) can be 664 * part of a token. In other words, delimiters all must have 665 * values of 0x7f or lower. 666 */ 667 static const char porterIdChar[] = { 668 /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ 669 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ 670 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ 671 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 5x */ 672 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ 673 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ 674 }; 675 676 #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30])) 677 678 /* 679 * Extract the next token from a tokenization cursor. The cursor must 680 * have been opened by a prior call to aproposPorterOpen(). 681 */ 682 static int 683 aproposPorterNext( 684 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by aproposPorterOpen */ 685 const char **pzToken, /* OUT: *pzToken is the token text */ 686 int *pnBytes, /* OUT: Number of bytes in token */ 687 int *piStartOffset, /* OUT: Starting offset of token */ 688 int *piEndOffset, /* OUT: Ending offset of token */ 689 int *piPosition /* OUT: Position integer of token */ 690 ) 691 { 692 custom_apropos_tokenizer_cursor *c = (custom_apropos_tokenizer_cursor *) pCursor; 693 const char *z = c->zInput; 694 695 while (c->iOffset < c->nInput) { 696 size_t iStartOffset, ch; 697 698 /* Scan past delimiter characters */ 699 while (c->iOffset < c->nInput && isDelim(z[c->iOffset])) { 700 c->iOffset++; 701 } 702 703 /* Count non-delimiter characters. */ 704 iStartOffset = c->iOffset; 705 while (c->iOffset < c->nInput && !isDelim(z[c->iOffset])) { 706 c->iOffset++; 707 } 708 709 if (c->iOffset > iStartOffset) { 710 size_t n = c->iOffset - iStartOffset; 711 if (n > c->nAllocated) { 712 char *pNew; 713 c->nAllocated = n + 20; 714 pNew = realloc(c->zToken, c->nAllocated); 715 if (!pNew) 716 return SQLITE_NOMEM; 717 c->zToken = pNew; 718 } 719 720 size_t temp; 721 int stemStatus = do_stem(&z[iStartOffset], n, c->zToken, &temp); 722 *pnBytes = temp; 723 if (stemStatus != SQLITE_OK) 724 return stemStatus; 725 726 *pzToken = c->zToken; 727 *piStartOffset = iStartOffset; 728 *piEndOffset = c->iOffset; 729 *piPosition = c->iToken++; 730 return SQLITE_OK; 731 } 732 } 733 return SQLITE_DONE; 734 } 735 736 /* 737 * The set of routines that implement the porter-stemmer tokenizer 738 */ 739 static const sqlite3_tokenizer_module aproposPorterTokenizerModule = { 740 0, 741 aproposPorterCreate, 742 aproposPorterDestroy, 743 aproposPorterOpen, 744 aproposPorterClose, 745 aproposPorterNext, 746 0 747 }; 748 749 /* 750 * Allocate a new porter tokenizer. Return a pointer to the new 751 * tokenizer in *ppModule 752 */ 753 void 754 get_custom_apropos_tokenizer(sqlite3_tokenizer_module const ** ppModule) 755 { 756 *ppModule = &aproposPorterTokenizerModule; 757 } 758