
#define PERL_NO_GET_CONTEXT /* we want efficiency */

/* private functions which need pTHX_ and aTHX_
    croak_bad_retlen
    pv_cat_decompHangul
    sv_2pvunicode
    pv_utf8_decompose
    pv_utf8_reorder
    pv_utf8_compose
*/

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"

#define NEED_utf8_to_uvchr_buf
#include "ppport.h"

/* These 5 files are prepared by mkheader */
#include "unfcmb.h"
#include "unfcan.h"
#include "unfcpt.h"
#include "unfcmp.h"
#include "unfexc.h"

/* The generated normalization tables since v5.20 are in native character set
 * terms.  Prior to that, they were in Unicode terms.  So we use 'uvchr' for
 * later perls, and redefine that to be 'uvuni' for earlier ones */
#if PERL_VERSION_LT(5,20,0)
#   undef uvchr_to_utf8
#   ifdef uvuni_to_utf8
#       define uvchr_to_utf8   uvuni_to_utf8
#   else /* Perl 5.6.1 */
#       define uvchr_to_utf8   uv_to_utf8
#   endif
#endif

/* check if the string buffer is enough before uvchr_to_utf8(). */
/* dstart, d, and dlen should be defined outside before. */
/* Wrapped in STMT_START/STMT_END so that it behaves as one statement; */
/* every use must therefore be terminated with a semicolon. */
#define Renew_d_if_not_enough_to(need)                  \
    STMT_START {                                        \
        STRLEN curlen = d - dstart;                     \
        if (dlen < curlen + (need)) {                   \
            dlen += (need);                             \
            Renew(dstart, dlen+1, U8);                  \
            d = dstart + curlen;                        \
        }                                               \
    } STMT_END

/* if utf8_to_uvchr_buf() sets retlen to 0 (if broken?) */
#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"

/* if utf8_to_uvchr_buf() reports a length that does not fit in the input */
#define ErrMalformed "panic (Unicode::Normalize %s): malformed UTF-8 character"

/* utf8_hop() hops back before start. Maybe broken UTF-8 */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"

/* TRUE if the length stored by utf8_to_uvchr_buf() cannot be used to advance
 * from p towards e: it is zero, or larger than the bytes remaining.  The
 * second test also catches the (STRLEN)-1 that utf8_to_uvchr_buf() stores
 * for malformed input while UTF-8 warnings are enabled; adding that value to
 * a pointer would step backwards. */
#define BadRetlen(retlen, p, e) \
    (!(retlen) || (retlen) > (STRLEN)((e) - (p)))

/* At present, char > 0x10ffff are unaffected without complaint, right? */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))

/* size of array for combining characters */
/* enough as an initial value? */
#define CC_SEQ_SIZE (10)
#define CC_SEQ_STEP  (5)

/* HANGUL begin */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount  11172

#define Hangul_NCount    588

#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount     19

#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount     21

#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount     28

#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)  ((Hangul_TBase  < (u)) && ((u) <= Hangul_TFinal))
/* HANGUL end */

/* this is used for canonical ordering of combining characters (c.c.). */
typedef struct {
    U8 cc;      /* combining class */
    UV uv;      /* codepoint */
    STRLEN pos; /* position */
} UNF_cc;

/* qsort() comparator for UNF_cc: orders by combining class, and by original
 * position when the classes are equal, so equal-class entries keep their
 * input order. */
static int compare_cc(const void *a, const void *b)
{
    const UNF_cc *lhs = (const UNF_cc *) a;
    const UNF_cc *rhs = (const UNF_cc *) b;
    int ret_cc = lhs->cc - rhs->cc;

    if (ret_cc)
        return ret_cc;

    return (lhs->pos > rhs->pos) - (lhs->pos < rhs->pos);
}

/* Croaks with the message matching an unusable retlen (see BadRetlen):
 * ErrRetlenIsZero when it is zero, ErrMalformed otherwise.  'where' names
 * the operation and is interpolated into the message.  Does not return. */
static void croak_bad_retlen(pTHX_ STRLEN retlen, const char *where)
{
    if (retlen)
        croak(ErrMalformed, where);
    croak(ErrRetlenIsZero, where);
}

/* Looks uv up in the three-level UNF_canon table (plane, row, cell).
 * Returns the stored string, or NULL if uv is above VALID_UTF_MAX or has no
 * entry. */
static U8* dec_canonical(UV uv)
{
    U8 ***plane, **row;
    if (OVER_UTF_MAX(uv))
        return NULL;
    plane = (U8***)UNF_canon[uv >> 16];
    if (! plane)
        return NULL;
    row = plane[(U8) (uv >> 8)];
    return row ? row[(U8) uv] : NULL;
}

/* Same lookup as dec_canonical(), but in the UNF_compat table. */
static U8* dec_compat(UV uv)
{
    U8 ***plane, **row;
    if (OVER_UTF_MAX(uv))
        return NULL;
    plane = (U8***)UNF_compat[uv >> 16];
    if (! plane)
        return NULL;
    row = plane[(U8) (uv >> 8)];
    return row ? row[(U8) uv] : NULL;
}

/* Returns the code point composed from uv followed by uv2, or 0 if there is
 * none.  Hangul L+V and LV+T pairs are computed arithmetically; everything
 * else is searched in the UNF_compos table, whose per-cell lists end with an
 * entry whose nextchar is 0. */
static UV composite_uv(UV uv, UV uv2)
{
    UNF_complist ***plane, **row, *cell, *i;

    if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
        return 0;

    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
        UV lindex = uv  - Hangul_LBase;
        UV vindex = uv2 - Hangul_VBase;
        return(Hangul_SBase + (lindex * Hangul_VCount + vindex) *
               Hangul_TCount);
    }
    if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
        UV tindex = uv2 - Hangul_TBase;
        return(uv + tindex);
    }
    plane = UNF_compos[uv >> 16];
    if (! plane)
        return 0;
    row = plane[(U8) (uv >> 8)];
    if (! row)
        return 0;
    cell = row[(U8) uv];
    if (! cell)
        return 0;
    for (i = cell; i->nextchar; i++) {
        if (uv2 == i->nextchar)
            return i->composite;
    }
    return 0;
}

/* Returns the combining class stored for uv in the UNF_combin table, or 0
 * if uv is above VALID_UTF_MAX or has no entry. */
static U8 getCombinClass(UV uv)
{
    U8 **plane, *row;
    if (OVER_UTF_MAX(uv))
        return 0;
    plane = (U8**)UNF_combin[uv >> 16];
    if (! plane)
        return 0;
    row = plane[(U8) (uv >> 8)];
    return row ? row[(U8) uv] : 0;
}

/* Appends the decomposition of the Hangul syllable uv (L, V and, when
 * present, T) at d and returns the new end.  Returns d unchanged if uv is
 * not a Hangul syllable.  The caller must have reserved room at d. */
static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv)
{
    UV sindex, lindex, vindex, tindex;

    if (! Hangul_IsS(uv))
        return d;

    sindex =  uv - Hangul_SBase;
    lindex =  sindex / Hangul_NCount;
    vindex = (sindex % Hangul_NCount) / Hangul_TCount;
    tindex =  sindex % Hangul_TCount;

    d = uvchr_to_utf8(d, (lindex + Hangul_LBase));
    d = uvchr_to_utf8(d, (vindex + Hangul_VBase));
    if (tindex)
        d = uvchr_to_utf8(d, (tindex + Hangul_TBase));
    return d;
}

/* Returns the string value of sv as UTF-8 and stores its byte length in *lp
 * (if lp is not NULL).  If sv is not flagged UTF-8, the bytes are copied
 * into a mortal SV which is upgraded instead, so sv itself is not altered. */
static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp)
{
    char *s;
    STRLEN len;
    s = SvPV(sv,len);
    if (!SvUTF8(sv)) {
        SV* tmpsv = sv_2mortal(newSVpvn(s, len));
        if (!SvPOK(tmpsv))
            s = SvPV_force(tmpsv,len);
        sv_utf8_upgrade(tmpsv);
        s = SvPV(tmpsv,len);
    }
    if (lp)
        *lp = len;
    return s;
}

/* Decomposes the slen bytes of UTF-8 at s into the buffer *dp.
 *
 * *dp must have been allocated with New() with room for dlen+1 bytes; it is
 * grown with Renew() as needed.  On return *dp is the (possibly moved) start
 * of the buffer and the return value points just past the last byte written.
 * iscompat selects dec_compat() instead of dec_canonical().
 *
 * Croaks if a character cannot be decoded; the buffer is freed first because
 * the caller does not regain control to free it. */
static
U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
{
    U8* p = s;
    U8* e = s + slen;
    U8* dstart = *dp;
    U8* d = dstart;

    while (p < e) {
        STRLEN retlen;
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);
        if (BadRetlen(retlen, p, e)) {
            Safefree(dstart);
            croak_bad_retlen(aTHX_ retlen, "decompose");
        }
        p += retlen;

        if (Hangul_IsS(uv)) {
            Renew_d_if_not_enough_to(UTF8_MAXLEN * 3);
            d = pv_cat_decompHangul(aTHX_ d, uv);
        }
        else {
            U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);

            if (r) {
                STRLEN len = (STRLEN)strlen((char *)r);
                Renew_d_if_not_enough_to(len);
                while (len--)
                    *d++ = *r++;
            }
            else {
                Renew_d_if_not_enough_to(UTF8_MAXLEN);
                d = uvchr_to_utf8(d, uv);
            }
        }
    }
    *dp = dstart;
    return d;
}

/* Copies the slen bytes of UTF-8 at s into the buffer *dp, sorting each run
 * of characters with non-zero combining class by class (see compare_cc).
 *
 * Buffer contract, return value and croak behaviour are the same as for
 * pv_utf8_decompose().  Runs are collected in a stack array first and in a
 * heap array (seq_ext) once they exceed CC_SEQ_SIZE. */
static
U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen)
{
    U8* p = s;
    U8* e = s + slen;
    U8* dstart = *dp;
    U8* d = dstart;

    UNF_cc  seq_ary[CC_SEQ_SIZE];
    UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
    UNF_cc* seq_ext = NULL;    /* extend if need */
    STRLEN seq_max = CC_SEQ_SIZE;
    STRLEN cc_pos = 0;

    while (p < e) {
        U8 curCC;
        STRLEN retlen;
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);
        if (BadRetlen(retlen, p, e)) {
            if (seq_ext)
                Safefree(seq_ext);
            Safefree(dstart);
            croak_bad_retlen(aTHX_ retlen, "reorder");
        }
        p += retlen;

        curCC = getCombinClass(uv);

        if (curCC != 0) {
            if (seq_max < cc_pos + 1) { /* extend if need */
                seq_max = cc_pos + CC_SEQ_STEP; /* new size */
                if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
                    New(0, seq_ext, seq_max, UNF_cc);
                    Copy(seq_ary, seq_ext, cc_pos, UNF_cc);
                }
                else {
                    Renew(seq_ext, seq_max, UNF_cc);
                }
                seq_ptr = seq_ext; /* use seq_ext from now */
            }

            seq_ptr[cc_pos].cc  = curCC;
            seq_ptr[cc_pos].uv  = uv;
            seq_ptr[cc_pos].pos = cc_pos;
            ++cc_pos;

            /* keep collecting unless this was the last character */
            if (p < e)
                continue;
        }

        /* output */
        if (cc_pos) {
            STRLEN i;

            if (cc_pos > 1) /* reordered if there are two c.c.'s */
                qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);

            for (i = 0; i < cc_pos; i++) {
                Renew_d_if_not_enough_to(UTF8_MAXLEN);
                d = uvchr_to_utf8(d, seq_ptr[i].uv);
            }
            cc_pos = 0;
        }

        if (curCC == 0) {
            Renew_d_if_not_enough_to(UTF8_MAXLEN);
            d = uvchr_to_utf8(d, uv);
        }
    }
    if (seq_ext)
        Safefree(seq_ext);
    *dp = dstart;
    return d;
}

/* Composes the slen bytes of UTF-8 at s into the buffer *dp.
 *
 * Each starter is combined with following characters through composite_uv()
 * unless the pair is blocked or the result is excluded by isExclusion().
 * Characters that did not combine are kept in a pending list and written
 * after the starter.  With iscontig TRUE, nothing is combined once a
 * character is pending for the current starter.
 *
 * Buffer contract, return value and croak behaviour are the same as for
 * pv_utf8_decompose(). */
static
U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig)
{
    U8* p = s;
    U8* e = s + slen;
    U8* dstart = *dp;
    U8* d = dstart;

    UV uvS = 0; /* code point of the starter */
    bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
    U8 preCC = 0;

    UV  seq_ary[CC_SEQ_SIZE];
    UV* seq_ptr = seq_ary; /* use array at the beginning */
    UV* seq_ext = NULL;    /* extend if need */
    STRLEN seq_max = CC_SEQ_SIZE;
    STRLEN cc_pos = 0;

    while (p < e) {
        U8 curCC;
        STRLEN retlen;
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);
        if (BadRetlen(retlen, p, e)) {
            if (seq_ext)
                Safefree(seq_ext);
            Safefree(dstart);
            croak_bad_retlen(aTHX_ retlen, "compose");
        }
        p += retlen;

        curCC = getCombinClass(uv);

        if (!valid_uvS) {
            if (curCC == 0) {
                uvS = uv; /* the first Starter is found */
                valid_uvS = TRUE;
                if (p < e)
                    continue;
            }
            else {
                /* no starter yet: copy the character through */
                Renew_d_if_not_enough_to(UTF8_MAXLEN);
                d = uvchr_to_utf8(d, uv);
                continue;
            }
        }
        else {
            bool composed;

            /* blocked */
            if ((iscontig && cc_pos) || /* discontiguous combination */
                (curCC != 0 && preCC == curCC) || /* blocked by same CC */
                (preCC > curCC)) /* blocked by higher CC: revised D2 */
                composed = FALSE;

            /* not blocked:
                 iscontig && cc_pos == 0      -- contiguous combination
                 curCC == 0 && preCC == 0     -- starter + starter
                 curCC != 0 && preCC < curCC  -- lower CC */
            else {
                /* try composition */
                UV uvComp = composite_uv(uvS, uv);

                if (uvComp && !isExclusion(uvComp)) {
                    uvS = uvComp;
                    composed = TRUE;

                    /* preCC should not be changed to curCC */
                    /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
                    if (p < e)
                        continue;
                }
                else
                    composed = FALSE;
            }

            if (!composed) {
                preCC = curCC;
                if (curCC != 0 || !(p < e)) {
                    if (seq_max < cc_pos + 1) { /* extend if need */
                        seq_max = cc_pos + CC_SEQ_STEP; /* new size */
                        if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
                            New(0, seq_ext, seq_max, UV);
                            Copy(seq_ary, seq_ext, cc_pos, UV);
                        }
                        else {
                            Renew(seq_ext, seq_max, UV);
                        }
                        seq_ptr = seq_ext; /* use seq_ext from now */
                    }
                    seq_ptr[cc_pos] = uv;
                    ++cc_pos;
                }
                if (curCC != 0 && p < e)
                    continue;
            }
        }

        /* output */
        Renew_d_if_not_enough_to(UTF8_MAXLEN);
        d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */

        if (cc_pos) {
            STRLEN i;

            for (i = 0; i < cc_pos; i++) {
                Renew_d_if_not_enough_to(UTF8_MAXLEN);
                d = uvchr_to_utf8(d, seq_ptr[i]);
            }
            cc_pos = 0;
        }

        uvS = uv;
    }
    if (seq_ext)
        Safefree(seq_ext);
    *dp = dstart;
    return d;
}

MODULE = Unicode::Normalize     PACKAGE = Unicode::Normalize

SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Returns a new UTF-8 string holding pv_utf8_decompose() of src;
       a true compat selects the compatibility table. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat));
    /* dst is created only after the call that may croak */
    dst = newSVpvn((char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL


SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Returns a new UTF-8 string holding pv_utf8_reorder() of src. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);
    /* dst is created only after the call that may croak */
    dst = newSVpvn((char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL


SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Returns a new UTF-8 string holding pv_utf8_compose() of src;
       the composeContiguous alias (ix == 1) passes iscontig = TRUE. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix);
    /* dst is created only after the call that may croak */
    dst = newSVpvn((char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL


SV*
NFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKD = 1
  PREINIT:
    SV *dst;
    U8 *s, *t, *tend, *d, *dend;
    STRLEN slen, tlen, dlen;
  CODE:
    /* Decomposes src (compatibility table when called as NFKD) and then
       reorders the result. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* decompose */
    tlen = slen;
    New(0, t, tlen+1, U8);
    tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
    *tend = '\0';
    tlen = tend - t; /* no longer know real size of t */

    /* reorder */
    dlen = tlen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen);
    *dend = '\0';
    dlen = dend - d; /* no longer know real size of d */

    /* return */
    dst = newSVpvn((char *)d, dlen);
    SvUTF8_on(dst);

    Safefree(t);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL


SV*
NFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKC = 1
    FCC  = 2
  PREINIT:
    SV *dst;
    U8 *s, *t, *tend, *u, *uend, *d, *dend;
    STRLEN slen, tlen, ulen, dlen;
  CODE:
    /* Decomposes src (compatibility table when called as NFKC), reorders
       the result, then composes it (contiguously when called as FCC). */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* decompose */
    tlen = slen;
    New(0, t, tlen+1, U8);
    tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
    *tend = '\0';
    tlen = tend - t; /* no longer know real size of t */

    /* reorder */
    ulen = tlen;
    New(0, u, ulen+1, U8);
    uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen);
    *uend = '\0';
    ulen = uend - u; /* no longer know real size of u */

    /* compose */
    dlen = ulen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2));
    *dend = '\0';
    dlen = dend - d; /* no longer know real size of d */

    /* return */
    dst = newSVpvn((char *)d, dlen);
    SvUTF8_on(dst);

    Safefree(t);
    Safefree(u);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL


SV*
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
  CODE:
    /* Returns false as soon as a character breaks canonical ordering, is a
       Hangul syllable, or has a decomposition (compatibility table when
       called as checkNFKD); returns true otherwise. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);
        if (BadRetlen(retlen, p, e))
            croak_bad_retlen(aTHX_ retlen, "checkNFD or -NFKD");

        curCC = getCombinClass(uv);
        if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
            result = FALSE;
            break;
        }
        if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) {
            result = FALSE;
            break;
        }
        preCC = curCC;
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL


SV*
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    /* Returns false (NO), undef (MAYBE) or true (YES); ix selects the
       additional compatibility test used for checkNFKC. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);
        if (BadRetlen(retlen, p, e))
            croak_bad_retlen(aTHX_ retlen, "checkNFC or -NFKC");

        curCC = getCombinClass(uv);
        if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        /* get NFC/NFKC property */
        if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
            ; /* YES */
        else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
            result = FALSE;
            break;
        }
        else if (isComp2nd(uv))
            isMAYBE = TRUE;
        else if (ix) {
            char *canon, *compat;
            /* NFKC_NO when having compatibility mapping. */
            canon  = (char *) dec_canonical(uv);
            compat = (char *) dec_compat(uv);
            if (compat && !(canon && strEQ(canon, compat))) {
                result = FALSE;
                break;
            }
        } /* end of get NFC/NFKC property */

        preCC = curCC;
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL


SV*
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    /* For each character, compares the combining class of the first
       character of its canonical decomposition with that of the last
       character of the previous one.  When called as checkFCC (ix != 0)
       the NO / MAYBE composition properties are tested as well.
       Returns false (NO), undef (MAYBE) or true (YES). */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;
    preCC = 0;
    for (p = s; p < e; p += retlen) {
        U8 *sCan;
        UV uvLead;
        STRLEN canlen = 0;
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);
        if (BadRetlen(retlen, p, e))
            croak_bad_retlen(aTHX_ retlen, "checkFCD or -FCC");

        sCan = (U8*) dec_canonical(uv);

        if (sCan) {
            STRLEN canret;
            canlen = (STRLEN)strlen((char *) sCan);
            uvLead = utf8_to_uvchr_buf(sCan, sCan + canlen, &canret);
            if (BadRetlen(canret, sCan, sCan + canlen))
                croak_bad_retlen(aTHX_ canret, "checkFCD or -FCC");
        }
        else {
            uvLead = uv;
        }

        curCC = getCombinClass(uvLead);

        if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        if (ix) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                result = FALSE;
                break;
            }
            else if (isComp2nd(uv))
                isMAYBE = TRUE;
        }

        if (sCan) {
            STRLEN canret;
            UV uvTrail;
            U8* eCan = sCan + canlen;
            U8* pCan = utf8_hop(eCan, -1);
            if (pCan < sCan)
                croak(ErrHopBeforeStart);
            uvTrail = utf8_to_uvchr_buf(pCan, eCan, &canret);
            if (BadRetlen(canret, pCan, eCan))
                croak_bad_retlen(aTHX_ canret, "checkFCD or -FCC");
            preCC = getCombinClass(uvTrail);
        }
        else {
            preCC = curCC;
        }
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL

U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $

bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $

bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $

bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $

bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2
  INIT:
    PERL_UNUSED_VAR(ix);

SV*
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  PREINIT:
    bool result = FALSE;
  CODE:
    /* True if uv is a Hangul syllable or has a decomposition; the
       isNFKD_NO alias consults the compatibility table instead of the
       canonical one. */
    if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
        result = TRUE; /* NFD_NO or NFKD_NO */
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL


SV*
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  PREINIT:
    bool result = FALSE;
  CODE:
    /* True if uv is an exclusion, a singleton or a non-starter
       decomposition.  For isNFKC_NO it is also true if uv has a
       compatibility mapping that differs from its canonical one. */
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
        result = TRUE; /* NFC_NO or NFKC_NO */
    else if (ix) {
        char *canon, *compat;
        canon  = (char *) dec_canonical(uv);
        compat = (char *) dec_compat(uv);
        if (compat && (!canon || strNE(canon, compat)))
            result = TRUE; /* NFC_NO or NFKC_NO */
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL

SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    /* Returns composite_uv(uv, uv2), or undef if that is 0.  undef is
       returned through XSRETURN_UNDEF, as getCanon does, so the immortal
       PL_sv_undef is not passed through the mortalised RETVAL. */
    composite = composite_uv(uv, uv2);
    if (!composite)
        XSRETURN_UNDEF;
    RETVAL = newSVuv(composite);
  OUTPUT:
    RETVAL



SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  CODE:
    /* Returns the decomposition of uv as a UTF-8 string: computed for
       Hangul syllables, otherwise taken from the canonical table (or the
       compatibility table for getCompat).  Returns undef if there is no
       table entry. */
    if (Hangul_IsS(uv)) {
        U8 tmp[3 * UTF8_MAXLEN + 1];
        U8 *t = tmp;
        U8 *e = pv_cat_decompHangul(aTHX_ t, uv);
        RETVAL = newSVpvn((char *)t, e - t);
    } else {
        U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);
        if (!rstr)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL


void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *svp;
    STRLEN srclen;
    U8 *s, *e, *p;
  PPCODE:
    /* Pushes two UTF-8 strings: the part of src before its last character
       of combining class 0, and the part from that character to the end.
       If no such character is found, the first string is empty. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;
    p = e;
    while (s < p) {
        UV uv;
        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
        uv = utf8_to_uvchr_buf(p, e, NULL);
        if (getCombinClass(uv) == 0) /* Last Starter found */
            break;
    }

    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);

    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);
