15759b3d2Safresh1
25759b3d2Safresh1 #define PERL_NO_GET_CONTEXT /* we want efficiency */
35759b3d2Safresh1
45759b3d2Safresh1 /* private functions which need pTHX_ and aTHX_
55759b3d2Safresh1 pv_cat_decompHangul
65759b3d2Safresh1 sv_2pvunicode
75759b3d2Safresh1 pv_utf8_decompose
85759b3d2Safresh1 pv_utf8_reorder
95759b3d2Safresh1 pv_utf8_compose
105759b3d2Safresh1 */
115759b3d2Safresh1
125759b3d2Safresh1 #include "EXTERN.h"
135759b3d2Safresh1 #include "perl.h"
145759b3d2Safresh1 #include "XSUB.h"
155759b3d2Safresh1
16*256a93a4Safresh1 #define NEED_utf8_to_uvchr_buf
17*256a93a4Safresh1 #include "ppport.h"
18*256a93a4Safresh1
195759b3d2Safresh1 /* These 5 files are prepared by mkheader */
205759b3d2Safresh1 #include "unfcmb.h"
215759b3d2Safresh1 #include "unfcan.h"
225759b3d2Safresh1 #include "unfcpt.h"
235759b3d2Safresh1 #include "unfcmp.h"
245759b3d2Safresh1 #include "unfexc.h"
255759b3d2Safresh1
265759b3d2Safresh1 /* The generated normalization tables since v5.20 are in native character set
275759b3d2Safresh1 * terms. Prior to that, they were in Unicode terms. So we use 'uvchr' for
285759b3d2Safresh1 * later perls, and redefine that to be 'uvuni' for earlier ones */
29*256a93a4Safresh1 #if PERL_VERSION_LT(5,20,0)
305759b3d2Safresh1 # undef uvchr_to_utf8
315759b3d2Safresh1 # ifdef uvuni_to_utf8
325759b3d2Safresh1 # define uvchr_to_utf8 uvuni_to_utf8
335759b3d2Safresh1 # else /* Perl 5.6.1 */
345759b3d2Safresh1 # define uvchr_to_utf8 uv_to_utf8
355759b3d2Safresh1 # endif
365759b3d2Safresh1 #endif
375759b3d2Safresh1
/*
 * Make sure at least `need` more bytes can be written at d before calling
 * uvchr_to_utf8() or copying raw bytes.
 *
 * The caller must have these variables in scope:
 *   dstart - start of the output buffer
 *   d      - current write position inside that buffer
 *   dlen   - usable size (the buffer itself is kept at dlen+1 bytes)
 * All three may be modified, because Renew() can move the buffer.
 *
 * `need` is evaluated more than once, so it must have no side effects.
 *
 * The expansion is a self-contained { } block: the temporary no longer
 * leaks into the caller's scope, and the macro can follow other statements
 * or be used twice in one block.  Call sites that omit the trailing
 * semicolon keep working.
 */
#define Renew_d_if_not_enough_to(need) { \
        STRLEN curlen = d - dstart; \
        if (dlen < curlen + (need)) { \
            dlen += (need); \
            Renew(dstart, dlen+1, U8); \
            d = dstart + curlen; \
        } \
    }
465759b3d2Safresh1
/* croak message for the case where utf8_to_uvchr_buf() reports a character
 * length of 0 (not expected; presumably only possible on broken input) */
485759b3d2Safresh1 #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"
495759b3d2Safresh1
/* croak message for the case where utf8_hop() steps back past the start of
 * the buffer (possibly caused by broken UTF-8) */
515759b3d2Safresh1 #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
525759b3d2Safresh1
/* Code points above 0x10FFFF are not looked up in any table, so they pass
 * through normalization unchanged and without complaint. */
545759b3d2Safresh1 #define VALID_UTF_MAX (0x10ffff)
555759b3d2Safresh1 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
565759b3d2Safresh1
575759b3d2Safresh1 /* size of array for combining characters */
585759b3d2Safresh1 /* enough as an initial value? */
595759b3d2Safresh1 #define CC_SEQ_SIZE (10)
605759b3d2Safresh1 #define CC_SEQ_STEP (5)
615759b3d2Safresh1
/* HANGUL begin */

/* Leading consonants (L) */
#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount 19

/* Vowels (V) */
#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount 21

/* Trailing consonants (T).  TBase is one below the first real T, so that
 * index 0 can stand for "no trailing consonant". */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount 28

/* Number of syllables sharing one leading consonant: 21 * 28 = 588 */
#define Hangul_NCount (Hangul_VCount * Hangul_TCount)

/* Precomposed syllables (S): 19 * 588 = 11172 code points */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount (Hangul_LCount * Hangul_NCount)

/* Range predicates.  Hangul_IsN() is only meaningful for a value already
 * known to be a syllable; Hangul_IsLV() combines both checks. */
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)  ((Hangul_TBase  < (u)) && ((u) <= Hangul_TFinal))
#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))

/* HANGUL end */
885759b3d2Safresh1
895759b3d2Safresh1 /* this is used for canonical ordering of combining characters (c.c.). */
905759b3d2Safresh1 typedef struct {
915759b3d2Safresh1 U8 cc; /* combining class */
925759b3d2Safresh1 UV uv; /* codepoint */
935759b3d2Safresh1 STRLEN pos; /* position */
945759b3d2Safresh1 } UNF_cc;
955759b3d2Safresh1
compare_cc(const void * a,const void * b)965759b3d2Safresh1 static int compare_cc(const void *a, const void *b)
975759b3d2Safresh1 {
985759b3d2Safresh1 int ret_cc;
995759b3d2Safresh1 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
1005759b3d2Safresh1 if (ret_cc)
1015759b3d2Safresh1 return ret_cc;
1025759b3d2Safresh1
1035759b3d2Safresh1 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
1045759b3d2Safresh1 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
1055759b3d2Safresh1 }
1065759b3d2Safresh1
dec_canonical(UV uv)1075759b3d2Safresh1 static U8* dec_canonical(UV uv)
1085759b3d2Safresh1 {
1095759b3d2Safresh1 U8 ***plane, **row;
1105759b3d2Safresh1 if (OVER_UTF_MAX(uv))
1115759b3d2Safresh1 return NULL;
1125759b3d2Safresh1 plane = (U8***)UNF_canon[uv >> 16];
1135759b3d2Safresh1 if (! plane)
1145759b3d2Safresh1 return NULL;
115*256a93a4Safresh1 row = plane[(U8) (uv >> 8)];
116*256a93a4Safresh1 return row ? row[(U8) uv] : NULL;
1175759b3d2Safresh1 }
1185759b3d2Safresh1
dec_compat(UV uv)1195759b3d2Safresh1 static U8* dec_compat(UV uv)
1205759b3d2Safresh1 {
1215759b3d2Safresh1 U8 ***plane, **row;
1225759b3d2Safresh1 if (OVER_UTF_MAX(uv))
1235759b3d2Safresh1 return NULL;
1245759b3d2Safresh1 plane = (U8***)UNF_compat[uv >> 16];
1255759b3d2Safresh1 if (! plane)
1265759b3d2Safresh1 return NULL;
127*256a93a4Safresh1 row = plane[(U8) (uv >> 8)];
128*256a93a4Safresh1 return row ? row[(U8) uv] : NULL;
1295759b3d2Safresh1 }
1305759b3d2Safresh1
/*
 * Return the primary composite of the pair (uv, uv2), or 0 when the pair
 * does not compose.
 *
 * Hangul L+V and LV+T pairs are composed arithmetically; everything else is
 * looked up in UNF_compos, whose cells are arrays of (nextchar, composite)
 * pairs terminated by an entry with nextchar == 0.
 *
 * A uv2 of 0, or either code point above VALID_UTF_MAX, never composes.
 */
static UV composite_uv(UV uv, UV uv2)
{
    UNF_complist ***plane;
    UNF_complist **row;
    UNF_complist *entry;

    if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
        return 0;

    /* leading consonant + vowel -> LV syllable */
    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
        UV l = uv - Hangul_LBase;
        UV v = uv2 - Hangul_VBase;
        return(Hangul_SBase + (l * Hangul_VCount + v) * Hangul_TCount);
    }

    /* LV syllable + trailing consonant -> LVT syllable */
    if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
        UV t = uv2 - Hangul_TBase;
        return(uv + t);
    }

    plane = UNF_compos[uv >> 16];
    if (! plane)
        return 0;

    row = plane[(U8) (uv >> 8)];
    if (! row)
        return 0;

    entry = row[(U8) uv];
    if (! entry)
        return 0;

    /* scan the candidate list for uv2 */
    while (entry->nextchar) {
        if (entry->nextchar == uv2)
            return entry->composite;
        ++entry;
    }
    return 0;
}
1635759b3d2Safresh1
getCombinClass(UV uv)1645759b3d2Safresh1 static U8 getCombinClass(UV uv)
1655759b3d2Safresh1 {
1665759b3d2Safresh1 U8 **plane, *row;
1675759b3d2Safresh1 if (OVER_UTF_MAX(uv))
1685759b3d2Safresh1 return 0;
1695759b3d2Safresh1 plane = (U8**)UNF_combin[uv >> 16];
1705759b3d2Safresh1 if (! plane)
1715759b3d2Safresh1 return 0;
172*256a93a4Safresh1 row = plane[(U8) (uv >> 8)];
173*256a93a4Safresh1 return row ? row[(U8) uv] : 0;
1745759b3d2Safresh1 }
1755759b3d2Safresh1
pv_cat_decompHangul(pTHX_ U8 * d,UV uv)1765759b3d2Safresh1 static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv)
1775759b3d2Safresh1 {
1785759b3d2Safresh1 UV sindex = uv - Hangul_SBase;
1795759b3d2Safresh1 UV lindex = sindex / Hangul_NCount;
1805759b3d2Safresh1 UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
1815759b3d2Safresh1 UV tindex = sindex % Hangul_TCount;
1825759b3d2Safresh1
1835759b3d2Safresh1 if (! Hangul_IsS(uv))
1845759b3d2Safresh1 return d;
1855759b3d2Safresh1
1865759b3d2Safresh1 d = uvchr_to_utf8(d, (lindex + Hangul_LBase));
1875759b3d2Safresh1 d = uvchr_to_utf8(d, (vindex + Hangul_VBase));
1885759b3d2Safresh1 if (tindex)
1895759b3d2Safresh1 d = uvchr_to_utf8(d, (tindex + Hangul_TBase));
1905759b3d2Safresh1 return d;
1915759b3d2Safresh1 }
1925759b3d2Safresh1
/*
 * Return the string value of sv as UTF-8 and, when lp is non-NULL, store
 * its byte length in *lp.
 *
 * If sv is already flagged as UTF-8 its own buffer is returned.  Otherwise
 * a mortal copy is made and upgraded, so sv itself is not modified and the
 * returned pointer belongs to that mortal copy.
 */
static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp)
{
    STRLEN len;
    char *s = SvPV(sv,len);

    if (!SvUTF8(sv)) {
        SV* upgraded = sv_2mortal(newSVpvn(s, len));

        if (!SvPOK(upgraded))
            s = SvPV_force(upgraded,len);
        sv_utf8_upgrade(upgraded);
        s = SvPV(upgraded,len);
    }

    if (lp)
        *lp = len;
    return s;
}
2095759b3d2Safresh1
2105759b3d2Safresh1 static
pv_utf8_decompose(pTHX_ U8 * s,STRLEN slen,U8 ** dp,STRLEN dlen,bool iscompat)2115759b3d2Safresh1 U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
2125759b3d2Safresh1 {
2135759b3d2Safresh1 U8* p = s;
2145759b3d2Safresh1 U8* e = s + slen;
2155759b3d2Safresh1 U8* dstart = *dp;
2165759b3d2Safresh1 U8* d = dstart;
2175759b3d2Safresh1
2185759b3d2Safresh1 while (p < e) {
2195759b3d2Safresh1 STRLEN retlen;
220*256a93a4Safresh1 UV uv = utf8_to_uvchr_buf(p, e, &retlen);
2215759b3d2Safresh1 if (!retlen)
2225759b3d2Safresh1 croak(ErrRetlenIsZero, "decompose");
2235759b3d2Safresh1 p += retlen;
2245759b3d2Safresh1
2255759b3d2Safresh1 if (Hangul_IsS(uv)) {
2265759b3d2Safresh1 Renew_d_if_not_enough_to(UTF8_MAXLEN * 3)
2275759b3d2Safresh1 d = pv_cat_decompHangul(aTHX_ d, uv);
2285759b3d2Safresh1 }
2295759b3d2Safresh1 else {
2305759b3d2Safresh1 U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
2315759b3d2Safresh1
2325759b3d2Safresh1 if (r) {
2335759b3d2Safresh1 STRLEN len = (STRLEN)strlen((char *)r);
2345759b3d2Safresh1 Renew_d_if_not_enough_to(len)
2355759b3d2Safresh1 while (len--)
2365759b3d2Safresh1 *d++ = *r++;
2375759b3d2Safresh1 }
2385759b3d2Safresh1 else {
2395759b3d2Safresh1 Renew_d_if_not_enough_to(UTF8_MAXLEN)
2405759b3d2Safresh1 d = uvchr_to_utf8(d, uv);
2415759b3d2Safresh1 }
2425759b3d2Safresh1 }
2435759b3d2Safresh1 }
2445759b3d2Safresh1 *dp = dstart;
2455759b3d2Safresh1 return d;
2465759b3d2Safresh1 }
2475759b3d2Safresh1
2485759b3d2Safresh1 static
pv_utf8_reorder(pTHX_ U8 * s,STRLEN slen,U8 ** dp,STRLEN dlen)2495759b3d2Safresh1 U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen)
2505759b3d2Safresh1 {
2515759b3d2Safresh1 U8* p = s;
2525759b3d2Safresh1 U8* e = s + slen;
2535759b3d2Safresh1 U8* dstart = *dp;
2545759b3d2Safresh1 U8* d = dstart;
2555759b3d2Safresh1
2565759b3d2Safresh1 UNF_cc seq_ary[CC_SEQ_SIZE];
2575759b3d2Safresh1 UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
2585759b3d2Safresh1 UNF_cc* seq_ext = NULL; /* extend if need */
2595759b3d2Safresh1 STRLEN seq_max = CC_SEQ_SIZE;
2605759b3d2Safresh1 STRLEN cc_pos = 0;
2615759b3d2Safresh1
2625759b3d2Safresh1 while (p < e) {
2635759b3d2Safresh1 U8 curCC;
2645759b3d2Safresh1 STRLEN retlen;
265*256a93a4Safresh1 UV uv = utf8_to_uvchr_buf(p, e, &retlen);
2665759b3d2Safresh1 if (!retlen)
2675759b3d2Safresh1 croak(ErrRetlenIsZero, "reorder");
2685759b3d2Safresh1 p += retlen;
2695759b3d2Safresh1
2705759b3d2Safresh1 curCC = getCombinClass(uv);
2715759b3d2Safresh1
2725759b3d2Safresh1 if (curCC != 0) {
2735759b3d2Safresh1 if (seq_max < cc_pos + 1) { /* extend if need */
2745759b3d2Safresh1 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
2755759b3d2Safresh1 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
2765759b3d2Safresh1 STRLEN i;
2775759b3d2Safresh1 New(0, seq_ext, seq_max, UNF_cc);
2785759b3d2Safresh1 for (i = 0; i < cc_pos; i++)
2795759b3d2Safresh1 seq_ext[i] = seq_ary[i];
2805759b3d2Safresh1 }
2815759b3d2Safresh1 else {
2825759b3d2Safresh1 Renew(seq_ext, seq_max, UNF_cc);
2835759b3d2Safresh1 }
2845759b3d2Safresh1 seq_ptr = seq_ext; /* use seq_ext from now */
2855759b3d2Safresh1 }
2865759b3d2Safresh1
2875759b3d2Safresh1 seq_ptr[cc_pos].cc = curCC;
2885759b3d2Safresh1 seq_ptr[cc_pos].uv = uv;
2895759b3d2Safresh1 seq_ptr[cc_pos].pos = cc_pos;
2905759b3d2Safresh1 ++cc_pos;
2915759b3d2Safresh1
2925759b3d2Safresh1 if (p < e)
2935759b3d2Safresh1 continue;
2945759b3d2Safresh1 }
2955759b3d2Safresh1
2965759b3d2Safresh1 /* output */
2975759b3d2Safresh1 if (cc_pos) {
2985759b3d2Safresh1 STRLEN i;
2995759b3d2Safresh1
3005759b3d2Safresh1 if (cc_pos > 1) /* reordered if there are two c.c.'s */
3015759b3d2Safresh1 qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
3025759b3d2Safresh1
3035759b3d2Safresh1 for (i = 0; i < cc_pos; i++) {
3045759b3d2Safresh1 Renew_d_if_not_enough_to(UTF8_MAXLEN)
3055759b3d2Safresh1 d = uvchr_to_utf8(d, seq_ptr[i].uv);
3065759b3d2Safresh1 }
3075759b3d2Safresh1 cc_pos = 0;
3085759b3d2Safresh1 }
3095759b3d2Safresh1
3105759b3d2Safresh1 if (curCC == 0) {
3115759b3d2Safresh1 Renew_d_if_not_enough_to(UTF8_MAXLEN)
3125759b3d2Safresh1 d = uvchr_to_utf8(d, uv);
3135759b3d2Safresh1 }
3145759b3d2Safresh1 }
3155759b3d2Safresh1 if (seq_ext)
3165759b3d2Safresh1 Safefree(seq_ext);
3175759b3d2Safresh1 *dp = dstart;
3185759b3d2Safresh1 return d;
3195759b3d2Safresh1 }
3205759b3d2Safresh1
3215759b3d2Safresh1 static
pv_utf8_compose(pTHX_ U8 * s,STRLEN slen,U8 ** dp,STRLEN dlen,bool iscontig)3225759b3d2Safresh1 U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig)
3235759b3d2Safresh1 {
3245759b3d2Safresh1 U8* p = s;
3255759b3d2Safresh1 U8* e = s + slen;
3265759b3d2Safresh1 U8* dstart = *dp;
3275759b3d2Safresh1 U8* d = dstart;
3285759b3d2Safresh1
3295759b3d2Safresh1 UV uvS = 0; /* code point of the starter */
3305759b3d2Safresh1 bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
3315759b3d2Safresh1 U8 preCC = 0;
3325759b3d2Safresh1
3335759b3d2Safresh1 UV seq_ary[CC_SEQ_SIZE];
3345759b3d2Safresh1 UV* seq_ptr = seq_ary; /* use array at the beginning */
3355759b3d2Safresh1 UV* seq_ext = NULL; /* extend if need */
3365759b3d2Safresh1 STRLEN seq_max = CC_SEQ_SIZE;
3375759b3d2Safresh1 STRLEN cc_pos = 0;
3385759b3d2Safresh1
3395759b3d2Safresh1 while (p < e) {
3405759b3d2Safresh1 U8 curCC;
3415759b3d2Safresh1 STRLEN retlen;
342*256a93a4Safresh1 UV uv = utf8_to_uvchr_buf(p, e, &retlen);
3435759b3d2Safresh1 if (!retlen)
3445759b3d2Safresh1 croak(ErrRetlenIsZero, "compose");
3455759b3d2Safresh1 p += retlen;
3465759b3d2Safresh1
3475759b3d2Safresh1 curCC = getCombinClass(uv);
3485759b3d2Safresh1
3495759b3d2Safresh1 if (!valid_uvS) {
3505759b3d2Safresh1 if (curCC == 0) {
3515759b3d2Safresh1 uvS = uv; /* the first Starter is found */
3525759b3d2Safresh1 valid_uvS = TRUE;
3535759b3d2Safresh1 if (p < e)
3545759b3d2Safresh1 continue;
3555759b3d2Safresh1 }
3565759b3d2Safresh1 else {
3575759b3d2Safresh1 Renew_d_if_not_enough_to(UTF8_MAXLEN)
3585759b3d2Safresh1 d = uvchr_to_utf8(d, uv);
3595759b3d2Safresh1 continue;
3605759b3d2Safresh1 }
3615759b3d2Safresh1 }
3625759b3d2Safresh1 else {
3635759b3d2Safresh1 bool composed;
3645759b3d2Safresh1
3655759b3d2Safresh1 /* blocked */
3665759b3d2Safresh1 if ((iscontig && cc_pos) || /* discontiguous combination */
3675759b3d2Safresh1 (curCC != 0 && preCC == curCC) || /* blocked by same CC */
3685759b3d2Safresh1 (preCC > curCC)) /* blocked by higher CC: revised D2 */
3695759b3d2Safresh1 composed = FALSE;
3705759b3d2Safresh1
3715759b3d2Safresh1 /* not blocked:
3725759b3d2Safresh1 iscontig && cc_pos == 0 -- contiguous combination
3735759b3d2Safresh1 curCC == 0 && preCC == 0 -- starter + starter
3745759b3d2Safresh1 curCC != 0 && preCC < curCC -- lower CC */
3755759b3d2Safresh1 else {
3765759b3d2Safresh1 /* try composition */
3775759b3d2Safresh1 UV uvComp = composite_uv(uvS, uv);
3785759b3d2Safresh1
3795759b3d2Safresh1 if (uvComp && !isExclusion(uvComp)) {
3805759b3d2Safresh1 uvS = uvComp;
3815759b3d2Safresh1 composed = TRUE;
3825759b3d2Safresh1
3835759b3d2Safresh1 /* preCC should not be changed to curCC */
3845759b3d2Safresh1 /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
3855759b3d2Safresh1 if (p < e)
3865759b3d2Safresh1 continue;
3875759b3d2Safresh1 }
3885759b3d2Safresh1 else
3895759b3d2Safresh1 composed = FALSE;
3905759b3d2Safresh1 }
3915759b3d2Safresh1
3925759b3d2Safresh1 if (!composed) {
3935759b3d2Safresh1 preCC = curCC;
3945759b3d2Safresh1 if (curCC != 0 || !(p < e)) {
3955759b3d2Safresh1 if (seq_max < cc_pos + 1) { /* extend if need */
3965759b3d2Safresh1 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
3975759b3d2Safresh1 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
3985759b3d2Safresh1 New(0, seq_ext, seq_max, UV);
3995759b3d2Safresh1 Copy(seq_ary, seq_ext, cc_pos, UV);
4005759b3d2Safresh1 }
4015759b3d2Safresh1 else {
4025759b3d2Safresh1 Renew(seq_ext, seq_max, UV);
4035759b3d2Safresh1 }
4045759b3d2Safresh1 seq_ptr = seq_ext; /* use seq_ext from now */
4055759b3d2Safresh1 }
4065759b3d2Safresh1 seq_ptr[cc_pos] = uv;
4075759b3d2Safresh1 ++cc_pos;
4085759b3d2Safresh1 }
4095759b3d2Safresh1 if (curCC != 0 && p < e)
4105759b3d2Safresh1 continue;
4115759b3d2Safresh1 }
4125759b3d2Safresh1 }
4135759b3d2Safresh1
4145759b3d2Safresh1 /* output */
4155759b3d2Safresh1 {
4165759b3d2Safresh1 Renew_d_if_not_enough_to(UTF8_MAXLEN)
4175759b3d2Safresh1 d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */
4185759b3d2Safresh1 }
4195759b3d2Safresh1
4205759b3d2Safresh1 if (cc_pos) {
4215759b3d2Safresh1 STRLEN i;
4225759b3d2Safresh1
4235759b3d2Safresh1 for (i = 0; i < cc_pos; i++) {
4245759b3d2Safresh1 Renew_d_if_not_enough_to(UTF8_MAXLEN)
4255759b3d2Safresh1 d = uvchr_to_utf8(d, seq_ptr[i]);
4265759b3d2Safresh1 }
4275759b3d2Safresh1 cc_pos = 0;
4285759b3d2Safresh1 }
4295759b3d2Safresh1
4305759b3d2Safresh1 uvS = uv;
4315759b3d2Safresh1 }
4325759b3d2Safresh1 if (seq_ext)
4335759b3d2Safresh1 Safefree(seq_ext);
4345759b3d2Safresh1 *dp = dstart;
4355759b3d2Safresh1 return d;
4365759b3d2Safresh1 }
4375759b3d2Safresh1
4385759b3d2Safresh1 MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
4395759b3d2Safresh1
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Return the decomposition of src as a new UTF-8 string: compatibility
       decomposition when compat is true, canonical otherwise.  No
       reordering is done here. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat));
    /* The result SV is created only after the conversion, which may croak;
       creating it earlier would leave an unowned SV behind on that path. */
    RETVAL = newSVpvn((char *)d, dend - d);
    SvUTF8_on(RETVAL);
    Safefree(d);
  OUTPUT:
    RETVAL
4615759b3d2Safresh1
4625759b3d2Safresh1
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Return a new UTF-8 string holding src with its combining characters
       put into canonical order. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);
    /* The result SV is created only after the conversion, which may croak;
       creating it earlier would leave an unowned SV behind on that path. */
    RETVAL = newSVpvn((char *)d, dend - d);
    SvUTF8_on(RETVAL);
    Safefree(d);
  OUTPUT:
    RETVAL
4835759b3d2Safresh1
4845759b3d2Safresh1
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Return the composition of src as a new UTF-8 string.  Under the
       composeContiguous alias (ix == 1) only contiguous characters are
       composed. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix);
    /* The result SV is created only after the conversion, which may croak;
       creating it earlier would leave an unowned SV behind on that path. */
    RETVAL = newSVpvn((char *)d, dend - d);
    SvUTF8_on(RETVAL);
    Safefree(d);
  OUTPUT:
    RETVAL
5075759b3d2Safresh1
5085759b3d2Safresh1
SV*
NFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKD = 1
  PREINIT:
    U8 *s, *dec, *dec_end, *ord, *ord_end;
    STRLEN slen, dec_len, ord_len;
  CODE:
    /* NFD (ix == 0) or NFKD (ix == 1): decompose, then reorder. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* step 1: decompose (compatibility mappings only for NFKD) */
    dec_len = slen;
    New(0, dec, dec_len+1, U8);
    dec_end = pv_utf8_decompose(aTHX_ s, slen, &dec, dec_len, (bool)(ix==1));
    *dec_end = '\0';
    dec_len = dec_end - dec;    /* now the used length, not the capacity */

    /* step 2: canonical reordering */
    ord_len = dec_len;
    New(0, ord, ord_len+1, U8);
    ord_end = pv_utf8_reorder(aTHX_ dec, dec_len, &ord, ord_len);
    *ord_end = '\0';
    ord_len = ord_end - ord;    /* now the used length, not the capacity */

    /* hand the result back as a UTF-8 flagged string */
    RETVAL = newSVpvn((char *)ord, ord_len);
    SvUTF8_on(RETVAL);

    Safefree(dec);
    Safefree(ord);
  OUTPUT:
    RETVAL
5465759b3d2Safresh1
5475759b3d2Safresh1
SV*
NFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKC = 1
    FCC = 2
  PREINIT:
    U8 *s, *dec, *dec_end, *ord, *ord_end, *cmp, *cmp_end;
    STRLEN slen, dec_len, ord_len, cmp_len;
  CODE:
    /* NFC (ix == 0), NFKC (ix == 1) or FCC (ix == 2):
       decompose, reorder, then compose. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* step 1: decompose (compatibility mappings only for NFKC) */
    dec_len = slen;
    New(0, dec, dec_len+1, U8);
    dec_end = pv_utf8_decompose(aTHX_ s, slen, &dec, dec_len, (bool)(ix==1));
    *dec_end = '\0';
    dec_len = dec_end - dec;    /* now the used length, not the capacity */

    /* step 2: canonical reordering */
    ord_len = dec_len;
    New(0, ord, ord_len+1, U8);
    ord_end = pv_utf8_reorder(aTHX_ dec, dec_len, &ord, ord_len);
    *ord_end = '\0';
    ord_len = ord_end - ord;    /* now the used length, not the capacity */

    /* step 3: compose (contiguous-only for FCC) */
    cmp_len = ord_len;
    New(0, cmp, cmp_len+1, U8);
    cmp_end = pv_utf8_compose(aTHX_ ord, ord_len, &cmp, cmp_len, (bool)(ix==2));
    *cmp_end = '\0';
    cmp_len = cmp_end - cmp;    /* now the used length, not the capacity */

    /* hand the result back as a UTF-8 flagged string */
    RETVAL = newSVpvn((char *)cmp, cmp_len);
    SvUTF8_on(RETVAL);

    Safefree(dec);
    Safefree(ord);
    Safefree(cmp);
  OUTPUT:
    RETVAL
5945759b3d2Safresh1
5955759b3d2Safresh1
SV*
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
  CODE:
    /* Quick check for NFD (ix == 0) or NFKD (ix == 1).  Returns false as
       soon as a character is out of canonical order, is a Hangul syllable,
       or has a decomposition of the requested kind; true otherwise. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    p = s;
    preCC = 0;
    while (p < e) {
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFD or -NFKD");

        curCC = getCombinClass(uv);
        if ((curCC != 0 && curCC < preCC)    /* canonical ordering violated */
            || Hangul_IsS(uv)                /* syllables always decompose */
            || (ix ? dec_compat(uv) : dec_canonical(uv)))
        {
            result = FALSE;
            break;
        }

        preCC = curCC;
        p += retlen;
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
6305759b3d2Safresh1
6315759b3d2Safresh1
SV*
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    /* Quick check for NFC (ix == 0) or NFKC (ix == 1).
       Returns false on a definite NO, undef when only MAYBE characters were
       seen, and true otherwise. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFC or -NFKC");

        curCC = getCombinClass(uv);
        if (curCC != 0 && curCC < preCC) {  /* canonical ordering violated */
            result = FALSE;
            break;
        }

        /* Hangul syllables are canonical composites (YES), so only other
           characters need their NFC/NFKC property looked up. */
        if (!Hangul_IsS(uv)) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                result = FALSE;
                break;
            }

            if (isComp2nd(uv)) {
                isMAYBE = TRUE;
            }
            else if (ix) {
                /* NFKC_NO when there is a compatibility mapping that is
                   not simply the canonical one. */
                char *canon  = (char *) dec_canonical(uv);
                char *compat = (char *) dec_compat(uv);

                if (compat && (!canon || strNE(canon, compat))) {
                    result = FALSE;
                    break;
                }
            }
        }

        preCC = curCC;
    }

    if (isMAYBE && result)  /* NO takes precedence over MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
6865759b3d2Safresh1
6875759b3d2Safresh1
SV*
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    /* Quick check for FCD (ix == 0) or FCC (ix == 1).
       For every character the combining class of the first character of
       its canonical decomposition is compared with the class of the last
       character of the previous one.  Returns false on a violation, undef
       when only MAYBE characters were seen (FCC only), true otherwise. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
        U8 *sCan;
        U8 *eCan = NULL;    /* end of sCan, set only when sCan is non-NULL */
        UV uvLead;
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkFCD or -FCC");

        /* leading character: first of the decomposition, or uv itself */
        sCan = (U8*) dec_canonical(uv);
        uvLead = uv;
        if (sCan) {
            STRLEN canret;
            eCan = sCan + (STRLEN)strlen((char *) sCan);
            uvLead = utf8_to_uvchr_buf(sCan, eCan, &canret);
            if (!canret)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
        }

        curCC = getCombinClass(uvLead);

        if (curCC != 0 && curCC < preCC) {  /* canonical ordering violated */
            result = FALSE;
            break;
        }

        /* FCC additionally applies the composition properties */
        if (ix) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                result = FALSE;
                break;
            }
            if (isComp2nd(uv))
                isMAYBE = TRUE;
        }

        /* trailing character: last of the decomposition, or uv itself */
        if (sCan) {
            STRLEN canret;
            UV uvTrail;
            U8* pCan = utf8_hop(eCan, -1);
            if (pCan < sCan)
                croak(ErrHopBeforeStart);
            uvTrail = utf8_to_uvchr_buf(pCan, eCan, &canret);
            if (!canret)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
            preCC = getCombinClass(uvTrail);
        }
        else {
            preCC = curCC;
        }
    }

    if (isMAYBE && result)  /* NO takes precedence over MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
7615759b3d2Safresh1
7625759b3d2Safresh1
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $
    # Thin wrapper: exposes the C function of the same name to Perl.

bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $
    # Thin wrapper around the C function/macro of the same name.

bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $
    # Thin wrapper around the C function/macro of the same name.

bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $
    # Thin wrapper around the C function/macro of the same name.

bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2
  INIT:
    /* all three names behave the same, so the alias index is not needed */
    PERL_UNUSED_VAR(ix);
7925759b3d2Safresh1
SV*
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  PREINIT:
    bool result;
  CODE:
    /* True when uv is a Hangul syllable or has a decomposition:
       canonical for isNFD_NO (ix == 0), compatibility for isNFKD_NO. */
    if (Hangul_IsS(uv))
        result = TRUE;
    else if (ix ? dec_compat(uv) : dec_canonical(uv))
        result = TRUE;
    else
        result = FALSE;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
8075759b3d2Safresh1
8085759b3d2Safresh1
SV*
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  PREINIT:
    bool result = FALSE;
  CODE:
    /* True when uv is a composition exclusion, a singleton or a
       non-starter decomposition.  For isNFKC_NO (ix == 1) it is also true
       when uv has a compatibility mapping that differs from its canonical
       one (or has no canonical one at all). */
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
        result = TRUE;
    }
    else if (ix) {
        char *canon  = (char *) dec_canonical(uv);
        char *compat = (char *) dec_compat(uv);

        if (compat) {
            if (!canon || strNE(canon, compat))
                result = TRUE;
        }
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
8315759b3d2Safresh1
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    /* Return the composite of the pair as an unsigned integer, or undef
       when composite_uv() reports that the pair does not compose. */
    composite = composite_uv(uv, uv2);
    if (composite)
        RETVAL = newSVuv(composite);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
8445759b3d2Safresh1
8455759b3d2Safresh1
8465759b3d2Safresh1
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  CODE:
    /* Return the decomposition of uv as a UTF-8 string: canonical for
       getCanon (ix == 0), compatibility for getCompat (ix == 1).
       Returns undef when uv has no decomposition. */
    if (!Hangul_IsS(uv)) {
        U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);
        if (!rstr)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    }
    else {
        /* a syllable decomposes into at most three jamo */
        U8 tmp[3 * UTF8_MAXLEN + 1];
        U8 *tend = pv_cat_decompHangul(aTHX_ tmp, uv);
        RETVAL = newSVpvn((char *)tmp, tend - tmp);
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
8685759b3d2Safresh1
8695759b3d2Safresh1
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *head, *tail;
    STRLEN srclen;
    U8 *s, *e, *p;
  PPCODE:
    /* Split src in front of its last character with combining class 0 and
       push two UTF-8 strings: the part before it, and the part from it to
       the end.  If no such character exists the first part is empty. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    /* walk backwards one character at a time */
    for (p = e; s < p; ) {
        UV uv;
        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
        uv = utf8_to_uvchr_buf(p, e, NULL);
        if (getCombinClass(uv) == 0)    /* last starter found */
            break;
    }

    head = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(head);
    XPUSHs(head);

    tail = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(tail);
    XPUSHs(tail);
899