/* xref: /openbsd-src/gnu/usr.bin/perl/dist/Unicode-Normalize/Normalize.xs (revision 256a93a44f36679bee503f12e49566c2183f6181) */
15759b3d2Safresh1 
25759b3d2Safresh1 #define PERL_NO_GET_CONTEXT /* we want efficiency */
35759b3d2Safresh1 
/* Private functions that need pTHX_ and aTHX_:
    pv_cat_decompHangul
    sv_2pvunicode
    pv_utf8_decompose
    pv_utf8_reorder
    pv_utf8_compose
*/
115759b3d2Safresh1 
125759b3d2Safresh1 #include "EXTERN.h"
135759b3d2Safresh1 #include "perl.h"
145759b3d2Safresh1 #include "XSUB.h"
155759b3d2Safresh1 
16*256a93a4Safresh1 #define NEED_utf8_to_uvchr_buf
17*256a93a4Safresh1 #include "ppport.h"
18*256a93a4Safresh1 
195759b3d2Safresh1 /* These 5 files are prepared by mkheader */
205759b3d2Safresh1 #include "unfcmb.h"
215759b3d2Safresh1 #include "unfcan.h"
225759b3d2Safresh1 #include "unfcpt.h"
235759b3d2Safresh1 #include "unfcmp.h"
245759b3d2Safresh1 #include "unfexc.h"
255759b3d2Safresh1 
265759b3d2Safresh1 /* The generated normalization tables since v5.20 are in native character set
275759b3d2Safresh1  * terms.  Prior to that, they were in Unicode terms.  So we use 'uvchr' for
285759b3d2Safresh1  * later perls, and redefine that to be 'uvuni' for earlier ones */
29*256a93a4Safresh1 #if PERL_VERSION_LT(5,20,0)
305759b3d2Safresh1 #   undef uvchr_to_utf8
315759b3d2Safresh1 #   ifdef uvuni_to_utf8
325759b3d2Safresh1 #       define uvchr_to_utf8   uvuni_to_utf8
335759b3d2Safresh1 #   else /* Perl 5.6.1 */
345759b3d2Safresh1 #       define uvchr_to_utf8   uv_to_utf8
355759b3d2Safresh1 #   endif
365759b3d2Safresh1 #endif
375759b3d2Safresh1 
/* Make sure the output buffer has room for `need` more bytes before a write
 * such as uvchr_to_utf8().
 *
 * The caller must have these three variables in scope; all may be updated:
 *   dstart - start of the output buffer (may be reallocated)
 *   d      - current write position inside that buffer
 *   dlen   - current capacity (the buffer is always allocated as dlen+1)
 *
 * `need` is evaluated more than once, so it must be free of side effects.
 *
 * The expansion is a self-contained { } block: it takes no trailing
 * semicolon, may follow other statements, and may be used more than once in
 * the same scope without redeclaring its temporary. */
#define Renew_d_if_not_enough_to(need)	{	\
		STRLEN curlen = d - dstart;	\
		if (dlen < curlen + (need)) {	\
		    dlen += (need);		\
		    Renew(dstart, dlen+1, U8);	\
		    d = dstart + curlen;	\
		}				\
	    }
465759b3d2Safresh1 
/* croak() format used when utf8_to_uvchr_buf() reports a character length of
 * zero; the %s names the operation that was running. */
#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"

/* croak() message used when utf8_hop() lands before the start of the string
 * (possibly broken UTF-8). */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"

/* Code points above 0x10ffff are never looked up in the tables; the lookup
 * helpers return "no data" for them without complaint. */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) ((uv) > VALID_UTF_MAX)

/* Buffer of pending combining characters: initial (on-stack) capacity, and
 * the growth step used once that capacity is exhausted. */
#define CC_SEQ_SIZE (10)
#define CC_SEQ_STEP  (5)
615759b3d2Safresh1 
/* ---- Hangul: constants and range predicates ---- */

/* Precomposed syllables: SBase .. SFinal inclusive, SCount code points. */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount  11172

/* Syllables per leading consonant (VCount * TCount). */
#define Hangul_NCount    588

/* Leading consonants (L). */
#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount     19

/* Vowels (V). */
#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount     21

/* Trailing consonants (T).  Index 0 (TBase itself) stands for "no trailing
 * consonant", which is why Hangul_IsT() excludes TBase. */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount     28

/* Each argument is evaluated more than once: pass side-effect-free values. */
#define Hangul_IsS(u)  (((u) >= Hangul_SBase) && ((u) <= Hangul_SFinal))
#define Hangul_IsN(u)  ((((u) - Hangul_SBase) % Hangul_TCount) == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  (((u) >= Hangul_LBase) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  (((u) >= Hangul_VBase) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)  (((u) >  Hangul_TBase) && ((u) <= Hangul_TFinal))

/* ---- end of Hangul ---- */
885759b3d2Safresh1 
895759b3d2Safresh1 /* this is used for canonical ordering of combining characters (c.c.). */
905759b3d2Safresh1 typedef struct {
915759b3d2Safresh1     U8 cc;	/* combining class */
925759b3d2Safresh1     UV uv;	/* codepoint */
935759b3d2Safresh1     STRLEN pos; /* position */
945759b3d2Safresh1 } UNF_cc;
955759b3d2Safresh1 
compare_cc(const void * a,const void * b)965759b3d2Safresh1 static int compare_cc(const void *a, const void *b)
975759b3d2Safresh1 {
985759b3d2Safresh1     int ret_cc;
995759b3d2Safresh1     ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
1005759b3d2Safresh1     if (ret_cc)
1015759b3d2Safresh1 	return ret_cc;
1025759b3d2Safresh1 
1035759b3d2Safresh1     return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
1045759b3d2Safresh1 	 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
1055759b3d2Safresh1 }
1065759b3d2Safresh1 
dec_canonical(UV uv)1075759b3d2Safresh1 static U8* dec_canonical(UV uv)
1085759b3d2Safresh1 {
1095759b3d2Safresh1     U8 ***plane, **row;
1105759b3d2Safresh1     if (OVER_UTF_MAX(uv))
1115759b3d2Safresh1 	return NULL;
1125759b3d2Safresh1     plane = (U8***)UNF_canon[uv >> 16];
1135759b3d2Safresh1     if (! plane)
1145759b3d2Safresh1 	return NULL;
115*256a93a4Safresh1     row = plane[(U8) (uv >> 8)];
116*256a93a4Safresh1     return row ? row[(U8) uv] : NULL;
1175759b3d2Safresh1 }
1185759b3d2Safresh1 
dec_compat(UV uv)1195759b3d2Safresh1 static U8* dec_compat(UV uv)
1205759b3d2Safresh1 {
1215759b3d2Safresh1     U8 ***plane, **row;
1225759b3d2Safresh1     if (OVER_UTF_MAX(uv))
1235759b3d2Safresh1 	return NULL;
1245759b3d2Safresh1     plane = (U8***)UNF_compat[uv >> 16];
1255759b3d2Safresh1     if (! plane)
1265759b3d2Safresh1 	return NULL;
127*256a93a4Safresh1     row = plane[(U8) (uv >> 8)];
128*256a93a4Safresh1     return row ? row[(U8) uv] : NULL;
1295759b3d2Safresh1 }
1305759b3d2Safresh1 
/* Return the code point that the pair (uv, uv2) composes to, or 0 if the
 * pair does not compose.
 *
 * Hangul L+V and LV+T pairs are composed arithmetically.  Everything else is
 * looked up in UNF_compos, a three-level table keyed by the first code point
 * whose cells are lists of {nextchar, composite} terminated by nextchar == 0.
 * A zero uv2, or either code point above VALID_UTF_MAX, never composes. */
static UV composite_uv(UV uv, UV uv2)
{
    UNF_complist ***plane;
    UNF_complist **row;
    UNF_complist *entry;

    if (uv2 == 0 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
        return 0;

    /* Hangul: leading consonant + vowel -> LV syllable */
    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
        const UV lindex = uv  - Hangul_LBase;
        const UV vindex = uv2 - Hangul_VBase;
        return Hangul_SBase
             + (lindex * Hangul_VCount + vindex) * Hangul_TCount;
    }

    /* Hangul: LV syllable + trailing consonant -> LVT syllable */
    if (Hangul_IsLV(uv) && Hangul_IsT(uv2))
        return uv + (uv2 - Hangul_TBase);

    plane = UNF_compos[uv >> 16];
    if (plane == NULL)
        return 0;

    row = plane[(U8) (uv >> 8)];
    if (row == NULL)
        return 0;

    entry = row[(U8) uv];
    if (entry == NULL)
        return 0;

    /* scan the zero-terminated list of second characters */
    while (entry->nextchar) {
        if (entry->nextchar == uv2)
            return entry->composite;
        ++entry;
    }
    return 0;
}
1635759b3d2Safresh1 
getCombinClass(UV uv)1645759b3d2Safresh1 static U8 getCombinClass(UV uv)
1655759b3d2Safresh1 {
1665759b3d2Safresh1     U8 **plane, *row;
1675759b3d2Safresh1     if (OVER_UTF_MAX(uv))
1685759b3d2Safresh1 	return 0;
1695759b3d2Safresh1     plane = (U8**)UNF_combin[uv >> 16];
1705759b3d2Safresh1     if (! plane)
1715759b3d2Safresh1 	return 0;
172*256a93a4Safresh1     row = plane[(U8) (uv >> 8)];
173*256a93a4Safresh1     return row ? row[(U8) uv] : 0;
1745759b3d2Safresh1 }
1755759b3d2Safresh1 
pv_cat_decompHangul(pTHX_ U8 * d,UV uv)1765759b3d2Safresh1 static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv)
1775759b3d2Safresh1 {
1785759b3d2Safresh1     UV sindex =  uv - Hangul_SBase;
1795759b3d2Safresh1     UV lindex =  sindex / Hangul_NCount;
1805759b3d2Safresh1     UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
1815759b3d2Safresh1     UV tindex =  sindex % Hangul_TCount;
1825759b3d2Safresh1 
1835759b3d2Safresh1     if (! Hangul_IsS(uv))
1845759b3d2Safresh1 	return d;
1855759b3d2Safresh1 
1865759b3d2Safresh1     d = uvchr_to_utf8(d, (lindex + Hangul_LBase));
1875759b3d2Safresh1     d = uvchr_to_utf8(d, (vindex + Hangul_VBase));
1885759b3d2Safresh1     if (tindex)
1895759b3d2Safresh1 	d = uvchr_to_utf8(d, (tindex + Hangul_TBase));
1905759b3d2Safresh1     return d;
1915759b3d2Safresh1 }
1925759b3d2Safresh1 
/* Return the string value of sv as UTF-8 bytes and, if lp is non-NULL, store
 * its byte length in *lp.
 * When sv is not flagged as UTF-8, the bytes are copied into a mortal
 * temporary SV which is upgraded instead, so sv itself is not upgraded; the
 * returned pointer then belongs to that temporary. */
static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp)
{
    STRLEN pvlen;
    char *pv = SvPV(sv, pvlen);

    if (!SvUTF8(sv)) {
        SV* upgraded = sv_2mortal(newSVpvn(pv, pvlen));

        if (!SvPOK(upgraded))
            pv = SvPV_force(upgraded, pvlen);
        sv_utf8_upgrade(upgraded);
        pv = SvPV(upgraded, pvlen);
    }

    if (lp != NULL)
        *lp = pvlen;
    return pv;
}
2095759b3d2Safresh1 
2105759b3d2Safresh1 static
pv_utf8_decompose(pTHX_ U8 * s,STRLEN slen,U8 ** dp,STRLEN dlen,bool iscompat)2115759b3d2Safresh1 U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
2125759b3d2Safresh1 {
2135759b3d2Safresh1     U8* p = s;
2145759b3d2Safresh1     U8* e = s + slen;
2155759b3d2Safresh1     U8* dstart = *dp;
2165759b3d2Safresh1     U8* d = dstart;
2175759b3d2Safresh1 
2185759b3d2Safresh1     while (p < e) {
2195759b3d2Safresh1 	STRLEN retlen;
220*256a93a4Safresh1 	UV uv = utf8_to_uvchr_buf(p, e, &retlen);
2215759b3d2Safresh1 	if (!retlen)
2225759b3d2Safresh1 	    croak(ErrRetlenIsZero, "decompose");
2235759b3d2Safresh1 	p += retlen;
2245759b3d2Safresh1 
2255759b3d2Safresh1 	if (Hangul_IsS(uv)) {
2265759b3d2Safresh1 	    Renew_d_if_not_enough_to(UTF8_MAXLEN * 3)
2275759b3d2Safresh1 	    d = pv_cat_decompHangul(aTHX_ d, uv);
2285759b3d2Safresh1 	}
2295759b3d2Safresh1 	else {
2305759b3d2Safresh1 	    U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
2315759b3d2Safresh1 
2325759b3d2Safresh1 	    if (r) {
2335759b3d2Safresh1 		STRLEN len = (STRLEN)strlen((char *)r);
2345759b3d2Safresh1 		Renew_d_if_not_enough_to(len)
2355759b3d2Safresh1 		while (len--)
2365759b3d2Safresh1 		    *d++ = *r++;
2375759b3d2Safresh1 	    }
2385759b3d2Safresh1 	    else {
2395759b3d2Safresh1 		Renew_d_if_not_enough_to(UTF8_MAXLEN)
2405759b3d2Safresh1 		d = uvchr_to_utf8(d, uv);
2415759b3d2Safresh1 	    }
2425759b3d2Safresh1 	}
2435759b3d2Safresh1     }
2445759b3d2Safresh1     *dp = dstart;
2455759b3d2Safresh1     return d;
2465759b3d2Safresh1 }
2475759b3d2Safresh1 
2485759b3d2Safresh1 static
pv_utf8_reorder(pTHX_ U8 * s,STRLEN slen,U8 ** dp,STRLEN dlen)2495759b3d2Safresh1 U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen)
2505759b3d2Safresh1 {
2515759b3d2Safresh1     U8* p = s;
2525759b3d2Safresh1     U8* e = s + slen;
2535759b3d2Safresh1     U8* dstart = *dp;
2545759b3d2Safresh1     U8* d = dstart;
2555759b3d2Safresh1 
2565759b3d2Safresh1     UNF_cc  seq_ary[CC_SEQ_SIZE];
2575759b3d2Safresh1     UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
2585759b3d2Safresh1     UNF_cc* seq_ext = NULL; /* extend if need */
2595759b3d2Safresh1     STRLEN seq_max = CC_SEQ_SIZE;
2605759b3d2Safresh1     STRLEN cc_pos = 0;
2615759b3d2Safresh1 
2625759b3d2Safresh1     while (p < e) {
2635759b3d2Safresh1 	U8 curCC;
2645759b3d2Safresh1 	STRLEN retlen;
265*256a93a4Safresh1 	UV uv = utf8_to_uvchr_buf(p, e, &retlen);
2665759b3d2Safresh1 	if (!retlen)
2675759b3d2Safresh1 	    croak(ErrRetlenIsZero, "reorder");
2685759b3d2Safresh1 	p += retlen;
2695759b3d2Safresh1 
2705759b3d2Safresh1 	curCC = getCombinClass(uv);
2715759b3d2Safresh1 
2725759b3d2Safresh1 	if (curCC != 0) {
2735759b3d2Safresh1 	    if (seq_max < cc_pos + 1) { /* extend if need */
2745759b3d2Safresh1 		seq_max = cc_pos + CC_SEQ_STEP; /* new size */
2755759b3d2Safresh1 		if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
2765759b3d2Safresh1 		    STRLEN i;
2775759b3d2Safresh1 		    New(0, seq_ext, seq_max, UNF_cc);
2785759b3d2Safresh1 		    for (i = 0; i < cc_pos; i++)
2795759b3d2Safresh1 			seq_ext[i] = seq_ary[i];
2805759b3d2Safresh1 		}
2815759b3d2Safresh1 		else {
2825759b3d2Safresh1 		    Renew(seq_ext, seq_max, UNF_cc);
2835759b3d2Safresh1 		}
2845759b3d2Safresh1 		seq_ptr = seq_ext; /* use seq_ext from now */
2855759b3d2Safresh1 	    }
2865759b3d2Safresh1 
2875759b3d2Safresh1 	    seq_ptr[cc_pos].cc  = curCC;
2885759b3d2Safresh1 	    seq_ptr[cc_pos].uv  = uv;
2895759b3d2Safresh1 	    seq_ptr[cc_pos].pos = cc_pos;
2905759b3d2Safresh1 	    ++cc_pos;
2915759b3d2Safresh1 
2925759b3d2Safresh1 	    if (p < e)
2935759b3d2Safresh1 		continue;
2945759b3d2Safresh1 	}
2955759b3d2Safresh1 
2965759b3d2Safresh1 	/* output */
2975759b3d2Safresh1 	if (cc_pos) {
2985759b3d2Safresh1 	    STRLEN i;
2995759b3d2Safresh1 
3005759b3d2Safresh1 	    if (cc_pos > 1) /* reordered if there are two c.c.'s */
3015759b3d2Safresh1 		qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
3025759b3d2Safresh1 
3035759b3d2Safresh1 	    for (i = 0; i < cc_pos; i++) {
3045759b3d2Safresh1 		Renew_d_if_not_enough_to(UTF8_MAXLEN)
3055759b3d2Safresh1 		d = uvchr_to_utf8(d, seq_ptr[i].uv);
3065759b3d2Safresh1 	    }
3075759b3d2Safresh1 	    cc_pos = 0;
3085759b3d2Safresh1 	}
3095759b3d2Safresh1 
3105759b3d2Safresh1 	if (curCC == 0) {
3115759b3d2Safresh1 	    Renew_d_if_not_enough_to(UTF8_MAXLEN)
3125759b3d2Safresh1 	    d = uvchr_to_utf8(d, uv);
3135759b3d2Safresh1 	}
3145759b3d2Safresh1     }
3155759b3d2Safresh1     if (seq_ext)
3165759b3d2Safresh1 	Safefree(seq_ext);
3175759b3d2Safresh1     *dp = dstart;
3185759b3d2Safresh1     return d;
3195759b3d2Safresh1 }
3205759b3d2Safresh1 
3215759b3d2Safresh1 static
pv_utf8_compose(pTHX_ U8 * s,STRLEN slen,U8 ** dp,STRLEN dlen,bool iscontig)3225759b3d2Safresh1 U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig)
3235759b3d2Safresh1 {
3245759b3d2Safresh1     U8* p = s;
3255759b3d2Safresh1     U8* e = s + slen;
3265759b3d2Safresh1     U8* dstart = *dp;
3275759b3d2Safresh1     U8* d = dstart;
3285759b3d2Safresh1 
3295759b3d2Safresh1     UV uvS = 0; /* code point of the starter */
3305759b3d2Safresh1     bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
3315759b3d2Safresh1     U8 preCC = 0;
3325759b3d2Safresh1 
3335759b3d2Safresh1     UV  seq_ary[CC_SEQ_SIZE];
3345759b3d2Safresh1     UV* seq_ptr = seq_ary; /* use array at the beginning */
3355759b3d2Safresh1     UV* seq_ext = NULL; /* extend if need */
3365759b3d2Safresh1     STRLEN seq_max = CC_SEQ_SIZE;
3375759b3d2Safresh1     STRLEN cc_pos = 0;
3385759b3d2Safresh1 
3395759b3d2Safresh1     while (p < e) {
3405759b3d2Safresh1 	U8 curCC;
3415759b3d2Safresh1 	STRLEN retlen;
342*256a93a4Safresh1 	UV uv = utf8_to_uvchr_buf(p, e, &retlen);
3435759b3d2Safresh1 	if (!retlen)
3445759b3d2Safresh1 	    croak(ErrRetlenIsZero, "compose");
3455759b3d2Safresh1 	p += retlen;
3465759b3d2Safresh1 
3475759b3d2Safresh1 	curCC = getCombinClass(uv);
3485759b3d2Safresh1 
3495759b3d2Safresh1 	if (!valid_uvS) {
3505759b3d2Safresh1 	    if (curCC == 0) {
3515759b3d2Safresh1 		uvS = uv; /* the first Starter is found */
3525759b3d2Safresh1 		valid_uvS = TRUE;
3535759b3d2Safresh1 		if (p < e)
3545759b3d2Safresh1 		    continue;
3555759b3d2Safresh1 	    }
3565759b3d2Safresh1 	    else {
3575759b3d2Safresh1 		Renew_d_if_not_enough_to(UTF8_MAXLEN)
3585759b3d2Safresh1 		d = uvchr_to_utf8(d, uv);
3595759b3d2Safresh1 		continue;
3605759b3d2Safresh1 	    }
3615759b3d2Safresh1 	}
3625759b3d2Safresh1 	else {
3635759b3d2Safresh1 	    bool composed;
3645759b3d2Safresh1 
3655759b3d2Safresh1 	    /* blocked */
3665759b3d2Safresh1 	    if ((iscontig && cc_pos) || /* discontiguous combination */
3675759b3d2Safresh1 		 (curCC != 0 && preCC == curCC) || /* blocked by same CC */
3685759b3d2Safresh1 		 (preCC > curCC)) /* blocked by higher CC: revised D2 */
3695759b3d2Safresh1 		composed = FALSE;
3705759b3d2Safresh1 
3715759b3d2Safresh1 	    /* not blocked:
3725759b3d2Safresh1 		 iscontig && cc_pos == 0      -- contiguous combination
3735759b3d2Safresh1 		 curCC == 0 && preCC == 0     -- starter + starter
3745759b3d2Safresh1 		 curCC != 0 && preCC < curCC  -- lower CC */
3755759b3d2Safresh1 	    else {
3765759b3d2Safresh1 		/* try composition */
3775759b3d2Safresh1 		UV uvComp = composite_uv(uvS, uv);
3785759b3d2Safresh1 
3795759b3d2Safresh1 		if (uvComp && !isExclusion(uvComp))  {
3805759b3d2Safresh1 		    uvS = uvComp;
3815759b3d2Safresh1 		    composed = TRUE;
3825759b3d2Safresh1 
3835759b3d2Safresh1 		    /* preCC should not be changed to curCC */
3845759b3d2Safresh1 		    /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
3855759b3d2Safresh1 		    if (p < e)
3865759b3d2Safresh1 			continue;
3875759b3d2Safresh1 		}
3885759b3d2Safresh1 		else
3895759b3d2Safresh1 		    composed = FALSE;
3905759b3d2Safresh1 	    }
3915759b3d2Safresh1 
3925759b3d2Safresh1 	    if (!composed) {
3935759b3d2Safresh1 		preCC = curCC;
3945759b3d2Safresh1 		if (curCC != 0 || !(p < e)) {
3955759b3d2Safresh1 		    if (seq_max < cc_pos + 1) { /* extend if need */
3965759b3d2Safresh1 			seq_max = cc_pos + CC_SEQ_STEP; /* new size */
3975759b3d2Safresh1 			if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
3985759b3d2Safresh1 			    New(0, seq_ext, seq_max, UV);
3995759b3d2Safresh1 			    Copy(seq_ary, seq_ext, cc_pos, UV);
4005759b3d2Safresh1 			}
4015759b3d2Safresh1 			else {
4025759b3d2Safresh1 			    Renew(seq_ext, seq_max, UV);
4035759b3d2Safresh1 			}
4045759b3d2Safresh1 			seq_ptr = seq_ext; /* use seq_ext from now */
4055759b3d2Safresh1 		    }
4065759b3d2Safresh1 		    seq_ptr[cc_pos] = uv;
4075759b3d2Safresh1 		    ++cc_pos;
4085759b3d2Safresh1 		}
4095759b3d2Safresh1 		if (curCC != 0 && p < e)
4105759b3d2Safresh1 		    continue;
4115759b3d2Safresh1 	    }
4125759b3d2Safresh1 	}
4135759b3d2Safresh1 
4145759b3d2Safresh1 	/* output */
4155759b3d2Safresh1 	{
4165759b3d2Safresh1 	    Renew_d_if_not_enough_to(UTF8_MAXLEN)
4175759b3d2Safresh1 	    d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */
4185759b3d2Safresh1 	}
4195759b3d2Safresh1 
4205759b3d2Safresh1 	if (cc_pos) {
4215759b3d2Safresh1 	    STRLEN i;
4225759b3d2Safresh1 
4235759b3d2Safresh1 	    for (i = 0; i < cc_pos; i++) {
4245759b3d2Safresh1 		Renew_d_if_not_enough_to(UTF8_MAXLEN)
4255759b3d2Safresh1 		d = uvchr_to_utf8(d, seq_ptr[i]);
4265759b3d2Safresh1 	    }
4275759b3d2Safresh1 	    cc_pos = 0;
4285759b3d2Safresh1 	}
4295759b3d2Safresh1 
4305759b3d2Safresh1 	uvS = uv;
4315759b3d2Safresh1     }
4325759b3d2Safresh1     if (seq_ext)
4335759b3d2Safresh1 	Safefree(seq_ext);
4345759b3d2Safresh1     *dp = dstart;
4355759b3d2Safresh1     return d;
4365759b3d2Safresh1 }
4375759b3d2Safresh1 
4385759b3d2Safresh1 MODULE = Unicode::Normalize	PACKAGE = Unicode::Normalize
4395759b3d2Safresh1 
# decompose(src, compat = false)
# Returns a new UTF-8 flagged string holding the decomposition of src.
# A true compat selects compatibility mappings, otherwise canonical ones.
# No reordering of combining characters is done here.
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat));
    /* The result SV is created only after the pass that may croak, so no SV
     * is left behind if it does. */
    dst = newSVpvn((char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
4615759b3d2Safresh1 
4625759b3d2Safresh1 
# reorder(src)
# Returns a new UTF-8 flagged string in which each run of combining
# characters of src has been put into canonical order.
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);
    /* The result SV is created only after the pass that may croak, so no SV
     * is left behind if it does. */
    dst = newSVpvn((char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
4835759b3d2Safresh1 
4845759b3d2Safresh1 
# compose(src) / composeContiguous(src)
# Returns a new UTF-8 flagged string holding the composition of src.
# Called as composeContiguous (ix == 1), only contiguous composition is
# performed.
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix);
    /* The result SV is created only after the pass that may croak, so no SV
     * is left behind if it does. */
    dst = newSVpvn((char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
5075759b3d2Safresh1 
5085759b3d2Safresh1 
# NFD(src) / NFKD(src)
# Returns a new UTF-8 flagged string: src decomposed (with compatibility
# mappings when called as NFKD, ix == 1) and then canonically reordered.
SV*
NFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKD = 1
  PREINIT:
    U8 *in, *dec, *dec_end, *out, *out_end;
    STRLEN in_len, dec_len, out_len;
  CODE:
    in = (U8*)sv_2pvunicode(aTHX_ src, &in_len);

    /* pass 1: decompose into dec */
    dec_len = in_len;
    New(0, dec, dec_len + 1, U8);
    dec_end = pv_utf8_decompose(aTHX_ in, in_len, &dec, dec_len, (bool)(ix==1));
    *dec_end = '\0';
    dec_len = dec_end - dec; /* bytes used; the allocated size is not tracked */

    /* pass 2: reorder into out */
    out_len = dec_len;
    New(0, out, out_len + 1, U8);
    out_end = pv_utf8_reorder(aTHX_ dec, dec_len, &out, out_len);
    *out_end = '\0';
    out_len = out_end - out; /* bytes used; the allocated size is not tracked */

    /* copy the result into the returned SV and release the work buffers */
    RETVAL = newSVpvn((char *)out, out_len);
    SvUTF8_on(RETVAL);

    Safefree(dec);
    Safefree(out);
  OUTPUT:
    RETVAL
5465759b3d2Safresh1 
5475759b3d2Safresh1 
# NFC(src) / NFKC(src) / FCC(src)
# Returns a new UTF-8 flagged string: src decomposed, canonically reordered
# and then composed.  NFKC (ix == 1) decomposes with compatibility mappings;
# FCC (ix == 2) composes contiguously only.
SV*
NFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKC = 1
    FCC  = 2
  PREINIT:
    U8 *in, *dec, *dec_end, *ord, *ord_end, *out, *out_end;
    STRLEN in_len, dec_len, ord_len, out_len;
  CODE:
    in = (U8*)sv_2pvunicode(aTHX_ src, &in_len);

    /* pass 1: decompose into dec */
    dec_len = in_len;
    New(0, dec, dec_len + 1, U8);
    dec_end = pv_utf8_decompose(aTHX_ in, in_len, &dec, dec_len, (bool)(ix==1));
    *dec_end = '\0';
    dec_len = dec_end - dec; /* bytes used; the allocated size is not tracked */

    /* pass 2: reorder into ord */
    ord_len = dec_len;
    New(0, ord, ord_len + 1, U8);
    ord_end = pv_utf8_reorder(aTHX_ dec, dec_len, &ord, ord_len);
    *ord_end = '\0';
    ord_len = ord_end - ord; /* bytes used; the allocated size is not tracked */

    /* pass 3: compose into out */
    out_len = ord_len;
    New(0, out, out_len + 1, U8);
    out_end = pv_utf8_compose(aTHX_ ord, ord_len, &out, out_len, (bool)(ix==2));
    *out_end = '\0';
    out_len = out_end - out; /* bytes used; the allocated size is not tracked */

    /* copy the result into the returned SV and release the work buffers */
    RETVAL = newSVpvn((char *)out, out_len);
    SvUTF8_on(RETVAL);

    Safefree(dec);
    Safefree(ord);
    Safefree(out);
  OUTPUT:
    RETVAL
5945759b3d2Safresh1 
5955759b3d2Safresh1 
# checkNFD(src) / checkNFKD(src)
# Returns true when src is already decomposed and canonically ordered, false
# otherwise.  Called as checkNFKD (ix == 1), compatibility mappings are
# consulted in place of canonical ones.  Croaks on malformed input.
SV*
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);

        /* utf8_to_uvchr_buf() reports malformed input by setting retlen to
         * (STRLEN)-1; the loop step would then move p backwards. */
        if (retlen == (STRLEN) -1)
            croak("panic (Unicode::Normalize %s): malformed UTF-8 character",
                  "checkNFD or -NFKD");
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFD or -NFKD");

        curCC = getCombinClass(uv);
        if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
            result = FALSE;
            break;
        }
        /* any character that still has a decomposition fails the check */
        if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) {
            result = FALSE;
            break;
        }
        preCC = curCC;
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
6305759b3d2Safresh1 
6315759b3d2Safresh1 
# checkNFC(src) / checkNFKC(src)
# Quick check for the composed forms.  Returns false as soon as a character
# rules the form out, undef when no such character was found but at least
# one "maybe" character (isComp2nd) was seen, and true otherwise.  Called as
# checkNFKC (ix == 1), characters with a compatibility mapping that differs
# from their canonical one also give false.  Croaks on malformed input.
SV*
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);

        /* utf8_to_uvchr_buf() reports malformed input by setting retlen to
         * (STRLEN)-1; the loop step would then move p backwards. */
        if (retlen == (STRLEN) -1)
            croak("panic (Unicode::Normalize %s): malformed UTF-8 character",
                  "checkNFC or -NFKC");
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFC or -NFKC");

        curCC = getCombinClass(uv);
        if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        /* get NFC/NFKC property */
        if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
            ; /* YES */
        else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
            result = FALSE;
            break;
        }
        else if (isComp2nd(uv))
            isMAYBE = TRUE;
        else if (ix) {
            char *canon, *compat;
          /* NFKC_NO when having compatibility mapping. */
            canon  = (char *) dec_canonical(uv);
            compat = (char *) dec_compat(uv);
            if (compat && !(canon && strEQ(canon, compat))) {
                result = FALSE;
                break;
            }
        } /* end of get NFC/NFKC property */

        preCC = curCC;
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
6865759b3d2Safresh1 
6875759b3d2Safresh1 
# checkFCD(src) / checkFCC(src)
# For every character, the combining class of the first character of its
# canonical decomposition must not be lower than the combining class of the
# last character of the previous character's decomposition (unless it is 0).
# Returns false when that ordering is violated.  Called as checkFCC
# (ix == 1), excluded, singleton and non-starter-decomposition characters
# also give false, and a "maybe" character (isComp2nd) turns a would-be true
# result into undef.  Croaks on malformed input.
SV*
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;
    preCC = 0;
    for (p = s; p < e; p += retlen) {
        U8 *sCan;
        UV uvLead;
        STRLEN canlen = 0;
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);

        /* utf8_to_uvchr_buf() reports malformed input by setting retlen to
         * (STRLEN)-1; the loop step would then move p backwards. */
        if (retlen == (STRLEN) -1)
            croak("panic (Unicode::Normalize %s): malformed UTF-8 character",
                  "checkFCD or -FCC");
        if (!retlen)
            croak(ErrRetlenIsZero, "checkFCD or -FCC");

        sCan = (U8*) dec_canonical(uv);

        /* leading character: first of the decomposition, else uv itself */
        if (sCan) {
            STRLEN canret;
            canlen = (STRLEN)strlen((char *) sCan);
            uvLead = utf8_to_uvchr_buf(sCan, sCan + canlen, &canret);
            if (!canret)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
        }
        else {
            uvLead = uv;
        }

        curCC = getCombinClass(uvLead);

        if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        if (ix) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                result = FALSE;
                break;
            }
            else if (isComp2nd(uv))
                isMAYBE = TRUE;
        }

        /* trailing character: last of the decomposition, else uv itself */
        if (sCan) {
            STRLEN canret;
            UV uvTrail;
            U8* eCan = sCan + canlen;
            U8* pCan = utf8_hop(eCan, -1);
            if (pCan < sCan)
                croak(ErrHopBeforeStart);
            uvTrail = utf8_to_uvchr_buf(pCan, eCan, &canret);
            if (!canret)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
            preCC = getCombinClass(uvTrail);
        }
        else {
            preCC = curCC;
        }
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
7615759b3d2Safresh1 
7625759b3d2Safresh1 
# Declaration-only XSUB: there is no CODE: section, so the Perl-visible
# getCombinClass($uv) forwards its single UV argument to the C-level
# getCombinClass() and hands back the U8 result. Elsewhere in this file a
# result of 0 is treated as marking a starter.
7635759b3d2Safresh1 U8
7645759b3d2Safresh1 getCombinClass(uv)
7655759b3d2Safresh1     UV uv
7665759b3d2Safresh1   PROTOTYPE: $
7675759b3d2Safresh1 
# Declaration-only XSUB: exposes the C-level isExclusion(uv) predicate to
# Perl with prototype ($). Within this file a true result is one of the
# three conditions that make isComp_Ex / isNFC_NO / isNFKC_NO answer true.
7685759b3d2Safresh1 bool
7695759b3d2Safresh1 isExclusion(uv)
7705759b3d2Safresh1     UV uv
7715759b3d2Safresh1   PROTOTYPE: $
7725759b3d2Safresh1 
# Declaration-only XSUB: exposes the C-level isSingleton(uv) predicate to
# Perl with prototype ($). Within this file a true result is one of the
# three conditions that make isComp_Ex / isNFC_NO / isNFKC_NO answer true.
7735759b3d2Safresh1 bool
7745759b3d2Safresh1 isSingleton(uv)
7755759b3d2Safresh1     UV uv
7765759b3d2Safresh1   PROTOTYPE: $
7775759b3d2Safresh1 
# Declaration-only XSUB: exposes the C-level isNonStDecomp(uv) predicate to
# Perl with prototype ($). Within this file a true result is one of the
# three conditions that make isComp_Ex / isNFC_NO / isNFKC_NO answer true.
7785759b3d2Safresh1 bool
7795759b3d2Safresh1 isNonStDecomp(uv)
7805759b3d2Safresh1     UV uv
7815759b3d2Safresh1   PROTOTYPE: $
7825759b3d2Safresh1 
# Declaration-only XSUB: exposes the C-level isComp2nd(uv) predicate to Perl
# under three names (isComp2nd, isNFC_MAYBE, isNFKC_MAYBE). The alias index
# ix is never consulted, so all three names return the same answer; the INIT
# section exists only to silence the unused-variable warning for ix.
7835759b3d2Safresh1 bool
7845759b3d2Safresh1 isComp2nd(uv)
7855759b3d2Safresh1     UV uv
7865759b3d2Safresh1   PROTOTYPE: $
7875759b3d2Safresh1   ALIAS:
7885759b3d2Safresh1     isNFC_MAYBE  = 1
7895759b3d2Safresh1     isNFKC_MAYBE = 2
7905759b3d2Safresh1   INIT:
7915759b3d2Safresh1     PERL_UNUSED_VAR(ix);
7925759b3d2Safresh1 
SV*
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  PREINIT:
    U8 *decomp;
  CODE:
    /* Answers true (NFD_NO for ix == 0, NFKD_NO for ix == 1) when uv
     * satisfies Hangul_IsS() or when the decomposition lookup selected by
     * the alias index yields a non-NULL entry; false otherwise. */
    if (Hangul_IsS(uv)) {
	RETVAL = boolSV(TRUE);
    }
    else {
	/* isNFKD_NO consults dec_compat, isNFD_NO consults dec_canonical */
	decomp = ix ? dec_compat(uv) : dec_canonical(uv);
	RETVAL = boolSV(decomp != NULL);
    }
  OUTPUT:
    RETVAL
8075759b3d2Safresh1 
8085759b3d2Safresh1 
SV*
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO  = 0
    isNFKC_NO = 1
  PREINIT:
    bool is_no;
  CODE:
    /* isComp_Ex and isNFC_NO (ix == 0) answer true when any of the three
     * predicates below holds.  isNFKC_NO (ix == 1) additionally answers
     * true when uv has a dec_compat() entry that is either unaccompanied by
     * a dec_canonical() entry or differs from it. */
    is_no = isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv);

    if (!is_no && ix) {
	char *canonical  = (char *) dec_canonical(uv);
	char *compatible = (char *) dec_compat(uv);

	if (compatible != NULL) {
	    if (canonical == NULL || strNE(canonical, compatible))
		is_no = TRUE; /* NFC_NO or NFKC_NO */
	}
    }

    RETVAL = boolSV(is_no);
  OUTPUT:
    RETVAL
8315759b3d2Safresh1 
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV combined;
  CODE:
    /* Asks composite_uv() for the result of combining uv with uv2.  A zero
     * result is reported to Perl as undef; any other value is returned as a
     * new unsigned-integer SV. */
    combined = composite_uv(uv, uv2);
    if (combined == 0)
	RETVAL = &PL_sv_undef;
    else
	RETVAL = newSVuv(combined);
  OUTPUT:
    RETVAL
8445759b3d2Safresh1 
8455759b3d2Safresh1 
8465759b3d2Safresh1 
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  CODE:
    /* Returns the decomposition of uv as a UTF-8 flagged string, or undef
     * when uv fails Hangul_IsS() and the lookup has no entry for it.
     * getCanon (ix == 0) consults dec_canonical, getCompat (ix == 1)
     * consults dec_compat; code points satisfying Hangul_IsS() are handled
     * by pv_cat_decompHangul() under either name. */
    if (!Hangul_IsS(uv)) {
	U8 *entry = ix ? dec_compat(uv) : dec_canonical(uv);

	if (entry == NULL)
	    XSRETURN_UNDEF;
	/* the entry is measured with strlen(), i.e. treated as NUL-terminated */
	RETVAL = newSVpvn((char *)entry, strlen((char *)entry));
    }
    else {
	/* scratch space: three maximal-length UTF-8 sequences plus one byte */
	U8 buf[3 * UTF8_MAXLEN + 1];
	U8 *buf_end = pv_cat_decompHangul(aTHX_ buf, uv);

	RETVAL = newSVpvn((char *)buf, buf_end - buf);
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
8685759b3d2Safresh1 
8695759b3d2Safresh1 
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    STRLEN src_len;
    U8 *src_beg, *src_end, *cut;
    SV *head, *tail;
  PPCODE:
    /* Pushes two UTF-8 flagged strings onto the Perl stack: everything
     * before the last character whose getCombinClass() is 0, and the rest
     * starting at that character.  When no such character exists the first
     * string is the whole input and the second is empty... or rather, the
     * cut ends up at the very beginning: the first string is empty and the
     * second is the whole input. */
    src_beg = (U8*)sv_2pvunicode(aTHX_ src, &src_len);
    src_end = src_beg + src_len;

    /* walk backwards one character at a time from the end of the string */
    for (cut = src_end; src_beg < cut; ) {
	UV uv;

	cut = utf8_hop(cut, -1);
	if (cut < src_beg)
	    croak(ErrHopBeforeStart);

	uv = utf8_to_uvchr_buf(cut, src_end, NULL);
	if (getCombinClass(uv) == 0) /* Last Starter found */
	    break;
    }

    head = sv_2mortal(newSVpvn((char*)src_beg, cut - src_beg));
    SvUTF8_on(head);
    XPUSHs(head);

    tail = sv_2mortal(newSVpvn((char*)cut, src_end - cut));
    SvUTF8_on(tail);
    XPUSHs(tail);
8985759b3d2Safresh1 
899