
#define PERL_NO_GET_CONTEXT /* we want efficiency */

/* I guess no private function needs pTHX_ and aTHX_ */

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"

/* This file is prepared by mkheader */
#include "ucatbl.h"

/* At present, char > 0x10ffff are unaffected without complaint, right? */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))

/* Largest value that can still be shifted left by one hex digit. */
#define MAX_DIV_16 (UV_MAX / 16)

/* Supported Levels */
#define MinLevel	(1)
#define MaxLevel	(4)

/* Shifted weight at 4th level */
#define Shift4Wt	(0xFFFF)

/* Byte length of one variable collation element (VCE):
 * 1 variable-flag byte followed by four 16-bit big-endian weights. */
#define VCE_Length	(9)

#define Hangul_SBase  (0xAC00)
#define Hangul_SIni   (0xAC00)
#define Hangul_SFin   (0xD7A3)
#define Hangul_NCount (588)
#define Hangul_TCount (28)
#define Hangul_LBase  (0x1100)
#define Hangul_LIni   (0x1100)
#define Hangul_LFin   (0x1159)
#define Hangul_LFill  (0x115F)
#define Hangul_LEnd   (0x115F) /* Unicode 5.2 */
#define Hangul_VBase  (0x1161)
#define Hangul_VIni   (0x1160) /* from Vowel Filler */
#define Hangul_VFin   (0x11A2)
#define Hangul_VEnd   (0x11A7) /* Unicode 5.2 */
#define Hangul_TBase  (0x11A7) /* from "no-final" codepoint */
#define Hangul_TIni   (0x11A8)
#define Hangul_TFin   (0x11F9)
#define Hangul_TEnd   (0x11FF) /* Unicode 5.2 */
#define HangulL2Ini   (0xA960) /* Unicode 5.2 */
#define HangulL2Fin   (0xA97C) /* Unicode 5.2 */
#define HangulV2Ini   (0xD7B0) /* Unicode 5.2 */
#define HangulV2Fin   (0xD7C6) /* Unicode 5.2 */
#define HangulT2Ini   (0xD7CB) /* Unicode 5.2 */
#define HangulT2Fin   (0xD7FB) /* Unicode 5.2 */

#define CJK_UidIni    (0x4E00)
#define CJK_UidFin    (0x9FA5)
#define CJK_UidF41    (0x9FBB) /* Unicode 4.1 */
#define CJK_UidF51    (0x9FC3) /* Unicode 5.1 */
#define CJK_UidF52    (0x9FCB) /* Unicode 5.2 */
#define CJK_UidF61    (0x9FCC) /* Unicode 6.1 */
#define CJK_UidF80    (0x9FD5) /* Unicode 8.0 */
#define CJK_UidF100   (0x9FEA) /* Unicode 10.0 */
#define CJK_UidF110   (0x9FEF) /* Unicode 11.0 */
#define CJK_UidF130   (0x9FFC) /* Unicode 13.0 */

#define CJK_ExtAIni   (0x3400)  /* Unicode 3.0 */
#define CJK_ExtAFin   (0x4DB5)  /* Unicode 3.0 */
#define CJK_ExtA130   (0x4DBF)  /* Unicode 13.0 */
#define CJK_ExtBIni   (0x20000) /* Unicode 3.1 */
#define CJK_ExtBFin   (0x2A6D6) /* Unicode 3.1 */
#define CJK_ExtB130   (0x2A6DD) /* Unicode 13.0 */
#define CJK_ExtCIni   (0x2A700) /* Unicode 5.2 */
#define CJK_ExtCFin   (0x2B734) /* Unicode 5.2 */
#define CJK_ExtDIni   (0x2B740) /* Unicode 6.0 */
#define CJK_ExtDFin   (0x2B81D) /* Unicode 6.0 */
#define CJK_ExtEIni   (0x2B820) /* Unicode 8.0 */
#define CJK_ExtEFin   (0x2CEA1) /* Unicode 8.0 */
#define CJK_ExtFIni   (0x2CEB0) /* Unicode 10.0 */
#define CJK_ExtFFin   (0x2EBE0) /* Unicode 10.0 */
#define CJK_ExtGIni   (0x30000) /* Unicode 13.0 */
#define CJK_ExtGFin   (0x3134A) /* Unicode 13.0 */

/* U+FA0E..U+FA29: 1 marks the code points in the CJK Compatibility
 * Ideographs block that are treated as unified ideographs. */
#define CJK_CompIni   (0xFA0E)
#define CJK_CompFin   (0xFA29)
static const STDCHAR UnifiedCompat[] = {
      1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1
}; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */

#define TangIdeoIni   (0x17000) /* Unicode 9.0 */
#define TangIdeoFin   (0x187EC) /* Unicode 9.0 */
#define TangIdeo110   (0x187F1) /* Unicode 11.0 */
#define TangIdeo120   (0x187F7) /* Unicode 12.0 */
#define TangCompIni   (0x18800) /* Unicode 9.0 */
#define TangCompFin   (0x18AF2) /* Unicode 9.0 */
#define TangComp130   (0x18AFF) /* Unicode 13.0 */
#define TangSuppIni   (0x18D00) /* Unicode 13.0 */
#define TangSuppFin   (0x18D08) /* Unicode 13.0 */
#define NushuIni      (0x1B170) /* Unicode 10.0 */
#define NushuFin      (0x1B2FB) /* Unicode 10.0 */
#define KhitanIni     (0x18B00) /* Unicode 13.0 */
#define KhitanFin     (0x18CD5) /* Unicode 13.0 */

/* NOTE: tests the variable literally named `code` in the calling scope. */
#define codeRange(bcode, ecode)	((bcode) <= code && code <= (ecode))

MODULE = Unicode::Collate	PACKAGE = Unicode::Collate

PROTOTYPES: DISABLE

void
_fetch_rest ()
  PREINIT:
    char ** rest;
  PPCODE:
    /* Push every string of the NULL-terminated UCA_rest table
     * (declared in ucatbl.h) as a mortal scalar. */
    for (rest = (char **)UCA_rest; *rest; ++rest) {
        XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0)));
    }


void
_fetch_simple (uv)
    UV uv
  PREINIT:
    U8 ***plane, **row;
    U8* result = NULL;
  PPCODE:
    /* Look uv up in the three-stage UCA_simple table (plane/row/cell).
     * An entry is a count byte followed by that many VCE_Length-byte
     * collation elements; each element is returned as one string.
     * When there is no entry a single integer 0 is returned. */
    if (!OVER_UTF_MAX(uv)) {
        plane = (U8***)UCA_simple[uv >> 16];
        if (plane) {
            row = plane[(uv >> 8) & 0xff];
            result = row ? row[uv & 0xff] : NULL;
        }
    }
    if (result) {
        int i;
        int num = (int)*result;
        ++result;
        EXTEND(SP, num);
        for (i = 0; i < num; ++i) {
            PUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length)));
            result += VCE_Length;
        }
    } else {
        EXTEND(SP, 1);
        PUSHs(sv_2mortal(newSViv(0)));
    }


SV*
_ignorable_simple (uv)
    UV uv
  ALIAS:
    _exists_simple = 1
  PREINIT:
    U8 ***plane, **row;
    int num = -1;
    U8* result = NULL;
  CODE:
    /* num is the element count stored for uv in UCA_simple, or -1 when
     * there is no entry.
     *   _ignorable_simple (ix == 0): true when the entry has 0 elements.
     *   _exists_simple    (ix == 1): true when the entry has > 0 elements. */
    if (!OVER_UTF_MAX(uv)) {
        plane = (U8***)UCA_simple[uv >> 16];
        if (plane) {
            row = plane[(uv >> 8) & 0xff];
            result = row ? row[uv & 0xff] : NULL;
        }
        if (result)
            num = (int)*result; /* assuming 0 <= num < 128 */
    }

    if (ix)
        RETVAL = boolSV(num > 0);
    else
        RETVAL = boolSV(num == 0);
  OUTPUT:
    RETVAL


void
_getHexArray (src)
    SV* src
  PREINIT:
    char *s, *e;
    char c;
    STRLEN byte;
    UV value;
    bool overflowed;
    const char *hexdigit;
  PPCODE:
    /* Split src into maximal runs of hex digits and push each run's
     * value as an unsigned integer.  Any other byte is a separator.
     * A run too large for a UV yields UV_MAX for that run only: the
     * overflow flag is reset for every run.  A NUL byte is never looked
     * up with strchr(), because strchr() would match the terminator of
     * PL_hexdigit and the NUL would be parsed as the digit 0. */
    s = SvPV(src, byte);
    e = s + byte;
    while (s < e) {
        c = *s++;
        hexdigit = c ? strchr((char *) PL_hexdigit, c) : NULL;
        if (! hexdigit)
            continue;
        value = (hexdigit - PL_hexdigit) & 0xF;
        overflowed = FALSE;
        while (s < e) {
            c = *s++;
            hexdigit = c ? strchr((char *) PL_hexdigit, c) : NULL;
            if (! hexdigit)
                break;
            if (overflowed)
                continue;       /* keep consuming the rest of the run */
            if (value > MAX_DIV_16) {
                overflowed = TRUE;
                continue;
            }
            value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF);
        }
        XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value)));
    }


SV*
_isIllegal (sv)
    SV* sv
  PREINIT:
    UV uv;
  CODE:
    /* True when sv is not an integer, or is an integer that is not a
     * legal code point for collation purposes. */
    if (!sv || !SvIOK(sv))
        XSRETURN_YES;
    uv = SvUVX(sv);
    RETVAL = boolSV(
           0x10FFFF < uv                    /* out of range */
        || ((uv & 0xFFFE) == 0xFFFE)        /* ??FFF[EF] */
        || (0xD800 <= uv && uv <= 0xDFFF)   /* unpaired surrogates */
        || (0xFDD0 <= uv && uv <= 0xFDEF)   /* other non-characters */
    );
  OUTPUT:
    RETVAL


void
_decompHangul (code)
    UV code
  PREINIT:
    UV sindex, lindex, vindex, tindex;
  PPCODE:
    /* Arithmetic decomposition of a precomposed Hangul syllable into
     * its L, V and (when present) T jamo code points.
     * code *must* be in Hangul syllable.
     * Check it before you enter here. */
    sindex = code - Hangul_SBase;
    lindex = sindex / Hangul_NCount;
    vindex = (sindex % Hangul_NCount) / Hangul_TCount;
    tindex = sindex % Hangul_TCount;

    EXTEND(SP, tindex ? 3 : 2);
    PUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase)));
    PUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase)));
    if (tindex)
        PUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase)));


SV*
getHST (code, uca_vers = 0)
    UV code;
    IV uca_vers;
  PREINIT:
    const char * hangtype;
    STRLEN typelen;
  CODE:
    /* Return the Hangul syllable type of code as a string:
     * "LV", "LVT", "L", "V", "T", or "" when code is not Hangul.
     * For uca_vers >= 20 the jamo ranges extended in Unicode 5.2
     * (including the Extended-A/-B blocks) are recognized as well. */
    if (codeRange(Hangul_SIni, Hangul_SFin)) {
        if ((code - Hangul_SBase) % Hangul_TCount) {
            hangtype = "LVT"; typelen = 3;
        } else {
            hangtype = "LV"; typelen = 2;
        }
    } else if (uca_vers < 20) {
        if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) {
            hangtype = "L"; typelen = 1;
        } else if (codeRange(Hangul_VIni, Hangul_VFin)) {
            hangtype = "V"; typelen = 1;
        } else if (codeRange(Hangul_TIni, Hangul_TFin)) {
            hangtype = "T"; typelen = 1;
        } else {
            hangtype = ""; typelen = 0;
        }
    } else {
        if (codeRange(Hangul_LIni, Hangul_LEnd) ||
            codeRange(HangulL2Ini, HangulL2Fin)) {
            hangtype = "L"; typelen = 1;
        } else if (codeRange(Hangul_VIni, Hangul_VEnd) ||
                   codeRange(HangulV2Ini, HangulV2Fin)) {
            hangtype = "V"; typelen = 1;
        } else if (codeRange(Hangul_TIni, Hangul_TEnd) ||
                   codeRange(HangulT2Ini, HangulT2Fin)) {
            hangtype = "T"; typelen = 1;
        } else {
            hangtype = ""; typelen = 0;
        }
    }

    RETVAL = newSVpvn(hangtype, typelen);
  OUTPUT:
    RETVAL


void
_derivCE_9 (code)
    UV code
  ALIAS:
    _derivCE_14 = 1
    _derivCE_18 = 2
    _derivCE_20 = 3
    _derivCE_22 = 4
    _derivCE_24 = 5
    _derivCE_32 = 6
    _derivCE_34 = 7
    _derivCE_36 = 8
    _derivCE_38 = 9
    _derivCE_40 = 10
    _derivCE_43 = 11
  PREINIT:
    UV base, aaaa, bbbb;
    U8 a[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
    U8 b[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
    bool basic_unified = 0, tangut = 0, nushu = 0, khitan = 0;
    bool cjk_ext;
  PPCODE:
    /* Build the two derived (implicit-weight) collation elements for
     * code and return them as two VCE_Length-byte strings.  The alias
     * index ix selects the UCA version (the suffix of the alias name),
     * which decides which ranges count as unified ideographs, Tangut,
     * Nushu or Khitan. */

    /* Classification of code for the selected UCA version. */
    if (codeRange(CJK_UidIni, CJK_CompFin)) {
        if (codeRange(CJK_CompIni, CJK_CompFin))
            basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
        else
            basic_unified = (ix >= 11 ? (code <= CJK_UidF130) :
                             ix >=  9 ? (code <= CJK_UidF110) :
                             ix ==  8 ? (code <= CJK_UidF100) :
                             ix >=  6 ? (code <= CJK_UidF80) :
                             ix ==  5 ? (code <= CJK_UidF61) :
                             ix >=  3 ? (code <= CJK_UidF52) :
                             ix ==  2 ? (code <= CJK_UidF51) :
                             ix ==  1 ? (code <= CJK_UidF41) :
                                        (code <= CJK_UidFin));
    } else {
        if (ix >= 7) {
            tangut = (ix >= 11) ? (codeRange(TangIdeoIni, TangIdeo120) ||
                                   codeRange(TangCompIni, TangComp130) ||
                                   codeRange(TangSuppIni, TangSuppFin)) :
                     (ix == 10) ? (codeRange(TangIdeoIni, TangIdeo120) ||
                                   codeRange(TangCompIni, TangCompFin)) :
                     (ix ==  9) ? (codeRange(TangIdeoIni, TangIdeo110) ||
                                   codeRange(TangCompIni, TangCompFin)) :
                                  (codeRange(TangIdeoIni, TangIdeoFin) ||
                                   codeRange(TangCompIni, TangCompFin));
        }
        if (ix >= 8)
            nushu = (codeRange(NushuIni, NushuFin));
        if (ix >= 11)
            khitan = (codeRange(KhitanIni, KhitanFin));
    }

    cjk_ext =
        (ix >= 11 ? codeRange(CJK_ExtAIni, CJK_ExtA130)
                  : codeRange(CJK_ExtAIni, CJK_ExtAFin))
        ||
        (ix >= 11 ? codeRange(CJK_ExtBIni, CJK_ExtB130)
                  : codeRange(CJK_ExtBIni, CJK_ExtBFin))
        ||
        (ix >=  3 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
        ||
        (ix >=  4 && codeRange(CJK_ExtDIni, CJK_ExtDFin))
        ||
        (ix >=  6 && codeRange(CJK_ExtEIni, CJK_ExtEFin))
        ||
        (ix >=  8 && codeRange(CJK_ExtFIni, CJK_ExtFFin))
        ||
        (ix >= 11 && codeRange(CJK_ExtGIni, CJK_ExtGFin));

    /* Base of the first primary weight, in priority order. */
    if (tangut)
        base = 0xFB00;
    else if (nushu)
        base = 0xFB01;
    else if (khitan)
        base = 0xFB02;
    else if (basic_unified)
        base = 0xFB40;              /* CJK */
    else if (cjk_ext)
        base = 0xFB80;              /* CJK ext. */
    else
        base = 0xFBC0;              /* others */

    aaaa = (tangut || nushu || khitan) ? base : base + (code >> 15);
    bbbb = (tangut ? (code - TangIdeoIni) :
            nushu  ? (code - NushuIni) :
            khitan ? (code - KhitanIni) : (code & 0x7FFF)) | 0x8000;

    a[1] = (U8)(aaaa >> 8);
    a[2] = (U8)(aaaa & 0xFF);
    b[1] = (U8)(bbbb >> 8);
    b[2] = (U8)(bbbb & 0xFF);
    a[4] = (U8)(0x20); /* second octet of level 2 */
    a[6] = (U8)(0x02); /* second octet of level 3 */
    a[7] = b[7] = (U8)(code >> 8);
    a[8] = b[8] = (U8)(code & 0xFF);
    EXTEND(SP, 2);
    PUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
    PUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));


void
_derivCE_8 (code)
    UV code
  PREINIT:
    UV aaaa, bbbb;
    U8 a[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
    U8 b[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
  PPCODE:
    /* Derived collation elements in the UCA version 8 layout: the
     * first primary weight is 0xFF80 + (code >> 15) regardless of the
     * kind of character.  Returns two VCE_Length-byte strings. */
    aaaa = 0xFF80 + (code >> 15);
    bbbb = (code & 0x7FFF) | 0x8000;
    a[1] = (U8)(aaaa >> 8);
    a[2] = (U8)(aaaa & 0xFF);
    b[1] = (U8)(bbbb >> 8);
    b[2] = (U8)(bbbb & 0xFF);
    a[4] = (U8)(0x02); /* second octet of level 2 */
    a[6] = (U8)(0x01); /* second octet of level 3 */
    a[7] = b[7] = (U8)(code >> 8);
    a[8] = b[8] = (U8)(code & 0xFF);
    EXTEND(SP, 2);
    PUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
    PUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));


void
_uideoCE_8 (code)
    UV code
  PREINIT:
    U8 uice[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
  PPCODE:
    /* Single collation element for a unified ideograph (UCA version 8
     * layout): the low 16 bits of code are used both as the primary and
     * as the fourth-level weight. */
    uice[1] = uice[7] = (U8)(code >> 8);
    uice[2] = uice[8] = (U8)(code & 0xFF);
    uice[4] = (U8)(0x20); /* second octet of level 2 */
    uice[6] = (U8)(0x02); /* second octet of level 3 */
    EXTEND(SP, 1);
    PUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length)));


SV*
_isUIdeo (code, uca_vers)
    UV code;
    IV uca_vers;
  PREINIT:
    bool basic_unified = 0;
  CODE:
    /* True when code is a unified ideograph under the given UCA
     * version.  The upper end of the basic block and the available
     * extension blocks depend on uca_vers. */
    /* uca_vers = 0 for _uideoCE_8() */
    if (CJK_UidIni <= code) {
        if (codeRange(CJK_CompIni, CJK_CompFin))
            basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
        else
            basic_unified = (uca_vers >= 43 ? (code <= CJK_UidF130) :
                             uca_vers >= 38 ? (code <= CJK_UidF110) :
                             uca_vers >= 36 ? (code <= CJK_UidF100) :
                             uca_vers >= 32 ? (code <= CJK_UidF80) :
                             uca_vers >= 24 ? (code <= CJK_UidF61) :
                             uca_vers >= 20 ? (code <= CJK_UidF52) :
                             uca_vers >= 18 ? (code <= CJK_UidF51) :
                             uca_vers >= 14 ? (code <= CJK_UidF41) :
                                              (code <= CJK_UidFin));
    }
    RETVAL = boolSV(
        (basic_unified)
                ||
        (codeRange(CJK_ExtAIni, CJK_ExtAFin))
                ||
        (uca_vers >= 43 && codeRange(CJK_ExtAIni, CJK_ExtA130))
                ||
        (uca_vers >=  8 && codeRange(CJK_ExtBIni, CJK_ExtBFin))
                ||
        (uca_vers >= 43 && codeRange(CJK_ExtBIni, CJK_ExtB130))
                ||
        (uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
                ||
        (uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin))
                ||
        (uca_vers >= 32 && codeRange(CJK_ExtEIni, CJK_ExtEFin))
                ||
        (uca_vers >= 36 && codeRange(CJK_ExtFIni, CJK_ExtFFin))
                ||
        (uca_vers >= 43 && codeRange(CJK_ExtGIni, CJK_ExtGFin))
    );
  OUTPUT:
    RETVAL


SV*
mk_SortKey (self, buf)
    SV* self;
    SV* buf;
  PREINIT:
    SV *dst, **svp;
    STRLEN dlen, vlen;
    U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel];
    AV *bufAV;
    HV *selfHV;
    UV back_flag;
    I32 i, buf_len;
    IV  lv, level, uca_vers;
    bool upper_lower, kata_hira, v2i, last_is_var;
  CODE:
    /* Build a sort key from an array of VCE strings.
     *
     *   self  hash ref; reads level, upper_before_lower,
     *         katakana_before_hiragana, UCA_Version, variable and
     *         backwardsFlag.
     *   buf   array ref of VCE_Length-byte strings.
     *
     * The key is the non-zero 16-bit weights of level 1, then level 2,
     * and so on, with a 0x0000 separator between levels; separators
     * are always emitted up to MaxLevel.  Croaks when an argument has
     * the wrong type, when UCA_Version is missing, or when an element
     * of buf is not a string.
     *
     * NOTE: with upper_before_lower / katakana_before_hiragana the
     * tertiary byte v[6] is rewritten in place, i.e. in the strings
     * owned by buf. */
    if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
        selfHV = (HV*)SvRV(self);
    else
        croak("$self is not a HASHREF.");

    if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV)
        bufAV = (AV*)SvRV(buf);
    else
        croak("XSUB, not an ARRAYREF.");

    buf_len = av_len(bufAV);

    if (buf_len < 0) { /* empty: -1 */
        dlen = 2 * (MaxLevel - 1);
        dst = newSV(dlen);
        (void)SvPOK_only(dst);
        d = (U8*)SvPVX(dst);
        while (dlen--)
            *d++ = '\0';
    } else {
        /* Read every setting before allocating anything, so that a
         * croak here cannot leak the per-level buffers. */
        svp = hv_fetch(selfHV, "level", 5, FALSE);
        level = svp ? SvIV(*svp) : MaxLevel;
        /* s[] and eachlevel[] hold MaxLevel entries and a VCE carries
         * MaxLevel weights; a negative level would make the padding
         * loop below write more separators than were allocated. */
        if (level > MaxLevel)
            level = MaxLevel;
        else if (level < 0)
            level = 0;

        svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE);
        upper_lower = svp ? SvTRUE(*svp) : FALSE;
        svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE);
        kata_hira = svp ? SvTRUE(*svp) : FALSE;
        svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
        if (!svp)
            croak("Panic: no $self->{UCA_Version} in mk_SortKey");
        uca_vers = SvIV(*svp);
        svp = hv_fetch(selfHV, "variable", 8, FALSE);
        v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */
            ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13))
            : FALSE;
        svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE);
        back_flag = svp ? SvUV(*svp) : (UV)0;

        /* One scratch buffer per level: at most 2 bytes per element. */
        for (lv = 0; lv < level; lv++) {
            New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8);
            s[lv] = eachlevel[lv];
        }

        last_is_var = FALSE;
        for (i = 0; i <= buf_len; i++) {
            svp = av_fetch(bufAV, i, FALSE);

            if (!(svp && SvPOK(*svp))) {
                for (lv = 0; lv < level; lv++)
                    Safefree(eachlevel[lv]);
                croak("not a vwt.");
            }
            v = (U8*)SvPV(*svp, vlen);

            if (vlen < VCE_Length) /* ignore short VCE (unexpected) */
                continue;

            /* "Ignorable (L1, L2) after Variable" since track. v. 9 */
            if (v2i) {
                if (*v)
                    last_is_var = TRUE;
                else if (v[1] || v[2]) /* non zero primary weight */
                    last_is_var = FALSE;
                else if (last_is_var) /* zero primary weight; skipped */
                    continue;
            }

            if (v[5] == 0) { /* tert wt < 256 */
                if (upper_lower) {
                    if (0x8 <= v[6] && v[6] <= 0xC) /* lower */
                        v[6] -= 6;
                    else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */
                        v[6] += 6;
                    else if (v[6] == 0x1C) /* square upper */
                        v[6]++;
                    else if (v[6] == 0x1D) /* square lower */
                        v[6]--;
                }
                if (kata_hira) {
                    if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */
                        v[6] -= 2;
                    else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */
                        v[6] += 5;
                }
            }

            /* Append each non-zero weight to its level's buffer. */
            for (lv = 0; lv < level; lv++) {
                if (v[2 * lv + 1] || v[2 * lv + 2]) {
                    *s[lv]++ = v[2 * lv + 1];
                    *s[lv]++ = v[2 * lv + 2];
                }
            }
        }

        dlen = 2 * (MaxLevel - 1);
        for (lv = 0; lv < level; lv++)
            dlen += s[lv] - eachlevel[lv];

        dst = newSV(dlen);
        (void)SvPOK_only(dst);
        d = (U8*)SvPVX(dst);

        for (lv = 0; lv < level; lv++) {
            if (back_flag & ((UV)1 << (lv + 1))) {
                /* backwards level: copy the weights last to first */
                p = s[lv];
                e = eachlevel[lv];
                for ( ; e < p; p -= 2) {
                    *d++ = p[-2];
                    *d++ = p[-1];
                }
            }
            else {
                p = eachlevel[lv];
                e = s[lv];
                while (p < e)
                    *d++ = *p++;
            }
            if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
                *d++ = '\0';
                *d++ = '\0';
            }
        }

        /* separators for the levels that were not requested */
        for (lv = level; lv < MaxLevel; lv++) {
            if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
                *d++ = '\0';
                *d++ = '\0';
            }
        }

        for (lv = 0; lv < level; lv++) {
            Safefree(eachlevel[lv]);
        }
    }
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL


SV*
varCE (self, vce)
    SV* self;
    SV* vce;
  PREINIT:
    SV *dst, *vbl, **svp;
    HV *selfHV;
    U8 *a, *v, *d;
    STRLEN alen, vlen;
    bool ig_l2;
    IV uca_vers;
    UV totwt;
  CODE:
    /* Return a copy of vce adjusted according to $self->{variable}
     * (and $self->{ignore_level2}).
     *
     * Only the first character and the length of the "variable" value
     * are inspected: 'n' means non-ignorable, 's' with length 7 means
     * shifted, any other 's' means shift-trimmed, anything else is
     * handled as blanked.
     *
     * A vce shorter than VCE_Length is returned as an unmodified copy;
     * no weight byte of such a string is read or written. */
    if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
        selfHV = (HV*)SvRV(self);
    else
        croak("$self is not a HASHREF.");

    svp = hv_fetch(selfHV, "ignore_level2", 13, FALSE);
    ig_l2 = svp ? SvTRUE(*svp) : FALSE;

    svp = hv_fetch(selfHV, "variable", 8, FALSE);
    vbl = svp ? *svp : &PL_sv_no;
    a = (U8*)SvPV(vbl, alen);
    v = (U8*)SvPV(vce, vlen);

    /* newSVpvn always allocates a NUL-terminated buffer, even for an
     * empty string (newSV(0) would not allocate one). */
    dst = newSVpvn((char *) v, vlen);
    d = (U8*)SvPVX(dst);

    if (vlen >= VCE_Length) {
        /* primary weight == 0 && secondary weight != 0 */
        if (ig_l2 && !d[1] && !d[2] && (d[3] || d[4])) {
            d[3] = d[4] = d[5] = d[6] = '\0';
        }

        /* variable: checked only the first char and the length,
           trusting checkCollator() and %VariableOK in Perl ... */

        if (*a != 'n') {
            if (*v) {
                if (*a == 's') { /* shifted or shift-trimmed */
                    d[7] = d[1]; /* wt level 1 to 4 */
                    d[8] = d[2];
                } /* else blanked */
                d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
            } else if (*a == 's') { /* shifted or shift-trimmed */
                totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
                if (alen == 7 && totwt != 0) { /* shifted */
                    if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */
                        d[7] = d[1]; /* wt level 1 to 4 */
                        d[8] = d[2];
                    } else {
                        svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
                        if (!svp) {
                            SvREFCNT_dec(dst); /* do not leak the copy */
                            croak("Panic: no $self->{UCA_Version} in varCE");
                        }
                        uca_vers = SvIV(*svp);

                        /* completely ignorable or the second derived CE */
                        if (uca_vers >= 36 && d[3] + d[4] + d[5] + d[6] == 0) {
                            d[7] = d[8] = '\0';
                        } else {
                            d[7] = (U8)(Shift4Wt >> 8);
                            d[8] = (U8)(Shift4Wt & 0xFF);
                        }
                    }
                } else { /* shift-trimmed or completely ignorable */
                    d[7] = d[8] = '\0';
                }
            } /* else blanked */
        } /* else non-ignorable */
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL



SV*
visualizeSortKey (self, key)
    SV * self
    SV * key
  PREINIT:
    HV *selfHV;
    SV **svp, *dst;
    U8 *s, *e, *d;
    U8 hi, lo;
    STRLEN klen, dlen;
    UV uv;
    IV uca_vers, sep = 0;
    const char *upperhex = "0123456789ABCDEF";
  CODE:
    /* Render a binary sort key as text such as "[0A15 | 0020 | 0002]":
     * each 16-bit unit becomes four upper-case hex digits and each of
     * the first MaxLevel zero units becomes a '|' level separator.
     * For UCA_Version >= 9 separators are surrounded by spaces.
     * Croaks when self is not a hash ref or UCA_Version is missing. */
    if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
        selfHV = (HV*)SvRV(self);
    else
        croak("$self is not a HASHREF.");

    svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
    if (!svp)
        croak("Panic: no $self->{UCA_Version} in visualizeSortKey");
    uca_vers = SvIV(*svp);

    s = (U8*)SvPV(key, klen);

    /* slightly *longer* than the need, but I'm afraid of miscounting;
       = (klen / 2) * 5 - 1
             FFFF and ' ' for each 16bit units but ' ' is less by 1;
             ' ' and '|' for level boundaries including the identical level
           + 2   '[' and ']'
           + 1   '\0'
       (a) if klen is odd (not expected), maybe more 5 bytes.
       (b) there is not always the identical level.
    */
    dlen = (klen / 2) * 5 + MaxLevel * 2 + 2;
    dst = newSV(dlen);
    (void)SvPOK_only(dst);
    d = (U8*)SvPVX(dst);

    *d++ = '[';
    for (e = s + klen; s < e; s += 2) {
        hi = s[0];
        /* for an odd-length key the missing low byte counts as 0 */
        lo = (s + 1 < e) ? s[1] : 0;
        uv = (U16)(hi << 8 | lo);
        if (uv || sep >= MaxLevel) {
            if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|')))
                *d++ = ' ';
            *d++ = upperhex[ (hi >> 4) & 0xF ];
            *d++ = upperhex[  hi       & 0xF ];
            *d++ = upperhex[ (lo >> 4) & 0xF ];
            *d++ = upperhex[  lo       & 0xF ];
        } else {
            if ((9 <= uca_vers) && (d[-1] != '['))
                *d++ = ' ';
            *d++ = '|';
            ++sep;
        }
    }
    *d++ = ']';
    *d   = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL