xref: /openbsd-src/gnu/usr.bin/perl/cpan/Unicode-Collate/Collate.xs (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1 
2 #define PERL_NO_GET_CONTEXT /* we want efficiency */
3 
4 /* I guese no private function needs pTHX_ and aTHX_ */
5 
6 #include "EXTERN.h"
7 #include "perl.h"
8 #include "XSUB.h"
9 
10 /* This file is prepared by mkheader */
11 #include "ucatbl.h"
12 
13 /* Perl 5.6.1 ? */
14 #ifndef utf8n_to_uvuni
15 #define utf8n_to_uvuni  utf8_to_uv
16 #endif /* utf8n_to_uvuni */
17 
18 /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
19 #ifndef UTF8_ALLOW_BOM
20 #define UTF8_ALLOW_BOM  (0)
21 #endif /* UTF8_ALLOW_BOM */
22 
23 #ifndef UTF8_ALLOW_SURROGATE
24 #define UTF8_ALLOW_SURROGATE  (0)
25 #endif /* UTF8_ALLOW_SURROGATE */
26 
27 #ifndef UTF8_ALLOW_FE_FF
28 #define UTF8_ALLOW_FE_FF  (0)
29 #endif /* UTF8_ALLOW_FE_FF */
30 
31 #ifndef UTF8_ALLOW_FFFF
32 #define UTF8_ALLOW_FFFF  (0)
33 #endif /* UTF8_ALLOW_FFFF */
34 
35 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF)
36 
37 /* perl 5.6.x workaround, before 5.8.0 */
38 #ifdef utf8n_to_uvuni
39 #define GET_UV_FOR_5_6	utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF)
40 #else
41 #define GET_UV_FOR_5_6	retlen = 1 /* avoid an infinite loop */
42 #endif /* utf8n_to_uvuni */
43 
44 /* At present, char > 0x10ffff are unaffected without complaint, right? */
45 #define VALID_UTF_MAX    (0x10ffff)
46 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
47 
48 static const UV max_div_16 = UV_MAX / 16;
49 
50 /* Supported Levels */
51 #define MinLevel	(1)
52 #define MaxLevel	(4)
53 
54 /* Shifted weight at 4th level */
55 #define Shift4Wt	(0xFFFF)
56 
57 #define VCE_Length	(9)
58 
59 #define Hangul_SBase  (0xAC00)
60 #define Hangul_SIni   (0xAC00)
61 #define Hangul_SFin   (0xD7A3)
62 #define Hangul_NCount (588)
63 #define Hangul_TCount (28)
64 #define Hangul_LBase  (0x1100)
65 #define Hangul_LIni   (0x1100)
66 #define Hangul_LFin   (0x1159)
67 #define Hangul_LFill  (0x115F)
68 #define Hangul_LEnd   (0x115F) /* Unicode 5.2 */
69 #define Hangul_VBase  (0x1161)
70 #define Hangul_VIni   (0x1160) /* from Vowel Filler */
71 #define Hangul_VFin   (0x11A2)
72 #define Hangul_VEnd   (0x11A7) /* Unicode 5.2 */
73 #define Hangul_TBase  (0x11A7) /* from "no-final" codepoint */
74 #define Hangul_TIni   (0x11A8)
75 #define Hangul_TFin   (0x11F9)
76 #define Hangul_TEnd   (0x11FF) /* Unicode 5.2 */
77 #define HangulL2Ini   (0xA960) /* Unicode 5.2 */
78 #define HangulL2Fin   (0xA97C) /* Unicode 5.2 */
79 #define HangulV2Ini   (0xD7B0) /* Unicode 5.2 */
80 #define HangulV2Fin   (0xD7C6) /* Unicode 5.2 */
81 #define HangulT2Ini   (0xD7CB) /* Unicode 5.2 */
82 #define HangulT2Fin   (0xD7FB) /* Unicode 5.2 */
83 
84 #define CJK_UidIni    (0x4E00)
85 #define CJK_UidFin    (0x9FA5)
86 #define CJK_UidF41    (0x9FBB)
87 #define CJK_UidF51    (0x9FC3)
88 #define CJK_UidF52    (0x9FCB)
89 #define CJK_UidF61    (0x9FCC)
90 #define CJK_ExtAIni   (0x3400) /* Unicode 3.0 */
91 #define CJK_ExtAFin   (0x4DB5) /* Unicode 3.0 */
92 #define CJK_ExtBIni  (0x20000) /* Unicode 3.1 */
93 #define CJK_ExtBFin  (0x2A6D6) /* Unicode 3.1 */
94 #define CJK_ExtCIni  (0x2A700) /* Unicode 5.2 */
95 #define CJK_ExtCFin  (0x2B734) /* Unicode 5.2 */
96 #define CJK_ExtDIni  (0x2B740) /* Unicode 6.0 */
97 #define CJK_ExtDFin  (0x2B81D) /* Unicode 6.0 */
98 
99 #define CJK_CompIni  (0xFA0E)
100 #define CJK_CompFin  (0xFA29)
101 static STDCHAR UnifiedCompat[] = {
102       1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1
103 }; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */
104 
105 #define codeRange(bcode, ecode)	((bcode) <= code && code <= (ecode))
106 
107 MODULE = Unicode::Collate	PACKAGE = Unicode::Collate
108 
109 PROTOTYPES: DISABLE
110 
111 void
112 _fetch_rest ()
113   PREINIT:
114     char ** rest;
115   PPCODE:
116     for (rest = UCA_rest; *rest; ++rest) {
117 	XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0)));
118     }
119 
120 
121 void
122 _fetch_simple (uv)
123     UV uv
124   PREINIT:
125     U8 ***plane, **row;
126     U8* result = NULL;
127   PPCODE:
128     if (!OVER_UTF_MAX(uv)){
129 	plane = (U8***)UCA_simple[uv >> 16];
130 	if (plane) {
131 	    row = plane[(uv >> 8) & 0xff];
132 	    result = row ? row[uv & 0xff] : NULL;
133 	}
134     }
135     if (result) {
136 	int i;
137 	int num = (int)*result;
138 	++result;
139 	for (i = 0; i < num; ++i) {
140 	    XPUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length)));
141 	    result += VCE_Length;
142 	}
143     } else {
144 	XPUSHs(sv_2mortal(newSViv(0)));
145     }
146 
147 SV*
148 _ignorable_simple (uv)
149     UV uv
150   ALIAS:
151     _exists_simple = 1
152   PREINIT:
153     U8 ***plane, **row;
154     int num = -1;
155     U8* result = NULL;
156   CODE:
157     if (!OVER_UTF_MAX(uv)){
158 	plane = (U8***)UCA_simple[uv >> 16];
159 	if (plane) {
160 	    row = plane[(uv >> 8) & 0xff];
161 	    result = row ? row[uv & 0xff] : NULL;
162 	}
163 	if (result)
164 	    num = (int)*result; /* assuming 0 <= num < 128 */
165     }
166 
167     if (ix)
168 	RETVAL = boolSV(num >0);
169     else
170 	RETVAL = boolSV(num==0);
171   OUTPUT:
172     RETVAL
173 
174 
175 void
176 _getHexArray (src)
177     SV* src
178   PREINIT:
179     char *s, *e;
180     STRLEN byte;
181     UV value;
182     bool overflowed = FALSE;
183     const char *hexdigit;
184   PPCODE:
185     s = SvPV(src,byte);
186     for (e = s + byte; s < e;) {
187 	hexdigit = strchr((char *) PL_hexdigit, *s++);
188         if (! hexdigit)
189 	    continue;
190 	value = (hexdigit - PL_hexdigit) & 0xF;
191 	while (*s) {
192 	    hexdigit = strchr((char *) PL_hexdigit, *s++);
193 	    if (! hexdigit)
194 		break;
195 	    if (overflowed)
196 		continue;
197 	    if (value > max_div_16) {
198 		overflowed = TRUE;
199 		continue;
200 	    }
201 	    value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF);
202 	}
203 	XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value)));
204     }
205 
206 
207 SV*
208 _isIllegal (sv)
209     SV* sv
210   PREINIT:
211     UV uv;
212   CODE:
213     if (!sv || !SvIOK(sv))
214 	XSRETURN_YES;
215     uv = SvUVX(sv);
216     RETVAL = boolSV(
217 	   0x10FFFF < uv                   /* out of range */
218 	|| ((uv & 0xFFFE) == 0xFFFE)       /* ??FFF[EF] */
219 	|| (0xD800 <= uv && uv <= 0xDFFF)  /* unpaired surrogates */
220 	|| (0xFDD0 <= uv && uv <= 0xFDEF)  /* other non-characters */
221     );
222 OUTPUT:
223     RETVAL
224 
225 
226 void
227 _decompHangul (code)
228     UV code
229   PREINIT:
230     UV sindex, lindex, vindex, tindex;
231   PPCODE:
232     /* code *must* be in Hangul syllable.
233      * Check it before you enter here. */
234     sindex =  code - Hangul_SBase;
235     lindex =  sindex / Hangul_NCount;
236     vindex = (sindex % Hangul_NCount) / Hangul_TCount;
237     tindex =  sindex % Hangul_TCount;
238 
239     XPUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase)));
240     XPUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase)));
241     if (tindex)
242 	XPUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase)));
243 
244 
245 SV*
246 getHST (code, uca_vers = 0)
247     UV code;
248     IV uca_vers;
249   PREINIT:
250     const char * hangtype;
251     STRLEN typelen;
252   CODE:
253     if (codeRange(Hangul_SIni, Hangul_SFin)) {
254 	if ((code - Hangul_SBase) % Hangul_TCount) {
255 	    hangtype = "LVT"; typelen = 3;
256 	} else {
257 	    hangtype = "LV"; typelen = 2;
258 	}
259     } else if (uca_vers < 20) {
260 	if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) {
261 	    hangtype = "L"; typelen = 1;
262 	} else if (codeRange(Hangul_VIni, Hangul_VFin)) {
263 	    hangtype = "V"; typelen = 1;
264 	} else if (codeRange(Hangul_TIni, Hangul_TFin)) {
265 	    hangtype = "T"; typelen = 1;
266 	} else {
267 	    hangtype = ""; typelen = 0;
268 	}
269     } else {
270 	if        (codeRange(Hangul_LIni, Hangul_LEnd) ||
271 		   codeRange(HangulL2Ini, HangulL2Fin)) {
272 	    hangtype = "L"; typelen = 1;
273 	} else if (codeRange(Hangul_VIni, Hangul_VEnd) ||
274 		   codeRange(HangulV2Ini, HangulV2Fin)) {
275 	    hangtype = "V"; typelen = 1;
276 	} else if (codeRange(Hangul_TIni, Hangul_TEnd) ||
277 		   codeRange(HangulT2Ini, HangulT2Fin)) {
278 	    hangtype = "T"; typelen = 1;
279 	} else {
280 	    hangtype = ""; typelen = 0;
281 	}
282     }
283 
284     RETVAL = newSVpvn(hangtype, typelen);
285 OUTPUT:
286     RETVAL
287 
288 
289 void
290 _derivCE_9 (code)
291     UV code
292   ALIAS:
293     _derivCE_14 = 1
294     _derivCE_18 = 2
295     _derivCE_20 = 3
296     _derivCE_22 = 4
297     _derivCE_24 = 5
298   PREINIT:
299     UV base, aaaa, bbbb;
300     U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF";
301     U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF";
302     bool basic_unified = 0;
303   PPCODE:
304     if (CJK_UidIni <= code) {
305 	if (codeRange(CJK_CompIni, CJK_CompFin))
306 	    basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
307 	else
308 	    basic_unified = (ix >= 5 ? (code <= CJK_UidF61) :
309 			     ix >= 3 ? (code <= CJK_UidF52) :
310 			     ix == 2 ? (code <= CJK_UidF51) :
311 			     ix == 1 ? (code <= CJK_UidF41) :
312 				       (code <= CJK_UidFin));
313     }
314     base = (basic_unified)
315 	    ? 0xFB40 : /* CJK */
316 	   ((codeRange(CJK_ExtAIni, CJK_ExtAFin))
317 		||
318 	    (codeRange(CJK_ExtBIni, CJK_ExtBFin))
319 		||
320 	    (ix >= 3 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
321 		||
322 	    (ix >= 4 && codeRange(CJK_ExtDIni, CJK_ExtDFin)))
323 	    ? 0xFB80   /* CJK ext. */
324 	    : 0xFBC0;  /* others */
325     aaaa =  base + (code >> 15);
326     bbbb = (code & 0x7FFF) | 0x8000;
327     a[1] = (U8)(aaaa >> 8);
328     a[2] = (U8)(aaaa & 0xFF);
329     b[1] = (U8)(bbbb >> 8);
330     b[2] = (U8)(bbbb & 0xFF);
331     a[7] = b[7] = (U8)(code >> 8);
332     a[8] = b[8] = (U8)(code & 0xFF);
333     XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
334     XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
335 
336 
337 void
338 _derivCE_8 (code)
339     UV code
340   PREINIT:
341     UV aaaa, bbbb;
342     U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x02\x00\x01\xFF\xFF";
343     U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF";
344   PPCODE:
345     aaaa =  0xFF80 + (code >> 15);
346     bbbb = (code & 0x7FFF) | 0x8000;
347     a[1] = (U8)(aaaa >> 8);
348     a[2] = (U8)(aaaa & 0xFF);
349     b[1] = (U8)(bbbb >> 8);
350     b[2] = (U8)(bbbb & 0xFF);
351     a[7] = b[7] = (U8)(code >> 8);
352     a[8] = b[8] = (U8)(code & 0xFF);
353     XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
354     XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
355 
356 
357 void
358 _uideoCE_8 (code)
359     UV code
360   PREINIT:
361     U8 uice[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF";
362   PPCODE:
363     uice[1] = uice[7] = (U8)(code >> 8);
364     uice[2] = uice[8] = (U8)(code & 0xFF);
365     XPUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length)));
366 
367 
368 SV*
369 _isUIdeo (code, uca_vers)
370     UV code;
371     IV uca_vers;
372     bool basic_unified = 0;
373   CODE:
374     /* uca_vers = 0 for _uideoCE_8() */
375     if (CJK_UidIni <= code) {
376 	if (codeRange(CJK_CompIni, CJK_CompFin))
377 	    basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
378 	else
379 	    basic_unified = (uca_vers >= 24 ? (code <= CJK_UidF61) :
380 			     uca_vers >= 20 ? (code <= CJK_UidF52) :
381 			     uca_vers >= 18 ? (code <= CJK_UidF51) :
382 			     uca_vers >= 14 ? (code <= CJK_UidF41) :
383 					      (code <= CJK_UidFin));
384     }
385     RETVAL = boolSV(
386 	(basic_unified)
387 		||
388 	(codeRange(CJK_ExtAIni, CJK_ExtAFin))
389 		||
390 	(uca_vers >=  8 && codeRange(CJK_ExtBIni, CJK_ExtBFin))
391 		||
392 	(uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
393 		||
394 	(uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin))
395     );
396 OUTPUT:
397     RETVAL
398 
399 
400 SV*
401 mk_SortKey (self, buf)
402     SV* self;
403     SV* buf;
404   PREINIT:
405     SV *dst, **svp;
406     STRLEN dlen, vlen;
407     U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel];
408     AV *bufAV;
409     HV *selfHV;
410     UV back_flag;
411     I32 i, buf_len;
412     IV  lv, level, uca_vers;
413     bool upper_lower, kata_hira, v2i, last_is_var;
414   CODE:
415     if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
416 	selfHV = (HV*)SvRV(self);
417     else
418 	croak("$self is not a HASHREF.");
419 
420     if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV)
421 	bufAV = (AV*)SvRV(buf);
422     else
423 	croak("XSUB, not an ARRAYREF.");
424 
425     buf_len = av_len(bufAV);
426 
427     if (buf_len < 0) { /* empty: -1 */
428 	dlen = 2 * (MaxLevel - 1);
429 	dst = newSV(dlen);
430 	(void)SvPOK_only(dst);
431 	d = (U8*)SvPVX(dst);
432 	while (dlen--)
433 	    *d++ = '\0';
434     } else {
435 	svp = hv_fetch(selfHV, "level", 5, FALSE);
436 	level = svp ? SvIV(*svp) : MaxLevel;
437 
438 	for (lv = 0; lv < level; lv++) {
439 	    New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8);
440 	    s[lv] = eachlevel[lv];
441 	}
442 
443 	svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE);
444 	upper_lower = svp ? SvTRUE(*svp) : FALSE;
445 	svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE);
446 	kata_hira = svp ? SvTRUE(*svp) : FALSE;
447 	svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
448 	uca_vers = SvIV(*svp);
449 	svp = hv_fetch(selfHV, "variable", 8, FALSE);
450 	v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */
451 	    ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13))
452 	    : FALSE;
453 
454 	last_is_var = FALSE;
455 	for (i = 0; i <= buf_len; i++) {
456 	    svp = av_fetch(bufAV, i, FALSE);
457 
458 	    if (svp && SvPOK(*svp))
459 		v = (U8*)SvPV(*svp, vlen);
460 	    else
461 		croak("not a vwt.");
462 
463 	    if (vlen < VCE_Length) /* ignore short VCE (unexpected) */
464 		continue;
465 
466 	    /* "Ignorable (L1, L2) after Variable" since track. v. 9 */
467 	    if (v2i) {
468 		if (*v)
469 		    last_is_var = TRUE;
470 		else if (v[1] || v[2]) /* non zero primary weight */
471 		    last_is_var = FALSE;
472 		else if (last_is_var) /* zero primary weight; skipped */
473 		    continue;
474 	    }
475 
476 	    if (v[5] == 0) { /* tert wt < 256 */
477 		if (upper_lower) {
478 		    if (0x8 <= v[6] && v[6] <= 0xC) /* lower */
479 			v[6] -= 6;
480 		    else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */
481 			v[6] += 6;
482 		    else if (v[6] == 0x1C) /* square upper */
483 			v[6]++;
484 		    else if (v[6] == 0x1D) /* square lower */
485 			v[6]--;
486 		}
487 		if (kata_hira) {
488 		    if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */
489 			v[6] -= 2;
490 		    else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */
491 			v[6] += 5;
492 		}
493 	    }
494 
495 	    for (lv = 0; lv < level; lv++) {
496 		if (v[2 * lv + 1] || v[2 * lv + 2]) {
497 		    *s[lv]++ = v[2 * lv + 1];
498 		    *s[lv]++ = v[2 * lv + 2];
499 		}
500 	    }
501 	}
502 
503 	dlen = 2 * (MaxLevel - 1);
504 	for (lv = 0; lv < level; lv++)
505 	    dlen += s[lv] - eachlevel[lv];
506 
507 	dst = newSV(dlen);
508 	(void)SvPOK_only(dst);
509 	d = (U8*)SvPVX(dst);
510 
511 	svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE);
512 	back_flag = svp ? SvUV(*svp) : (UV)0;
513 
514 	for (lv = 0; lv < level; lv++) {
515 	    if (back_flag & (1 << (lv + 1))) {
516 		p = s[lv];
517 		e = eachlevel[lv];
518 		for ( ; e < p; p -= 2) {
519 		    *d++ = p[-2];
520 		    *d++ = p[-1];
521 		}
522 	    }
523 	    else {
524 		p = eachlevel[lv];
525 		e = s[lv];
526 		while (p < e)
527 		    *d++ = *p++;
528 	    }
529 	    if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
530 		*d++ = '\0';
531 		*d++ = '\0';
532 	    }
533 	}
534 
535 	for (lv = level; lv < MaxLevel; lv++) {
536 	    if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
537 		*d++ = '\0';
538 		*d++ = '\0';
539 	    }
540 	}
541 
542 	for (lv = 0; lv < level; lv++) {
543 	    Safefree(eachlevel[lv]);
544 	}
545     }
546     *d = '\0';
547     SvCUR_set(dst, d - (U8*)SvPVX(dst));
548     RETVAL = dst;
549 OUTPUT:
550     RETVAL
551 
552 
553 SV*
554 varCE (self, vce)
555     SV* self;
556     SV* vce;
557   PREINIT:
558     SV *dst, *vbl, **svp;
559     HV *selfHV;
560     U8 *a, *v, *d;
561     STRLEN alen, vlen;
562     bool ig_l2;
563     UV totwt;
564   CODE:
565     if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
566 	selfHV = (HV*)SvRV(self);
567     else
568 	croak("$self is not a HASHREF.");
569 
570     svp = hv_fetch(selfHV, "ignore_level2", 13, FALSE);
571     ig_l2 = svp ? SvTRUE(*svp) : FALSE;
572 
573     svp = hv_fetch(selfHV, "variable", 8, FALSE);
574     vbl = svp ? *svp : &PL_sv_no;
575     a = (U8*)SvPV(vbl, alen);
576     v = (U8*)SvPV(vce, vlen);
577 
578     dst = newSV(vlen);
579     d = (U8*)SvPVX(dst);
580     (void)SvPOK_only(dst);
581     Copy(v, d, vlen, U8);
582     SvCUR_set(dst, vlen);
583     d[vlen] = '\0';
584 
585     /* primary weight == 0 && secondary weight != 0 */
586     if (ig_l2 && !d[1] && !d[2] && (d[3] || d[4])) {
587 	d[3] = d[4] = d[5] = d[6] = '\0';
588     }
589 
590     /* variable: checked only the first char and the length,
591        trusting checkCollator() and %VariableOK in Perl ... */
592 
593     if (vlen < VCE_Length /* ignore short VCE (unexpected) */
594 	||
595 	*a == 'n') /* non-ignorable */
596 	1;
597     else if (*v) {
598 	if (*a == 's') { /* shifted or shift-trimmed */
599 	    d[7] = d[1]; /* wt level 1 to 4 */
600 	    d[8] = d[2];
601 	} /* else blanked */
602 
603 	d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
604     }
605     else if (*a == 'b') /* blanked */
606 	1;
607     else if (*a == 's') { /* shifted or shift-trimmed */
608 	totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
609 	if (alen == 7 && totwt != 0) { /* shifted */
610 	    if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */
611 		d[7] = d[1]; /* wt level 1 to 4 */
612 		d[8] = d[2];
613 	    } else {
614 		d[7] = (U8)(Shift4Wt >> 8);
615 		d[8] = (U8)(Shift4Wt & 0xFF);
616 	    }
617 	} else { /* shift-trimmed or completely ignorable */
618 	    d[7] = d[8] = '\0';
619 	}
620     }
621     else
622 	croak("unknown variable value '%s'", a);
623     RETVAL = dst;
624 OUTPUT:
625     RETVAL
626 
627 
628 
629 SV*
630 visualizeSortKey (self, key)
631     SV * self
632     SV * key
633   PREINIT:
634     HV *selfHV;
635     SV **svp, *dst;
636     U8 *s, *e, *d;
637     STRLEN klen, dlen;
638     UV uv;
639     IV uca_vers, sep = 0;
640     static const char *upperhex = "0123456789ABCDEF";
641   CODE:
642     if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
643 	selfHV = (HV*)SvRV(self);
644     else
645 	croak("$self is not a HASHREF.");
646 
647     svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
648     if (!svp)
649 	croak("Panic: no $self->{UCA_Version} in visualizeSortKey");
650     uca_vers = SvIV(*svp);
651 
652     s = (U8*)SvPV(key, klen);
653 
654    /* slightly *longer* than the need, but I'm afraid of miscounting;
655       = (klen / 2) * 5 - 1
656              # FFFF and ' ' for each 16bit units but ' ' is less by 1;
657              # ' ' and '|' for level boundaries including the identical level
658        + 2   # '[' and ']'
659        + 1   # '\0'
660        (a) if klen is odd (not expected), maybe more 5 bytes.
661        (b) there is not always the identical level.
662    */
663     dlen = (klen / 2) * 5 + MaxLevel * 2 + 2;
664     dst = newSV(dlen);
665     (void)SvPOK_only(dst);
666     d = (U8*)SvPVX(dst);
667 
668     *d++ = '[';
669     for (e = s + klen; s < e; s += 2) {
670 	uv = (U16)(*s << 8 | s[1]);
671 	if (uv || sep >= MaxLevel) {
672 	    if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|')))
673 		*d++ = ' ';
674 	    *d++ = upperhex[ (s[0] >> 4) & 0xF ];
675 	    *d++ = upperhex[  s[0]       & 0xF ];
676 	    *d++ = upperhex[ (s[1] >> 4) & 0xF ];
677 	    *d++ = upperhex[  s[1]       & 0xF ];
678 	} else {
679 	    if ((9 <= uca_vers) && (d[-1] != '['))
680 		*d++ = ' ';
681 	    *d++ = '|';
682 	    ++sep;
683 	}
684     }
685     *d++ = ']';
686     *d   = '\0';
687     SvCUR_set(dst, d - (U8*)SvPVX(dst));
688     RETVAL = dst;
689 OUTPUT:
690     RETVAL
691 
692 
693 
694 void
695 unpackUfor56 (src)
696     SV* src
697   PREINIT:
698     STRLEN srclen, retlen;
699     U8 *s, *p, *e;
700     UV uv;
701   PPCODE:
702     s = (U8*)SvPV(src,srclen);
703     if (!SvUTF8(src)) {
704 	SV* tmpsv = sv_mortalcopy(src);
705 	if (!SvPOK(tmpsv))
706 	    (void)sv_pvn_force(tmpsv,&srclen);
707 	sv_utf8_upgrade(tmpsv);
708 	s = (U8*)SvPV(tmpsv,srclen);
709     }
710     e = s + srclen;
711 
712     for (p = s; p < e; p += retlen) {
713 	uv = GET_UV_FOR_5_6; /* perl 5.6.x workaround */
714 	if (!retlen)
715 	    croak("panic (Unicode::Collate): zero-length character");
716 	XPUSHs(sv_2mortal(newSVuv(uv)));
717     }
718 
719