1 
2 #include "EXTERN.h"
3 #include "perl.h"
4 #include "XSUB.h"
5 
6 /* These 5 files are prepared by mkheader */
7 #include "unfcmb.h"
8 #include "unfcan.h"
9 #include "unfcpt.h"
10 #include "unfcmp.h"
11 #include "unfexc.h"
12 
13 /* Perl 5.6.1 ? */
14 #ifndef uvuni_to_utf8
15 #define uvuni_to_utf8   uv_to_utf8
16 #endif /* uvuni_to_utf8 */
17 
18 /* Perl 5.6.1 ? */
19 #ifndef utf8n_to_uvuni
20 #define utf8n_to_uvuni  utf8_to_uv
21 #endif /* utf8n_to_uvuni */
22 
23 /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
24 #ifdef UTF8_ALLOW_BOM
25 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
26 #else
27 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
28 #endif
29 
/* Croak message used when utf8n_to_uvuni() sets retlen to 0; a zero
 * length would otherwise stall the loops that advance by retlen. */
31 #define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"
32 
/* Croak message used when utf8_hop() steps back past the start of the
 * buffer; the UTF-8 being scanned is probably malformed. */
34 #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
35 
/* Code points above 0x10ffff are skipped by every table lookup below, so
 * they pass through unchanged and without complaint. */
37 #define VALID_UTF_MAX    (0x10ffff)
38 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
39 
40 /* HANGUL_H */
41 #define Hangul_SBase  0xAC00
42 #define Hangul_SFinal 0xD7A3
43 #define Hangul_SCount  11172
44 
45 #define Hangul_NCount    588
46 
47 #define Hangul_LBase  0x1100
48 #define Hangul_LFinal 0x1112
49 #define Hangul_LCount     19
50 
51 #define Hangul_VBase  0x1161
52 #define Hangul_VFinal 0x1175
53 #define Hangul_VCount     21
54 
55 #define Hangul_TBase  0x11A7
56 #define Hangul_TFinal 0x11C2
57 #define Hangul_TCount     28
58 
59 #define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
60 #define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
61 #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
62 #define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
63 #define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
64 #define Hangul_IsT(u)  ((Hangul_TBase  < (u)) && ((u) <= Hangul_TFinal))
65 /* HANGUL_H */
66 
67 /* this is used for canonical ordering of combining characters (c.c.). */
68 typedef struct {
69     U8 cc;	/* combining class */
70     UV uv;	/* codepoint */
71     STRLEN pos; /* position */
72 } UNF_cc;
73 
compare_cc(const void * a,const void * b)74 static int compare_cc (const void *a, const void *b)
75 {
76     int ret_cc;
77     ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
78     if (ret_cc)
79 	return ret_cc;
80 
81     return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
82 	 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
83 }
84 
dec_canonical(UV uv)85 static U8* dec_canonical (UV uv)
86 {
87     U8 ***plane, **row;
88     if (OVER_UTF_MAX(uv))
89 	return NULL;
90     plane = (U8***)UNF_canon[uv >> 16];
91     if (! plane)
92 	return NULL;
93     row = plane[(uv >> 8) & 0xff];
94     return row ? row[uv & 0xff] : NULL;
95 }
96 
dec_compat(UV uv)97 static U8* dec_compat (UV uv)
98 {
99     U8 ***plane, **row;
100     if (OVER_UTF_MAX(uv))
101 	return NULL;
102     plane = (U8***)UNF_compat[uv >> 16];
103     if (! plane)
104 	return NULL;
105     row = plane[(uv >> 8) & 0xff];
106     return row ? row[uv & 0xff] : NULL;
107 }
108 
/*
 * Returns the code point obtained by composing uv with the following
 * character uv2, or 0 when the pair does not compose.
 *
 * Hangul is handled arithmetically: a leading consonant plus a vowel gives
 * an LV syllable, and an LV syllable plus a trailing consonant gives an
 * LVT syllable.  Every other pair is looked up in UNF_compos, whose cells
 * are arrays of (nextchar, composite) entries terminated by an entry with
 * nextchar == 0.
 *
 * A zero uv2, or either argument above VALID_UTF_MAX, yields 0.
 */
static UV composite_uv (UV uv, UV uv2)
{
    UNF_complist ***plane;
    UNF_complist **row;
    UNF_complist *entry;

    if (OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2) || uv2 == 0)
        return 0;

    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
        UV lindex = uv  - Hangul_LBase;
        UV vindex = uv2 - Hangul_VBase;
        return Hangul_SBase + (lindex * Hangul_VCount + vindex) * Hangul_TCount;
    }

    if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
        UV tindex = uv2 - Hangul_TBase;
        return uv + tindex;
    }

    plane = UNF_compos[uv >> 16];
    if (plane == NULL)
        return 0;

    row = plane[(uv >> 8) & 0xff];
    if (row == NULL)
        return 0;

    entry = row[uv & 0xff];
    if (entry == NULL)
        return 0;

    /* the list ends at the first entry whose nextchar is zero */
    while (entry->nextchar) {
        if (entry->nextchar == uv2)
            return entry->composite;
        entry++;
    }
    return 0;
}
140 
getCombinClass(UV uv)141 static U8 getCombinClass (UV uv)
142 {
143     U8 **plane, *row;
144     if (OVER_UTF_MAX(uv))
145 	return 0;
146     plane = (U8**)UNF_combin[uv >> 16];
147     if (! plane)
148 	return 0;
149     row = plane[(uv >> 8) & 0xff];
150     return row ? row[uv & 0xff] : 0;
151 }
152 
/*
 * Appends the jamo decomposition of the Hangul syllable uv to sv.
 *
 * The syllable is split arithmetically into a leading consonant, a vowel
 * and, when the trailing index is non-zero, a trailing consonant.  Each
 * jamo is encoded with uvuni_to_utf8() and the bytes are appended with
 * sv_catpvn().  Nothing is appended when uv is not a Hangul syllable.
 */
static void sv_cat_decompHangul (SV* sv, UV uv)
{
    U8 buf[3 * UTF8_MAXLEN + 1];
    U8 *end = buf;
    UV sindex, tindex;

    if (! Hangul_IsS(uv))
        return;

    sindex = uv - Hangul_SBase;
    tindex = sindex % Hangul_TCount;

    end = uvuni_to_utf8(end, (sindex / Hangul_NCount + Hangul_LBase));
    end = uvuni_to_utf8(end,
                        ((sindex % Hangul_NCount) / Hangul_TCount + Hangul_VBase));
    if (tindex != 0)
        end = uvuni_to_utf8(end, (tindex + Hangul_TBase));

    *end = '\0';
    sv_catpvn(sv, (char *)buf, end - buf);
}
174 
/*
 * Appends the single code point uv to sv: the character is encoded into a
 * local buffer with uvuni_to_utf8() and the bytes are added with
 * sv_catpvn().
 */
static void sv_cat_uvuni (SV* sv, UV uv)
{
    U8 buf[UTF8_MAXLEN + 1];
    U8 *end = uvuni_to_utf8(buf, uv);

    *end = '\0';
    sv_catpvn(sv, (char *)buf, end - buf);
}
184 
185 MODULE = Unicode::Normalize	PACKAGE = Unicode::Normalize
186 
SV*
decompose(arg, compat = &PL_sv_no)
    SV * arg
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    /*
     * Returns a new UTF-8 string in which every character of arg is
     * replaced by its decomposition.  Hangul syllables are decomposed
     * arithmetically; other characters use the compatibility table when
     * compat is true and the canonical table otherwise.  Characters with
     * no table entry are copied unchanged.
     */
    UV uv;
    SV *src, *dst;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, *r;
    bool iscompat;
  CODE:
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        /* work on an upgraded copy so the caller's scalar is untouched */
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }
    iscompat = SvTRUE(compat);

    /*
     * Start from a real, NUL-terminated empty string.  newSV() only
     * reserves uninitialised storage, so flagging it as a string would
     * hand back an unterminated buffer when the input is empty.
     */
    dst = newSVpvn("", 0);
    SvUTF8_on(dst);

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;
    for (p = s; p < e; p += retlen) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen) {
            /* dst is not mortal yet; release it before unwinding */
            SvREFCNT_dec(dst);
            croak(ErrRetlenIsZero);
        }

        if (Hangul_IsS(uv))
            sv_cat_decompHangul(dst, uv);
        else {
            r = iscompat ? dec_compat(uv) : dec_canonical(uv);
            if (r)
                sv_catpv(dst, (char *)r);
            else
                sv_cat_uvuni(dst, uv);
        }
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL
231 
232 
233 
SV*
reorder(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    /*
     * Returns a new UTF-8 string in which each run of characters with a
     * non-zero combining class is sorted by combining class.  Characters
     * of equal class keep their original relative order (see compare_cc).
     * Characters of class 0 are copied through unchanged.
     */
    SV *src, *dst;
    STRLEN srclen, dstlen, retlen, stk_cc_max;
    U8 *s, *e, *p, *d, curCC;
    UV uv, uvlast;
    UNF_cc * stk_cc;
    STRLEN i, cc_pos;
    bool valid_uvlast;
  CODE:
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        /* work on an upgraded copy so the caller's scalar is untouched */
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src, srclen);
    e = s + srclen;
    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    stk_cc_max = 10; /* initial capacity; doubled on demand below */
    New(0, stk_cc, stk_cc_max, UNF_cc);

    for (p = s; p < e;) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen) {
            /* neither allocation is owned by Perl yet: free both */
            Safefree(stk_cc);
            SvREFCNT_dec(dst);
            croak(ErrRetlenIsZero);
        }
        p += retlen;

        curCC = getCombinClass(uv);
        if (curCC == 0) {
            d = uvuni_to_utf8(d, uv);
            continue;
        }

        /* first combining character of a run */
        cc_pos = 0;
        stk_cc[cc_pos].cc  = curCC;
        stk_cc[cc_pos].uv  = uv;
        stk_cc[cc_pos].pos = cc_pos;

        /* collect the rest of the run, up to the next class-0 character */
        valid_uvlast = FALSE;
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen) {
                Safefree(stk_cc);
                SvREFCNT_dec(dst);
                croak(ErrRetlenIsZero);
            }
            p += retlen;

            curCC = getCombinClass(uv);
            if (curCC == 0) {
                /* remember the terminating character; emitted after the run */
                uvlast = uv;
                valid_uvlast = TRUE;
                break;
            }

            cc_pos++;
            if (stk_cc_max <= cc_pos) {
                /* grow geometrically so long runs need few reallocations */
                stk_cc_max *= 2;
                Renew(stk_cc, stk_cc_max, UNF_cc);
            }
            stk_cc[cc_pos].cc  = curCC;
            stk_cc[cc_pos].uv  = uv;
            stk_cc[cc_pos].pos = cc_pos;
        }

        /* sorting is only needed when the run has at least two entries */
        if (cc_pos) {
            qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
        }

        for (i = 0; i <= cc_pos; i++) {
            d = uvuni_to_utf8(d, stk_cc[i].uv);
        }
        if (valid_uvlast)
        {
            d = uvuni_to_utf8(d, uvlast);
        }
    }
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    Safefree(stk_cc);
    RETVAL = dst;
  OUTPUT:
    RETVAL
325 
326 
327 
SV*
compose(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    /*
     * Returns a new UTF-8 string in which each starter (combining class
     * 0) is combined with the characters that follow it, as far as
     * composite_uv() offers a composite that is not excluded by
     * isExclusion().  Called as composeContiguous (ix == 1), a character
     * is only composed while no uncomposed character has been kept since
     * the starter.
     */
    SV  *src, *dst, *tmp;
    U8  *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
    /* uv is copied into uvS after the inner loop even when that loop
     * never ran, so it must start with a defined value */
    UV uv = 0;
    UV uvS, uvComp;
    STRLEN srclen, dstlen, tmplen, retlen;
    bool beginning = TRUE;
  CODE:
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        /* work on an upgraded copy so the caller's scalar is untouched */
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src, srclen);
    e = s + srclen;
    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    /* scratch buffer for the characters that were not composed */
    tmp = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(tmp);
    SvUTF8_on(tmp);

    for (p = s; p < e;) {
        if (beginning) {
            uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen) {
                /* dst is not mortal yet; release it before unwinding */
                SvREFCNT_dec(dst);
                croak(ErrRetlenIsZero);
            }
            p += retlen;

            if (getCombinClass(uvS)) { /* no Starter found yet */
                d = uvuni_to_utf8(d, uvS);
                continue;
            }
            beginning = FALSE;
        }

        /* uvS now holds a Starter; restart the scratch buffer */
        t = tmp_start = (U8*)SvPVX(tmp);
        preCC = 0;

        /* scan forward to the next Starter */
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen) {
                SvREFCNT_dec(dst);
                croak(ErrRetlenIsZero);
            }
            p += retlen;

            curCC = getCombinClass(uv);

            if (preCC && preCC == curCC) {
                /* blocked by the previous character of the same class */
                preCC = curCC;
                t = uvuni_to_utf8(t, uv);
            } else {
                uvComp = composite_uv(uvS, uv);

                if (uvComp && ! isExclusion(uvComp) &&
                        (ix ? (t == tmp_start) : (preCC <= curCC))) {
                    STRLEN leftcur, rightcur, dstcur;
                    leftcur  = UNISKIP(uvComp);
                    rightcur = UNISKIP(uvS) + UNISKIP(uv);

                    /* the composite may need more bytes than its parts */
                    if (leftcur > rightcur) {
                        dstcur = d - (U8*)SvPVX(dst);
                        dstlen += leftcur - rightcur;
                        d = (U8*)SvGROW(dst,dstlen) + dstcur;
                    }
                    /* preCC not changed to curCC */
                    uvS = uvComp;
                } else if (! curCC && p < e) { /* blocked */
                    break;
                } else {
                    preCC = curCC;
                    t = uvuni_to_utf8(t, uv);
                }
            }
        }
        d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
        tmplen = t - tmp_start;
        if (tmplen) { /* uncomposed combining char */
            t = (U8*)SvPVX(tmp);
            while (tmplen--)
                *d++ = *t++;
        }
        /* after a break, uv is the Starter for the next round */
        uvS = uv;
    } /* for */
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL
429 
430 
void
checkNFD(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    UV uv;
    SV *src;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
  CODE:
    /*
     * Returns true when arg is already decomposed and canonically
     * ordered, false otherwise.  Called as checkNFKD (ix == 1) the
     * compatibility table is consulted instead of the canonical one.
     */
    src = arg;
    if (! SvUTF8(arg)) {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);
        p += retlen;

        curCC = getCombinClass(uv);

        /* a lower non-zero class after a higher one breaks the ordering */
        if (curCC != 0 && curCC < preCC) {
            XSRETURN_NO;
        }

        /* Hangul syllables always decompose */
        if (Hangul_IsS(uv)) {
            XSRETURN_NO;
        }

        /* any character with a decomposition rules the form out */
        if ((ix ? dec_compat(uv) : dec_canonical(uv)) != NULL) {
            XSRETURN_NO;
        }

        preCC = curCC;
    }
    XSRETURN_YES;
467 
468 
469 
void
checkNFC(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    UV uv;
    SV *src;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool isMAYBE;
  CODE:
    /*
     * Quick check for the composed forms.  Returns false as soon as a
     * character rules the form out, undef when only "maybe" characters
     * (isComp2nd) were met, and true otherwise.  Called as checkNFKC
     * (ix == 1) a character whose compatibility mapping differs from its
     * canonical mapping also gives false.
     */
    src = arg;
    if (! SvUTF8(arg)) {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    isMAYBE = FALSE;
    preCC = 0;
    for (p = s; p < e; p += retlen) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);

        curCC = getCombinClass(uv);

        /* a lower non-zero class after a higher one breaks the ordering */
        if (curCC != 0 && curCC < preCC) {
            XSRETURN_NO;
        }
        preCC = curCC;

        /* Hangul syllables are canonical composites: always acceptable */
        if (Hangul_IsS(uv))
            continue;

        if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
            XSRETURN_NO;
        }

        if (isComp2nd(uv)) {
            isMAYBE = TRUE;
        }
        else if (ix) {
            /* NFKC_NO when the character has a compatibility mapping of
             * its own, i.e. one that is not just its canonical mapping */
            char *canon  = (char *) dec_canonical(uv);
            char *compat = (char *) dec_compat(uv);

            if (compat && (!canon || strNE(canon, compat))) {
                XSRETURN_NO;
            }
        }
    }

    if (isMAYBE) {
        XSRETURN_UNDEF;
    }
    XSRETURN_YES;
527 
528 
529 
void
checkFCD(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    UV uv, uvLead, uvTrail;
    SV *src;
    STRLEN srclen, retlen, canlen, canret;
    U8 *s, *e, *p, curCC, preCC;
    U8 *sCan, *pCan, *eCan;
    bool isMAYBE;
  CODE:
    /*
     * Checks canonical ordering across character boundaries using, for
     * each character, the combining class of the first character of its
     * canonical decomposition (compared with the previous class) and of
     * the last one (remembered for the next character).  Returns false
     * on an ordering violation and true otherwise.  Called as checkFCC
     * (ix == 1) it also returns false for characters flagged by
     * isExclusion/isSingleton/isNonStDecomp, and undef when only
     * isComp2nd characters were met.
     */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    preCC = 0;
    isMAYBE = FALSE;
    for (p = s; p < e; p += retlen) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);

        sCan = (U8*) dec_canonical(uv);

        if (sCan) {
            canlen = (STRLEN)strlen((char *) sCan);
            uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
            /* same sanity check as for every decode of the input */
            if (!canret)
                croak(ErrRetlenIsZero);
        }
        else {
            uvLead = uv;
        }

        curCC = getCombinClass(uvLead);

        if (curCC != 0 && curCC < preCC) /* canonical ordering violated */
            XSRETURN_NO;

        if (ix) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
                XSRETURN_NO;
            else if (isComp2nd(uv))
                isMAYBE = TRUE;
        }

        if (sCan) {
            /* class of the last character of the decomposition */
            eCan = sCan + canlen;
            pCan = utf8_hop(eCan, -1);
            if (pCan < sCan)
                croak(ErrHopBeforeStart);
            uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
            if (!canret)
                croak(ErrRetlenIsZero);
            preCC = getCombinClass(uvTrail);
        }
        else {
            preCC = curCC;
        }
    }
    if (isMAYBE)
        XSRETURN_UNDEF;
    else
        XSRETURN_YES;
599 
600 
601 
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $
  CODE:
    /* Perl-visible wrapper: forwards uv to the C helper of the same
     * name and returns its result. */
    RETVAL = getCombinClass(uv);
  OUTPUT:
    RETVAL
606 
bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $
  CODE:
    /* Perl-visible wrapper: forwards uv to the C-level isExclusion()
     * (not defined in this file) and returns its boolean result. */
    RETVAL = isExclusion(uv);
  OUTPUT:
    RETVAL
611 
bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $
  CODE:
    /* Perl-visible wrapper: forwards uv to the C-level isSingleton()
     * (not defined in this file) and returns its boolean result. */
    RETVAL = isSingleton(uv);
  OUTPUT:
    RETVAL
616 
bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $
  CODE:
    /* Perl-visible wrapper: forwards uv to the C-level isNonStDecomp()
     * (not defined in this file) and returns its boolean result. */
    RETVAL = isNonStDecomp(uv);
  OUTPUT:
    RETVAL
621 
bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE  = 1
    isNFKC_MAYBE = 2
  CODE:
    /* Perl-visible wrapper: forwards uv to the C-level isComp2nd()
     * (not defined in this file).  All three names run this same code;
     * the alias index is not consulted. */
    RETVAL = isComp2nd(uv);
  OUTPUT:
    RETVAL
629 
630 
631 
void
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  CODE:
    /*
     * Returns true when uv is a Hangul syllable or has a decomposition
     * table entry: canonical for isNFD_NO, compatibility for isNFKD_NO
     * (ix == 1).  Returns false otherwise.
     */
    if (! Hangul_IsS(uv)) {
        U8 *mapping = ix ? dec_compat(uv) : dec_canonical(uv);

        if (mapping == NULL) {
            XSRETURN_NO;
        }
    }
    XSRETURN_YES; /* NFD_NO or NFKD_NO */
643 
644 
645 
void
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO  = 0
    isNFKC_NO = 1
  CODE:
    /*
     * Returns true when uv is flagged by isExclusion(), isSingleton() or
     * isNonStDecomp().  Called as isNFKC_NO (ix == 1) it also returns
     * true when uv has a compatibility mapping that is not identical to
     * its canonical mapping.  Returns false otherwise.
     */
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
        XSRETURN_YES; /* NFC_NO or NFKC_NO */
    }

    if (ix) {
        char *canon  = (char *) dec_canonical(uv);
        char *compat = (char *) dec_compat(uv);

        if (compat != NULL && (canon == NULL || strNE(canon, compat))) {
            XSRETURN_YES; /* NFKC_NO */
        }
    }

    XSRETURN_NO;
667 
668 
669 
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    /* Returns the composite of uv followed by uv2 as an unsigned
     * integer, or undef when composite_uv() reports no composition. */
    composite = composite_uv(uv, uv2);
    if (composite)
        RETVAL = newSVuv(composite);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
682 
683 
684 
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  PREINIT:
    U8 * rstr;
  CODE:
    /*
     * Returns the decomposition of uv as a UTF-8 string, or undef when
     * there is none.  Hangul syllables are decomposed arithmetically;
     * other code points use the canonical table, or the compatibility
     * table when called as getCompat (ix == 1).
     */
    if (! Hangul_IsS(uv)) {
        rstr = ix ? dec_compat(uv) : dec_canonical(uv);
        if (rstr == NULL) {
            XSRETURN_UNDEF;
        }
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    } else {
        RETVAL = newSV(1);
        (void)SvPOK_only(RETVAL);
        sv_cat_decompHangul(RETVAL, uv);
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
709 
710 
void
splitOnLastStarter(arg)
    SV * arg
  PREINIT:
    UV uv;
    SV *src, *svp;
    STRLEN srclen, retlen;
    U8 *s, *e, *p;
  PPCODE:
    /*
     * Pushes two UTF-8 strings: the part of arg before its last
     * character of combining class 0, and the part from that character
     * to the end.  When no such character exists the first string is
     * empty and the second is the whole input.
     */
    src = arg;
    if (! SvUTF8(arg)) {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    /* walk backwards one character at a time */
    p = e;
    while (s < p) {
        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (getCombinClass(uv) == 0) /* Last Starter found */
            break;
    }

    /* everything before the split point */
    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);

    /* the split point and everything after it */
    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);
746 
747