1*0Sstevel@tonic-gate /*
2*0Sstevel@tonic-gate  $Id: Unicode.xs,v 1.9 2003/12/29 02:47:16 dankogai Exp dankogai $
3*0Sstevel@tonic-gate  */
4*0Sstevel@tonic-gate 
5*0Sstevel@tonic-gate #define PERL_NO_GET_CONTEXT
6*0Sstevel@tonic-gate #include "EXTERN.h"
7*0Sstevel@tonic-gate #include "perl.h"
8*0Sstevel@tonic-gate #include "XSUB.h"
9*0Sstevel@tonic-gate #define U8 U8
10*0Sstevel@tonic-gate #include "../Encode/encode.h"
11*0Sstevel@tonic-gate 
12*0Sstevel@tonic-gate #define FBCHAR			0xFFFd
13*0Sstevel@tonic-gate #define BOM_BE			0xFeFF
14*0Sstevel@tonic-gate #define BOM16LE			0xFFFe
15*0Sstevel@tonic-gate #define BOM32LE			0xFFFe0000
16*0Sstevel@tonic-gate #define issurrogate(x)		(0xD800 <= (x)  && (x) <= 0xDFFF )
17*0Sstevel@tonic-gate #define isHiSurrogate(x)	(0xD800 <= (x)  && (x) <  0xDC00 )
18*0Sstevel@tonic-gate #define isLoSurrogate(x)	(0xDC00 <= (x)  && (x) <= 0xDFFF )
19*0Sstevel@tonic-gate #define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )
20*0Sstevel@tonic-gate 
21*0Sstevel@tonic-gate static UV
enc_unpack(pTHX_ U8 ** sp,U8 * e,STRLEN size,U8 endian)22*0Sstevel@tonic-gate enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
23*0Sstevel@tonic-gate {
24*0Sstevel@tonic-gate     U8 *s = *sp;
25*0Sstevel@tonic-gate     UV v = 0;
26*0Sstevel@tonic-gate     if (s+size > e) {
27*0Sstevel@tonic-gate 	croak("Partial character %c",(char) endian);
28*0Sstevel@tonic-gate     }
29*0Sstevel@tonic-gate     switch(endian) {
30*0Sstevel@tonic-gate     case 'N':
31*0Sstevel@tonic-gate 	v = *s++;
32*0Sstevel@tonic-gate 	v = (v << 8) | *s++;
33*0Sstevel@tonic-gate     case 'n':
34*0Sstevel@tonic-gate 	v = (v << 8) | *s++;
35*0Sstevel@tonic-gate 	v = (v << 8) | *s++;
36*0Sstevel@tonic-gate 	break;
37*0Sstevel@tonic-gate     case 'V':
38*0Sstevel@tonic-gate     case 'v':
39*0Sstevel@tonic-gate 	v |= *s++;
40*0Sstevel@tonic-gate 	v |= (*s++ << 8);
41*0Sstevel@tonic-gate 	if (endian == 'v')
42*0Sstevel@tonic-gate 	    break;
43*0Sstevel@tonic-gate 	v |= (*s++ << 16);
44*0Sstevel@tonic-gate 	v |= (*s++ << 24);
45*0Sstevel@tonic-gate 	break;
46*0Sstevel@tonic-gate     default:
47*0Sstevel@tonic-gate 	croak("Unknown endian %c",(char) endian);
48*0Sstevel@tonic-gate 	break;
49*0Sstevel@tonic-gate     }
50*0Sstevel@tonic-gate     *sp = s;
51*0Sstevel@tonic-gate     return v;
52*0Sstevel@tonic-gate }
53*0Sstevel@tonic-gate 
54*0Sstevel@tonic-gate void
enc_pack(pTHX_ SV * result,STRLEN size,U8 endian,UV value)55*0Sstevel@tonic-gate enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
56*0Sstevel@tonic-gate {
57*0Sstevel@tonic-gate     U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size+1);
58*0Sstevel@tonic-gate     switch(endian) {
59*0Sstevel@tonic-gate     case 'v':
60*0Sstevel@tonic-gate     case 'V':
61*0Sstevel@tonic-gate 	d += SvCUR(result);
62*0Sstevel@tonic-gate 	SvCUR_set(result,SvCUR(result)+size);
63*0Sstevel@tonic-gate 	while (size--) {
64*0Sstevel@tonic-gate 	    *d++ = (U8)(value & 0xFF);
65*0Sstevel@tonic-gate 	    value >>= 8;
66*0Sstevel@tonic-gate 	}
67*0Sstevel@tonic-gate 	break;
68*0Sstevel@tonic-gate     case 'n':
69*0Sstevel@tonic-gate     case 'N':
70*0Sstevel@tonic-gate 	SvCUR_set(result,SvCUR(result)+size);
71*0Sstevel@tonic-gate 	d += SvCUR(result);
72*0Sstevel@tonic-gate 	while (size--) {
73*0Sstevel@tonic-gate 	    *--d = (U8)(value & 0xFF);
74*0Sstevel@tonic-gate 	    value >>= 8;
75*0Sstevel@tonic-gate 	}
76*0Sstevel@tonic-gate 	break;
77*0Sstevel@tonic-gate     default:
78*0Sstevel@tonic-gate 	croak("Unknown endian %c",(char) endian);
79*0Sstevel@tonic-gate 	break;
80*0Sstevel@tonic-gate     }
81*0Sstevel@tonic-gate }
82*0Sstevel@tonic-gate 
83*0Sstevel@tonic-gate MODULE = Encode::Unicode PACKAGE = Encode::Unicode
84*0Sstevel@tonic-gate 
85*0Sstevel@tonic-gate PROTOTYPES: DISABLE
86*0Sstevel@tonic-gate 
87*0Sstevel@tonic-gate #define attr(k, l)  (hv_exists((HV *)SvRV(obj),k,l) ? \
88*0Sstevel@tonic-gate     *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
89*0Sstevel@tonic-gate 
90*0Sstevel@tonic-gate void
91*0Sstevel@tonic-gate decode_xs(obj, str, check = 0)
92*0Sstevel@tonic-gate SV *	obj
93*0Sstevel@tonic-gate SV *	str
94*0Sstevel@tonic-gate IV	check
95*0Sstevel@tonic-gate CODE:
96*0Sstevel@tonic-gate {
97*0Sstevel@tonic-gate     U8 endian   = *((U8 *)SvPV_nolen(attr("endian", 6)));
98*0Sstevel@tonic-gate     int size    =   SvIV(attr("size",   4));
99*0Sstevel@tonic-gate     int ucs2    = SvTRUE(attr("ucs2",   4));
100*0Sstevel@tonic-gate     int clone   = SvTRUE(attr("clone",  5));
101*0Sstevel@tonic-gate     SV *result  = newSVpvn("",0);
102*0Sstevel@tonic-gate     STRLEN ulen;
103*0Sstevel@tonic-gate     U8 *s = (U8 *)SvPVbyte(str,ulen);
104*0Sstevel@tonic-gate     U8 *e = (U8 *)SvEND(str);
105*0Sstevel@tonic-gate     ST(0) = sv_2mortal(result);
106*0Sstevel@tonic-gate     SvUTF8_on(result);
107*0Sstevel@tonic-gate 
108*0Sstevel@tonic-gate     if (!endian && s+size <= e) {
109*0Sstevel@tonic-gate 	UV bom;
110*0Sstevel@tonic-gate 	endian = (size == 4) ? 'N' : 'n';
111*0Sstevel@tonic-gate 	bom = enc_unpack(aTHX_ &s,e,size,endian);
112*0Sstevel@tonic-gate         if (bom != BOM_BE) {
113*0Sstevel@tonic-gate 	    if (bom == BOM16LE) {
114*0Sstevel@tonic-gate 		endian = 'v';
115*0Sstevel@tonic-gate 	    }
116*0Sstevel@tonic-gate 	    else if (bom == BOM32LE) {
117*0Sstevel@tonic-gate 		endian = 'V';
118*0Sstevel@tonic-gate 	    }
119*0Sstevel@tonic-gate 	    else {
120*0Sstevel@tonic-gate 		croak("%"SVf":Unrecognised BOM %"UVxf,
121*0Sstevel@tonic-gate                       *hv_fetch((HV *)SvRV(obj),"Name",4,0),
122*0Sstevel@tonic-gate 		      bom);
123*0Sstevel@tonic-gate 	    }
124*0Sstevel@tonic-gate 	}
125*0Sstevel@tonic-gate #if 1
126*0Sstevel@tonic-gate 	/* Update endian for next sequence */
127*0Sstevel@tonic-gate 	if (clone) {
128*0Sstevel@tonic-gate 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
129*0Sstevel@tonic-gate 	}
130*0Sstevel@tonic-gate #endif
131*0Sstevel@tonic-gate     }
132*0Sstevel@tonic-gate     while (s < e && s+size <= e) {
133*0Sstevel@tonic-gate 	UV ord = enc_unpack(aTHX_ &s,e,size,endian);
134*0Sstevel@tonic-gate 	U8 *d;
135*0Sstevel@tonic-gate 	if (size != 4 && invalid_ucs2(ord)) {
136*0Sstevel@tonic-gate 	    if (ucs2) {
137*0Sstevel@tonic-gate 		if (check) {
138*0Sstevel@tonic-gate 		    croak("%"SVf":no surrogates allowed %"UVxf,
139*0Sstevel@tonic-gate 			  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
140*0Sstevel@tonic-gate 			  ord);
141*0Sstevel@tonic-gate 		}
142*0Sstevel@tonic-gate 		if (s+size <= e) {
143*0Sstevel@tonic-gate                     /* skip the next one as well */
144*0Sstevel@tonic-gate 		    enc_unpack(aTHX_ &s,e,size,endian);
145*0Sstevel@tonic-gate 		}
146*0Sstevel@tonic-gate 		ord = FBCHAR;
147*0Sstevel@tonic-gate 	    }
148*0Sstevel@tonic-gate 	    else {
149*0Sstevel@tonic-gate 		UV lo;
150*0Sstevel@tonic-gate 		if (!isHiSurrogate(ord)) {
151*0Sstevel@tonic-gate 		    croak("%"SVf":Malformed HI surrogate %"UVxf,
152*0Sstevel@tonic-gate 			  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
153*0Sstevel@tonic-gate 			  ord);
154*0Sstevel@tonic-gate 		}
155*0Sstevel@tonic-gate 		if (s+size > e) {
156*0Sstevel@tonic-gate 		    /* Partial character */
157*0Sstevel@tonic-gate 		    s -= size;   /* back up to 1st half */
158*0Sstevel@tonic-gate 		    break;       /* And exit loop */
159*0Sstevel@tonic-gate 		}
160*0Sstevel@tonic-gate 		lo = enc_unpack(aTHX_ &s,e,size,endian);
161*0Sstevel@tonic-gate 		if (!isLoSurrogate(lo)){
162*0Sstevel@tonic-gate 		    croak("%"SVf":Malformed LO surrogate %"UVxf,
163*0Sstevel@tonic-gate 			  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
164*0Sstevel@tonic-gate 			  ord);
165*0Sstevel@tonic-gate 		}
166*0Sstevel@tonic-gate 		ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
167*0Sstevel@tonic-gate 	    }
168*0Sstevel@tonic-gate 	}
169*0Sstevel@tonic-gate 	d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
170*0Sstevel@tonic-gate 	d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
171*0Sstevel@tonic-gate 	SvCUR_set(result,d - (U8 *)SvPVX(result));
172*0Sstevel@tonic-gate     }
173*0Sstevel@tonic-gate     if (s < e) {
174*0Sstevel@tonic-gate 	/* unlikely to happen because it's fixed-length -- dankogai */
175*0Sstevel@tonic-gate 	if (check & ENCODE_WARN_ON_ERR){
176*0Sstevel@tonic-gate 	    Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
177*0Sstevel@tonic-gate 			*hv_fetch((HV *)SvRV(obj),"Name",4,0));
178*0Sstevel@tonic-gate 	}
179*0Sstevel@tonic-gate     }
180*0Sstevel@tonic-gate     if (check && !(check & ENCODE_LEAVE_SRC)){
181*0Sstevel@tonic-gate 	if (s < e) {
182*0Sstevel@tonic-gate 	    Move(s,SvPVX(str),e-s,U8);
183*0Sstevel@tonic-gate 	    SvCUR_set(str,(e-s));
184*0Sstevel@tonic-gate 	}
185*0Sstevel@tonic-gate 	else {
186*0Sstevel@tonic-gate 	    SvCUR_set(str,0);
187*0Sstevel@tonic-gate 	}
188*0Sstevel@tonic-gate 	*SvEND(str) = '\0';
189*0Sstevel@tonic-gate     }
190*0Sstevel@tonic-gate     XSRETURN(1);
191*0Sstevel@tonic-gate }
192*0Sstevel@tonic-gate 
193*0Sstevel@tonic-gate void
194*0Sstevel@tonic-gate encode_xs(obj, utf8, check = 0)
195*0Sstevel@tonic-gate SV *	obj
196*0Sstevel@tonic-gate SV *	utf8
197*0Sstevel@tonic-gate IV	check
198*0Sstevel@tonic-gate CODE:
199*0Sstevel@tonic-gate {
200*0Sstevel@tonic-gate     U8 endian   = *((U8 *)SvPV_nolen(attr("endian", 6)));
201*0Sstevel@tonic-gate     int size    =   SvIV(attr("size",   4));
202*0Sstevel@tonic-gate     int ucs2    = SvTRUE(attr("ucs2",   4));
203*0Sstevel@tonic-gate     int clone   = SvTRUE(attr("clone",  5));
204*0Sstevel@tonic-gate     SV *result  = newSVpvn("",0);
205*0Sstevel@tonic-gate     STRLEN ulen;
206*0Sstevel@tonic-gate     U8 *s = (U8 *)SvPVutf8(utf8,ulen);
207*0Sstevel@tonic-gate     U8 *e = (U8 *)SvEND(utf8);
208*0Sstevel@tonic-gate     ST(0) = sv_2mortal(result);
209*0Sstevel@tonic-gate     if (!endian) {
210*0Sstevel@tonic-gate 	endian = (size == 4) ? 'N' : 'n';
211*0Sstevel@tonic-gate 	enc_pack(aTHX_ result,size,endian,BOM_BE);
212*0Sstevel@tonic-gate #if 1
213*0Sstevel@tonic-gate 	/* Update endian for next sequence */
214*0Sstevel@tonic-gate 	if (clone){
215*0Sstevel@tonic-gate 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
216*0Sstevel@tonic-gate 	}
217*0Sstevel@tonic-gate #endif
218*0Sstevel@tonic-gate     }
219*0Sstevel@tonic-gate     while (s < e && s+UTF8SKIP(s) <= e) {
220*0Sstevel@tonic-gate 	STRLEN len;
221*0Sstevel@tonic-gate 	UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
222*0Sstevel@tonic-gate         s += len;
223*0Sstevel@tonic-gate 	if (size != 4 && invalid_ucs2(ord)) {
224*0Sstevel@tonic-gate 	    if (!issurrogate(ord)){
225*0Sstevel@tonic-gate 		if (ucs2) {
226*0Sstevel@tonic-gate 		    if (check) {
227*0Sstevel@tonic-gate 			croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
228*0Sstevel@tonic-gate 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
229*0Sstevel@tonic-gate 		    }
230*0Sstevel@tonic-gate 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
231*0Sstevel@tonic-gate 		}else{
232*0Sstevel@tonic-gate 		    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
233*0Sstevel@tonic-gate 		    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
234*0Sstevel@tonic-gate 		    enc_pack(aTHX_ result,size,endian,hi);
235*0Sstevel@tonic-gate 		    enc_pack(aTHX_ result,size,endian,lo);
236*0Sstevel@tonic-gate 		}
237*0Sstevel@tonic-gate 	    }
238*0Sstevel@tonic-gate 	    else {
239*0Sstevel@tonic-gate 		/* not supposed to happen */
240*0Sstevel@tonic-gate 		enc_pack(aTHX_ result,size,endian,FBCHAR);
241*0Sstevel@tonic-gate 	    }
242*0Sstevel@tonic-gate 	}
243*0Sstevel@tonic-gate 	else {
244*0Sstevel@tonic-gate 	    enc_pack(aTHX_ result,size,endian,ord);
245*0Sstevel@tonic-gate 	}
246*0Sstevel@tonic-gate     }
247*0Sstevel@tonic-gate     if (s < e) {
248*0Sstevel@tonic-gate 	/* UTF-8 partial char happens often on PerlIO.
249*0Sstevel@tonic-gate 	   Since this is okay and normal, we do not warn.
250*0Sstevel@tonic-gate 	   But this is critical when you choose to LEAVE_SRC
251*0Sstevel@tonic-gate 	   in which case we die */
252*0Sstevel@tonic-gate 	if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){
253*0Sstevel@tonic-gate 	    Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
254*0Sstevel@tonic-gate 		       "when CHECK = 0x%" UVuf,
255*0Sstevel@tonic-gate 		       *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
256*0Sstevel@tonic-gate 	}
257*0Sstevel@tonic-gate 
258*0Sstevel@tonic-gate     }
259*0Sstevel@tonic-gate     if (check && !(check & ENCODE_LEAVE_SRC)){
260*0Sstevel@tonic-gate 	if (s < e) {
261*0Sstevel@tonic-gate 	    Move(s,SvPVX(utf8),e-s,U8);
262*0Sstevel@tonic-gate 	    SvCUR_set(utf8,(e-s));
263*0Sstevel@tonic-gate 	}
264*0Sstevel@tonic-gate 	else {
265*0Sstevel@tonic-gate 	    SvCUR_set(utf8,0);
266*0Sstevel@tonic-gate 	}
267*0Sstevel@tonic-gate 	*SvEND(utf8) = '\0';
268*0Sstevel@tonic-gate     }
269*0Sstevel@tonic-gate     XSRETURN(1);
270*0Sstevel@tonic-gate }
271*0Sstevel@tonic-gate 
272