1 /*
2  $Id: Unicode.xs,v 1.9 2003/12/29 02:47:16 dankogai Exp dankogai $
3  */
4 
5 #define PERL_NO_GET_CONTEXT
6 #include "EXTERN.h"
7 #include "perl.h"
8 #include "XSUB.h"
9 #define U8 U8
10 #include "../Encode/encode.h"
11 
12 #define FBCHAR			0xFFFd
13 #define BOM_BE			0xFeFF
14 #define BOM16LE			0xFFFe
15 #define BOM32LE			0xFFFe0000
16 #define issurrogate(x)		(0xD800 <= (x)  && (x) <= 0xDFFF )
17 #define isHiSurrogate(x)	(0xD800 <= (x)  && (x) <  0xDC00 )
18 #define isLoSurrogate(x)	(0xDC00 <= (x)  && (x) <= 0xDFFF )
19 #define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )
20 
21 static UV
enc_unpack(pTHX_ U8 ** sp,U8 * e,STRLEN size,U8 endian)22 enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
23 {
24     U8 *s = *sp;
25     UV v = 0;
26     if (s+size > e) {
27 	croak("Partial character %c",(char) endian);
28     }
29     switch(endian) {
30     case 'N':
31 	v = *s++;
32 	v = (v << 8) | *s++;
33     case 'n':
34 	v = (v << 8) | *s++;
35 	v = (v << 8) | *s++;
36 	break;
37     case 'V':
38     case 'v':
39 	v |= *s++;
40 	v |= (*s++ << 8);
41 	if (endian == 'v')
42 	    break;
43 	v |= (*s++ << 16);
44 	v |= (*s++ << 24);
45 	break;
46     default:
47 	croak("Unknown endian %c",(char) endian);
48 	break;
49     }
50     *sp = s;
51     return v;
52 }
53 
54 void
enc_pack(pTHX_ SV * result,STRLEN size,U8 endian,UV value)55 enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
56 {
57     U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size+1);
58     switch(endian) {
59     case 'v':
60     case 'V':
61 	d += SvCUR(result);
62 	SvCUR_set(result,SvCUR(result)+size);
63 	while (size--) {
64 	    *d++ = (U8)(value & 0xFF);
65 	    value >>= 8;
66 	}
67 	break;
68     case 'n':
69     case 'N':
70 	SvCUR_set(result,SvCUR(result)+size);
71 	d += SvCUR(result);
72 	while (size--) {
73 	    *--d = (U8)(value & 0xFF);
74 	    value >>= 8;
75 	}
76 	break;
77     default:
78 	croak("Unknown endian %c",(char) endian);
79 	break;
80     }
81 }
82 
83 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
84 
85 PROTOTYPES: DISABLE
86 
87 #define attr(k, l)  (hv_exists((HV *)SvRV(obj),k,l) ? \
88     *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
89 
90 void
91 decode_xs(obj, str, check = 0)
92 SV *	obj
93 SV *	str
94 IV	check
95 CODE:
96 {
97     U8 endian   = *((U8 *)SvPV_nolen(attr("endian", 6)));
98     int size    =   SvIV(attr("size",   4));
99     int ucs2    = SvTRUE(attr("ucs2",   4));
100     int clone   = SvTRUE(attr("clone",  5));
101     SV *result  = newSVpvn("",0);
102     STRLEN ulen;
103     U8 *s = (U8 *)SvPVbyte(str,ulen);
104     U8 *e = (U8 *)SvEND(str);
105     ST(0) = sv_2mortal(result);
106     SvUTF8_on(result);
107 
108     if (!endian && s+size <= e) {
109 	UV bom;
110 	endian = (size == 4) ? 'N' : 'n';
111 	bom = enc_unpack(aTHX_ &s,e,size,endian);
112         if (bom != BOM_BE) {
113 	    if (bom == BOM16LE) {
114 		endian = 'v';
115 	    }
116 	    else if (bom == BOM32LE) {
117 		endian = 'V';
118 	    }
119 	    else {
120 		croak("%"SVf":Unrecognised BOM %"UVxf,
121                       *hv_fetch((HV *)SvRV(obj),"Name",4,0),
122 		      bom);
123 	    }
124 	}
125 #if 1
126 	/* Update endian for next sequence */
127 	if (clone) {
128 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
129 	}
130 #endif
131     }
132     while (s < e && s+size <= e) {
133 	UV ord = enc_unpack(aTHX_ &s,e,size,endian);
134 	U8 *d;
135 	if (size != 4 && invalid_ucs2(ord)) {
136 	    if (ucs2) {
137 		if (check) {
138 		    croak("%"SVf":no surrogates allowed %"UVxf,
139 			  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
140 			  ord);
141 		}
142 		if (s+size <= e) {
143                     /* skip the next one as well */
144 		    enc_unpack(aTHX_ &s,e,size,endian);
145 		}
146 		ord = FBCHAR;
147 	    }
148 	    else {
149 		UV lo;
150 		if (!isHiSurrogate(ord)) {
151 		    croak("%"SVf":Malformed HI surrogate %"UVxf,
152 			  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
153 			  ord);
154 		}
155 		if (s+size > e) {
156 		    /* Partial character */
157 		    s -= size;   /* back up to 1st half */
158 		    break;       /* And exit loop */
159 		}
160 		lo = enc_unpack(aTHX_ &s,e,size,endian);
161 		if (!isLoSurrogate(lo)){
162 		    croak("%"SVf":Malformed LO surrogate %"UVxf,
163 			  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
164 			  ord);
165 		}
166 		ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
167 	    }
168 	}
169 	d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
170 	d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
171 	SvCUR_set(result,d - (U8 *)SvPVX(result));
172     }
173     if (s < e) {
174 	/* unlikely to happen because it's fixed-length -- dankogai */
175 	if (check & ENCODE_WARN_ON_ERR){
176 	    Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
177 			*hv_fetch((HV *)SvRV(obj),"Name",4,0));
178 	}
179     }
180     if (check && !(check & ENCODE_LEAVE_SRC)){
181 	if (s < e) {
182 	    Move(s,SvPVX(str),e-s,U8);
183 	    SvCUR_set(str,(e-s));
184 	}
185 	else {
186 	    SvCUR_set(str,0);
187 	}
188 	*SvEND(str) = '\0';
189     }
190     XSRETURN(1);
191 }
192 
193 void
194 encode_xs(obj, utf8, check = 0)
195 SV *	obj
196 SV *	utf8
197 IV	check
198 CODE:
199 {
200     U8 endian   = *((U8 *)SvPV_nolen(attr("endian", 6)));
201     int size    =   SvIV(attr("size",   4));
202     int ucs2    = SvTRUE(attr("ucs2",   4));
203     int clone   = SvTRUE(attr("clone",  5));
204     SV *result  = newSVpvn("",0);
205     STRLEN ulen;
206     U8 *s = (U8 *)SvPVutf8(utf8,ulen);
207     U8 *e = (U8 *)SvEND(utf8);
208     ST(0) = sv_2mortal(result);
209     if (!endian) {
210 	endian = (size == 4) ? 'N' : 'n';
211 	enc_pack(aTHX_ result,size,endian,BOM_BE);
212 #if 1
213 	/* Update endian for next sequence */
214 	if (clone){
215 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
216 	}
217 #endif
218     }
219     while (s < e && s+UTF8SKIP(s) <= e) {
220 	STRLEN len;
221 	UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
222         s += len;
223 	if (size != 4 && invalid_ucs2(ord)) {
224 	    if (!issurrogate(ord)){
225 		if (ucs2) {
226 		    if (check) {
227 			croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
228 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
229 		    }
230 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
231 		}else{
232 		    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
233 		    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
234 		    enc_pack(aTHX_ result,size,endian,hi);
235 		    enc_pack(aTHX_ result,size,endian,lo);
236 		}
237 	    }
238 	    else {
239 		/* not supposed to happen */
240 		enc_pack(aTHX_ result,size,endian,FBCHAR);
241 	    }
242 	}
243 	else {
244 	    enc_pack(aTHX_ result,size,endian,ord);
245 	}
246     }
247     if (s < e) {
248 	/* UTF-8 partial char happens often on PerlIO.
249 	   Since this is okay and normal, we do not warn.
250 	   But this is critical when you choose to LEAVE_SRC
251 	   in which case we die */
252 	if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){
253 	    Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
254 		       "when CHECK = 0x%" UVuf,
255 		       *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
256 	}
257 
258     }
259     if (check && !(check & ENCODE_LEAVE_SRC)){
260 	if (s < e) {
261 	    Move(s,SvPVX(utf8),e-s,U8);
262 	    SvCUR_set(utf8,(e-s));
263 	}
264 	else {
265 	    SvCUR_set(utf8,0);
266 	}
267 	*SvEND(utf8) = '\0';
268     }
269     XSRETURN(1);
270 }
271 
272