1*0Sstevel@tonic-gate /*
2*0Sstevel@tonic-gate Data structures for encoding transformations.
3*0Sstevel@tonic-gate
4*0Sstevel@tonic-gate Perl works internally in either a native 'byte' encoding or
5*0Sstevel@tonic-gate in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t"
6*0Sstevel@tonic-gate representation. When we do we can use utf8_to_uv().
7*0Sstevel@tonic-gate
8*0Sstevel@tonic-gate Most character encodings are either simple byte mappings or
9*0Sstevel@tonic-gate variable length multi-byte encodings. UTF-8 can be viewed as a
10*0Sstevel@tonic-gate rather extreme case of the latter.
11*0Sstevel@tonic-gate
12*0Sstevel@tonic-gate So to solve an important part of perl's encode needs we need to solve the
13*0Sstevel@tonic-gate "multi-byte -> multi-byte" case. The simple byte forms are then just a degenerate
14*0Sstevel@tonic-gate case. (One of the two multi-byte encodings will usually be UTF-8.)
15*0Sstevel@tonic-gate
16*0Sstevel@tonic-gate The other type of encoding is a shift encoding where a prefix sequence
17*0Sstevel@tonic-gate determines what subsequent bytes mean. Such encodings have state.
18*0Sstevel@tonic-gate
19*0Sstevel@tonic-gate We also need to handle the case where a character in one encoding has to be
20*0Sstevel@tonic-gate represented as multiple characters in the other, e.g. letter+diacritic.
21*0Sstevel@tonic-gate
22*0Sstevel@tonic-gate The process can be considered as pseudo perl:
23*0Sstevel@tonic-gate
24*0Sstevel@tonic-gate my $dst = '';
25*0Sstevel@tonic-gate while (length($src))
26*0Sstevel@tonic-gate {
27*0Sstevel@tonic-gate my $size = src_count($src);
28*0Sstevel@tonic-gate my $in_seq = substr($src,0,$size,'');
29*0Sstevel@tonic-gate my $out_seq = $s2d_hash{$in_seq};
30*0Sstevel@tonic-gate if (defined $out_seq)
31*0Sstevel@tonic-gate {
32*0Sstevel@tonic-gate $dst .= $out_seq;
33*0Sstevel@tonic-gate }
34*0Sstevel@tonic-gate else
35*0Sstevel@tonic-gate {
36*0Sstevel@tonic-gate # an error condition
37*0Sstevel@tonic-gate }
38*0Sstevel@tonic-gate }
39*0Sstevel@tonic-gate return $dst;
40*0Sstevel@tonic-gate
41*0Sstevel@tonic-gate That has the following components:
42*0Sstevel@tonic-gate &src_count - a "rule" for how many bytes make up the next character in the
43*0Sstevel@tonic-gate source.
44*0Sstevel@tonic-gate %s2d_hash - a mapping from input sequences to output sequences
45*0Sstevel@tonic-gate
46*0Sstevel@tonic-gate The problem with that scheme is that it does not allow the output
47*0Sstevel@tonic-gate character repertoire to affect the characters considered from the
48*0Sstevel@tonic-gate input.
49*0Sstevel@tonic-gate
50*0Sstevel@tonic-gate So we use a "trie" representation which can also be considered
51*0Sstevel@tonic-gate a state machine:
52*0Sstevel@tonic-gate
53*0Sstevel@tonic-gate my $dst = '';
54*0Sstevel@tonic-gate my $seq = \@s2d_seq;
55*0Sstevel@tonic-gate my $next = \@s2d_next;
56*0Sstevel@tonic-gate while (length($src))
57*0Sstevel@tonic-gate {
58*0Sstevel@tonic-gate my $byte = substr($src,0,1,'');
59*0Sstevel@tonic-gate my $out_seq = $seq->[$byte];
60*0Sstevel@tonic-gate if (defined $out_seq)
61*0Sstevel@tonic-gate {
62*0Sstevel@tonic-gate $dst .= $out_seq;
63*0Sstevel@tonic-gate }
64*0Sstevel@tonic-gate else
65*0Sstevel@tonic-gate {
66*0Sstevel@tonic-gate # an error condition
67*0Sstevel@tonic-gate }
68*0Sstevel@tonic-gate ($next,$seq) = @$next->[$byte] if $next;
69*0Sstevel@tonic-gate }
70*0Sstevel@tonic-gate return $dst;
71*0Sstevel@tonic-gate
72*0Sstevel@tonic-gate There is now a pair of data structures to represent everything.
73*0Sstevel@tonic-gate It is valid for the output sequence at a particular point to
74*0Sstevel@tonic-gate be defined but zero length; that just means "don't know yet".
75*0Sstevel@tonic-gate For the single byte case there is no 'next' so new tables will be the same as
76*0Sstevel@tonic-gate the original tables. For a multi-byte case a prefix byte will flip to the tables
77*0Sstevel@tonic-gate for the next page (adding nothing to the output), then the tables for the page
78*0Sstevel@tonic-gate will provide the actual output and set tables back to original base page.
79*0Sstevel@tonic-gate
80*0Sstevel@tonic-gate This scheme can also handle shift encodings.
81*0Sstevel@tonic-gate
82*0Sstevel@tonic-gate A slight enhancement to the scheme also allows for look-ahead - if
83*0Sstevel@tonic-gate we add a flag to re-add the removed byte to the source we could handle
84*0Sstevel@tonic-gate a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS)
85*0Sstevel@tonic-gate ab -> a (and take b back please)
86*0Sstevel@tonic-gate
87*0Sstevel@tonic-gate */
88*0Sstevel@tonic-gate
89*0Sstevel@tonic-gate #include <EXTERN.h>
90*0Sstevel@tonic-gate #include <perl.h>
91*0Sstevel@tonic-gate #define U8 U8
92*0Sstevel@tonic-gate #include "encode.h"
93*0Sstevel@tonic-gate
94*0Sstevel@tonic-gate int
do_encode(encpage_t * enc,const U8 * src,STRLEN * slen,U8 * dst,STRLEN dlen,STRLEN * dout,int approx,const U8 * term,STRLEN tlen)95*0Sstevel@tonic-gate do_encode(encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
96*0Sstevel@tonic-gate STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
97*0Sstevel@tonic-gate {
98*0Sstevel@tonic-gate const U8 *s = src;
99*0Sstevel@tonic-gate const U8 *send = s + *slen;
100*0Sstevel@tonic-gate const U8 *last = s;
101*0Sstevel@tonic-gate U8 *d = dst;
102*0Sstevel@tonic-gate U8 *dend = d + dlen, *dlast = d;
103*0Sstevel@tonic-gate int code = 0;
104*0Sstevel@tonic-gate while (s < send) {
105*0Sstevel@tonic-gate encpage_t *e = enc;
106*0Sstevel@tonic-gate U8 byte = *s;
107*0Sstevel@tonic-gate while (byte > e->max)
108*0Sstevel@tonic-gate e++;
109*0Sstevel@tonic-gate if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
110*0Sstevel@tonic-gate const U8 *cend = s + (e->slen & 0x7f);
111*0Sstevel@tonic-gate if (cend <= send) {
112*0Sstevel@tonic-gate STRLEN n;
113*0Sstevel@tonic-gate if ((n = e->dlen)) {
114*0Sstevel@tonic-gate const U8 *out = e->seq + n * (byte - e->min);
115*0Sstevel@tonic-gate U8 *oend = d + n;
116*0Sstevel@tonic-gate if (dst) {
117*0Sstevel@tonic-gate if (oend <= dend) {
118*0Sstevel@tonic-gate while (d < oend)
119*0Sstevel@tonic-gate *d++ = *out++;
120*0Sstevel@tonic-gate }
121*0Sstevel@tonic-gate else {
122*0Sstevel@tonic-gate /* Out of space */
123*0Sstevel@tonic-gate code = ENCODE_NOSPACE;
124*0Sstevel@tonic-gate break;
125*0Sstevel@tonic-gate }
126*0Sstevel@tonic-gate }
127*0Sstevel@tonic-gate else
128*0Sstevel@tonic-gate d = oend;
129*0Sstevel@tonic-gate }
130*0Sstevel@tonic-gate enc = e->next;
131*0Sstevel@tonic-gate s++;
132*0Sstevel@tonic-gate if (s == cend) {
133*0Sstevel@tonic-gate if (approx && (e->slen & 0x80))
134*0Sstevel@tonic-gate code = ENCODE_FALLBACK;
135*0Sstevel@tonic-gate last = s;
136*0Sstevel@tonic-gate if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
137*0Sstevel@tonic-gate code = ENCODE_FOUND_TERM;
138*0Sstevel@tonic-gate break;
139*0Sstevel@tonic-gate }
140*0Sstevel@tonic-gate dlast = d;
141*0Sstevel@tonic-gate }
142*0Sstevel@tonic-gate }
143*0Sstevel@tonic-gate else {
144*0Sstevel@tonic-gate /* partial source character */
145*0Sstevel@tonic-gate code = ENCODE_PARTIAL;
146*0Sstevel@tonic-gate break;
147*0Sstevel@tonic-gate }
148*0Sstevel@tonic-gate }
149*0Sstevel@tonic-gate else {
150*0Sstevel@tonic-gate /* Cannot represent */
151*0Sstevel@tonic-gate code = ENCODE_NOREP;
152*0Sstevel@tonic-gate break;
153*0Sstevel@tonic-gate }
154*0Sstevel@tonic-gate }
155*0Sstevel@tonic-gate *slen = last - src;
156*0Sstevel@tonic-gate *dout = d - dst;
157*0Sstevel@tonic-gate return code;
158*0Sstevel@tonic-gate }
159