xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/ext/Encode/encengine.c (revision 0:68f95e015346)
1*0Sstevel@tonic-gate /*
2*0Sstevel@tonic-gate Data structures for encoding transformations.
3*0Sstevel@tonic-gate 
4*0Sstevel@tonic-gate Perl works internally in either a native 'byte' encoding or
5*0Sstevel@tonic-gate in UTF-8 encoded Unicode.  We have no immediate need for a "wchar_t"
6*0Sstevel@tonic-gate representation. When we do we can use utf8_to_uv().
7*0Sstevel@tonic-gate 
8*0Sstevel@tonic-gate Most character encodings are either simple byte mappings or
9*0Sstevel@tonic-gate variable length multi-byte encodings. UTF-8 can be viewed as a
10*0Sstevel@tonic-gate rather extreme case of the latter.
11*0Sstevel@tonic-gate 
12*0Sstevel@tonic-gate So to solve an important part of perl's encode needs we need to solve the
13*0Sstevel@tonic-gate "multi-byte -> multi-byte" case. The simple byte forms are then just a
14*0Sstevel@tonic-gate degenerate case. (One of the multi-byte encodings will usually be UTF-8.)
15*0Sstevel@tonic-gate 
16*0Sstevel@tonic-gate The other type of encoding is a shift encoding where a prefix sequence
17*0Sstevel@tonic-gate determines what subsequent bytes mean. Such encodings have state.
18*0Sstevel@tonic-gate 
19*0Sstevel@tonic-gate We also need to handle the case where a character in one encoding has to be
20*0Sstevel@tonic-gate represented as multiple characters in the other, e.g. letter+diacritic.
21*0Sstevel@tonic-gate 
22*0Sstevel@tonic-gate The process can be considered as pseudo perl:
23*0Sstevel@tonic-gate 
24*0Sstevel@tonic-gate my $dst = '';
25*0Sstevel@tonic-gate while (length($src))
26*0Sstevel@tonic-gate  {
27*0Sstevel@tonic-gate   my $size    = $count($src);
28*0Sstevel@tonic-gate   my $in_seq  = substr($src,0,$size,'');
29*0Sstevel@tonic-gate   my $out_seq = $s2d_hash{$in_seq};
30*0Sstevel@tonic-gate   if (defined $out_seq)
31*0Sstevel@tonic-gate    {
32*0Sstevel@tonic-gate     $dst .= $out_seq;
33*0Sstevel@tonic-gate    }
34*0Sstevel@tonic-gate   else
35*0Sstevel@tonic-gate    {
36*0Sstevel@tonic-gate     # an error condition
37*0Sstevel@tonic-gate    }
38*0Sstevel@tonic-gate  }
39*0Sstevel@tonic-gate return $dst;
40*0Sstevel@tonic-gate 
41*0Sstevel@tonic-gate That has the following components:
42*0Sstevel@tonic-gate  &src_count - a "rule" for how many bytes make up the next character in the
43*0Sstevel@tonic-gate               source.
44*0Sstevel@tonic-gate  %s2d_hash  - a mapping from input sequences to output sequences
45*0Sstevel@tonic-gate 
46*0Sstevel@tonic-gate The problem with that scheme is that it does not allow the output
47*0Sstevel@tonic-gate character repertoire to affect the characters considered from the
48*0Sstevel@tonic-gate input.
49*0Sstevel@tonic-gate 
50*0Sstevel@tonic-gate So we use a "trie" representation which can also be considered
51*0Sstevel@tonic-gate a state machine:
52*0Sstevel@tonic-gate 
53*0Sstevel@tonic-gate my $dst   = '';
54*0Sstevel@tonic-gate my $seq   = \@s2d_seq;
55*0Sstevel@tonic-gate my $next  = \@s2d_next;
56*0Sstevel@tonic-gate while (length($src))
57*0Sstevel@tonic-gate  {
58*0Sstevel@tonic-gate   my $byte    = substr($src,0,1,'');
59*0Sstevel@tonic-gate   my $out_seq = $seq->[$byte];
60*0Sstevel@tonic-gate   if (defined $out_seq)
61*0Sstevel@tonic-gate    {
62*0Sstevel@tonic-gate     $dst .= $out_seq;
63*0Sstevel@tonic-gate    }
64*0Sstevel@tonic-gate   else
65*0Sstevel@tonic-gate    {
66*0Sstevel@tonic-gate     # an error condition
67*0Sstevel@tonic-gate    }
68*0Sstevel@tonic-gate   ($next,$seq) = @$next->[$byte] if $next;
69*0Sstevel@tonic-gate  }
70*0Sstevel@tonic-gate return $dst;
71*0Sstevel@tonic-gate 
72*0Sstevel@tonic-gate There is now a pair of data structures to represent everything.
73*0Sstevel@tonic-gate It is valid for the output sequence at a particular point to
74*0Sstevel@tonic-gate be defined but zero length; that just means "don't know yet".
75*0Sstevel@tonic-gate For the single byte case there is no 'next', so the new tables will be the
76*0Sstevel@tonic-gate same as the original tables. For a multi-byte case a prefix byte will flip to
77*0Sstevel@tonic-gate the tables for the next page (adding nothing to the output); the tables for that
78*0Sstevel@tonic-gate page then provide the actual output and set the tables back to the base page.
79*0Sstevel@tonic-gate 
80*0Sstevel@tonic-gate This scheme can also handle shift encodings.
81*0Sstevel@tonic-gate 
82*0Sstevel@tonic-gate A slight enhancement to the scheme also allows for look-ahead - if
83*0Sstevel@tonic-gate we add a flag to re-add the removed byte to the source we could handle
84*0Sstevel@tonic-gate   a" -> ä
85*0Sstevel@tonic-gate   ab -> a (and take b back please)
86*0Sstevel@tonic-gate 
87*0Sstevel@tonic-gate */
88*0Sstevel@tonic-gate 
89*0Sstevel@tonic-gate #include <EXTERN.h>
90*0Sstevel@tonic-gate #include <perl.h>
91*0Sstevel@tonic-gate #define U8 U8
92*0Sstevel@tonic-gate #include "encode.h"
93*0Sstevel@tonic-gate 
94*0Sstevel@tonic-gate int
do_encode(encpage_t * enc,const U8 * src,STRLEN * slen,U8 * dst,STRLEN dlen,STRLEN * dout,int approx,const U8 * term,STRLEN tlen)95*0Sstevel@tonic-gate do_encode(encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
96*0Sstevel@tonic-gate 	  STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
97*0Sstevel@tonic-gate {
98*0Sstevel@tonic-gate     const U8 *s = src;
99*0Sstevel@tonic-gate     const U8 *send = s + *slen;
100*0Sstevel@tonic-gate     const U8 *last = s;
101*0Sstevel@tonic-gate     U8 *d = dst;
102*0Sstevel@tonic-gate     U8 *dend = d + dlen, *dlast = d;
103*0Sstevel@tonic-gate     int code = 0;
104*0Sstevel@tonic-gate     while (s < send) {
105*0Sstevel@tonic-gate 	encpage_t *e = enc;
106*0Sstevel@tonic-gate 	U8 byte = *s;
107*0Sstevel@tonic-gate 	while (byte > e->max)
108*0Sstevel@tonic-gate 	    e++;
109*0Sstevel@tonic-gate 	if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
110*0Sstevel@tonic-gate 	    const U8 *cend = s + (e->slen & 0x7f);
111*0Sstevel@tonic-gate 	    if (cend <= send) {
112*0Sstevel@tonic-gate 		STRLEN n;
113*0Sstevel@tonic-gate 		if ((n = e->dlen)) {
114*0Sstevel@tonic-gate 		    const U8 *out = e->seq + n * (byte - e->min);
115*0Sstevel@tonic-gate 		    U8 *oend = d + n;
116*0Sstevel@tonic-gate 		    if (dst) {
117*0Sstevel@tonic-gate 			if (oend <= dend) {
118*0Sstevel@tonic-gate 			    while (d < oend)
119*0Sstevel@tonic-gate 				*d++ = *out++;
120*0Sstevel@tonic-gate 			}
121*0Sstevel@tonic-gate 			else {
122*0Sstevel@tonic-gate 			    /* Out of space */
123*0Sstevel@tonic-gate 			    code = ENCODE_NOSPACE;
124*0Sstevel@tonic-gate 			    break;
125*0Sstevel@tonic-gate 			}
126*0Sstevel@tonic-gate 		    }
127*0Sstevel@tonic-gate 		    else
128*0Sstevel@tonic-gate 			d = oend;
129*0Sstevel@tonic-gate 		}
130*0Sstevel@tonic-gate 		enc = e->next;
131*0Sstevel@tonic-gate 		s++;
132*0Sstevel@tonic-gate 		if (s == cend) {
133*0Sstevel@tonic-gate 		    if (approx && (e->slen & 0x80))
134*0Sstevel@tonic-gate 			code = ENCODE_FALLBACK;
135*0Sstevel@tonic-gate 		    last = s;
136*0Sstevel@tonic-gate 		    if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
137*0Sstevel@tonic-gate 		      code = ENCODE_FOUND_TERM;
138*0Sstevel@tonic-gate 		      break;
139*0Sstevel@tonic-gate 		    }
140*0Sstevel@tonic-gate 		    dlast = d;
141*0Sstevel@tonic-gate 		}
142*0Sstevel@tonic-gate 	    }
143*0Sstevel@tonic-gate 	    else {
144*0Sstevel@tonic-gate 		/* partial source character */
145*0Sstevel@tonic-gate 		code = ENCODE_PARTIAL;
146*0Sstevel@tonic-gate 		break;
147*0Sstevel@tonic-gate 	    }
148*0Sstevel@tonic-gate 	}
149*0Sstevel@tonic-gate 	else {
150*0Sstevel@tonic-gate 	    /* Cannot represent */
151*0Sstevel@tonic-gate 	    code = ENCODE_NOREP;
152*0Sstevel@tonic-gate 	    break;
153*0Sstevel@tonic-gate 	}
154*0Sstevel@tonic-gate     }
155*0Sstevel@tonic-gate     *slen = last - src;
156*0Sstevel@tonic-gate     *dout = d - dst;
157*0Sstevel@tonic-gate     return code;
158*0Sstevel@tonic-gate }
159