1b39c5158Smillert /*
2b39c5158Smillert Data structures for encoding transformations.
3b39c5158Smillert
4b39c5158Smillert Perl works internally in either a native 'byte' encoding or
5b39c5158Smillert in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t"
6b39c5158Smillert representation. When we do we can use utf8_to_uv().
7b39c5158Smillert
8b39c5158Smillert Most character encodings are either simple byte mappings or
9b39c5158Smillert variable length multi-byte encodings. UTF-8 can be viewed as a
10b39c5158Smillert rather extreme case of the latter.
11b39c5158Smillert
12b39c5158Smillert So to solve an important part of perl's encode needs we need to solve the
13b39c5158Smillert "multi-byte -> multi-byte" case. The simple byte forms are then just degenerate
14b39c5158Smillert case. (Where one of multi-bytes will usually be UTF-8.)
15b39c5158Smillert
16b39c5158Smillert The other type of encoding is a shift encoding where a prefix sequence
17b39c5158Smillert determines what subsequent bytes mean. Such encodings have state.
18b39c5158Smillert
19b39c5158Smillert We also need to handle case where a character in one encoding has to be
20b39c5158Smillert represented as multiple characters in the other. e.g. letter+diacritic.
21b39c5158Smillert
22b39c5158Smillert The process can be considered as pseudo perl:
23b39c5158Smillert
24b39c5158Smillert my $dst = '';
25b39c5158Smillert while (length($src))
26b39c5158Smillert {
27*b46d8ef2Safresh1 my $size = src_count($src);
28b39c5158Smillert my $in_seq = substr($src,0,$size,'');
29b39c5158Smillert my $out_seq = $s2d_hash{$in_seq};
30b39c5158Smillert if (defined $out_seq)
31b39c5158Smillert {
32b39c5158Smillert $dst .= $out_seq;
33b39c5158Smillert }
34b39c5158Smillert else
35b39c5158Smillert {
36b39c5158Smillert # an error condition
37b39c5158Smillert }
38b39c5158Smillert }
39b39c5158Smillert return $dst;
40b39c5158Smillert
41b39c5158Smillert That has the following components:
42b39c5158Smillert &src_count - a "rule" for how many bytes make up the next character in the
43b39c5158Smillert source.
44b39c5158Smillert %s2d_hash - a mapping from input sequences to output sequences
45b39c5158Smillert
46b39c5158Smillert The problem with that scheme is that it does not allow the output
47b39c5158Smillert character repertoire to affect the characters considered from the
48b39c5158Smillert input.
49b39c5158Smillert
50b39c5158Smillert So we use a "trie" representation which can also be considered
51b39c5158Smillert a state machine:
52b39c5158Smillert
53b39c5158Smillert my $dst = '';
54b39c5158Smillert my $seq = \@s2d_seq;
55b39c5158Smillert my $next = \@s2d_next;
56b39c5158Smillert while (length($src))
57b39c5158Smillert {
58b39c5158Smillert my $byte = $substr($src,0,1,'');
59b39c5158Smillert my $out_seq = $seq->[$byte];
60b39c5158Smillert if (defined $out_seq)
61b39c5158Smillert {
62b39c5158Smillert $dst .= $out_seq;
63b39c5158Smillert }
64b39c5158Smillert else
65b39c5158Smillert {
66b39c5158Smillert # an error condition
67b39c5158Smillert }
68b39c5158Smillert ($next,$seq) = @$next->[$byte] if $next;
69b39c5158Smillert }
70b39c5158Smillert return $dst;
71b39c5158Smillert
72b39c5158Smillert There is now a pair of data structures to represent everything.
73b39c5158Smillert It is valid for output sequence at a particular point to
74b39c5158Smillert be defined but zero length, that just means "don't know yet".
75b39c5158Smillert For the single byte case there is no 'next' so new tables will be the same as
76b39c5158Smillert the original tables. For a multi-byte case a prefix byte will flip to the tables
77b39c5158Smillert for the next page (adding nothing to the output), then the tables for the page
78b39c5158Smillert will provide the actual output and set tables back to original base page.
79b39c5158Smillert
80b39c5158Smillert This scheme can also handle shift encodings.
81b39c5158Smillert
82b39c5158Smillert A slight enhancement to the scheme also allows for look-ahead - if
83b39c5158Smillert we add a flag to re-add the removed byte to the source we could handle
846fb12b70Safresh1 a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS)
85b39c5158Smillert ab -> a (and take b back please)
86b39c5158Smillert
87b39c5158Smillert */
88b39c5158Smillert
89b8851fccSafresh1 #define PERL_NO_GET_CONTEXT
90b39c5158Smillert #include <EXTERN.h>
91b39c5158Smillert #include <perl.h>
92b39c5158Smillert #include "encode.h"
93b39c5158Smillert
94b39c5158Smillert int
do_encode(const encpage_t * enc,const U8 * src,STRLEN * slen,U8 * dst,STRLEN dlen,STRLEN * dout,int approx,const U8 * term,STRLEN tlen)95b39c5158Smillert do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
96b39c5158Smillert STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
97b39c5158Smillert {
98b39c5158Smillert const U8 *s = src;
99b39c5158Smillert const U8 *send = s + *slen;
100b39c5158Smillert const U8 *last = s;
101b39c5158Smillert U8 *d = dst;
102b39c5158Smillert U8 *dend = d + dlen, *dlast = d;
103b39c5158Smillert int code = 0;
104*b46d8ef2Safresh1 if (!dst)
105*b46d8ef2Safresh1 return ENCODE_NOSPACE;
106b39c5158Smillert while (s < send) {
107b39c5158Smillert const encpage_t *e = enc;
108b39c5158Smillert U8 byte = *s;
109b39c5158Smillert while (byte > e->max)
110b39c5158Smillert e++;
111b39c5158Smillert if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
112b39c5158Smillert const U8 *cend = s + (e->slen & 0x7f);
113b39c5158Smillert if (cend <= send) {
114b39c5158Smillert STRLEN n;
115b39c5158Smillert if ((n = e->dlen)) {
116b39c5158Smillert const U8 *out = e->seq + n * (byte - e->min);
117b39c5158Smillert U8 *oend = d + n;
118b39c5158Smillert if (dst) {
119b39c5158Smillert if (oend <= dend) {
120b39c5158Smillert while (d < oend)
121b39c5158Smillert *d++ = *out++;
122b39c5158Smillert }
123b39c5158Smillert else {
124b39c5158Smillert /* Out of space */
125b39c5158Smillert code = ENCODE_NOSPACE;
126b39c5158Smillert break;
127b39c5158Smillert }
128b39c5158Smillert }
129b39c5158Smillert else
130b39c5158Smillert d = oend;
131b39c5158Smillert }
132b39c5158Smillert enc = e->next;
133b39c5158Smillert s++;
134b39c5158Smillert if (s == cend) {
135b39c5158Smillert if (approx && (e->slen & 0x80))
136b39c5158Smillert code = ENCODE_FALLBACK;
137b39c5158Smillert last = s;
138b39c5158Smillert if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
139b39c5158Smillert code = ENCODE_FOUND_TERM;
140b39c5158Smillert break;
141b39c5158Smillert }
142b39c5158Smillert dlast = d;
143b39c5158Smillert }
144b39c5158Smillert }
145b39c5158Smillert else {
146b39c5158Smillert /* partial source character */
147b39c5158Smillert code = ENCODE_PARTIAL;
148b39c5158Smillert break;
149b39c5158Smillert }
150b39c5158Smillert }
151b39c5158Smillert else {
152b39c5158Smillert /* Cannot represent */
153b39c5158Smillert code = ENCODE_NOREP;
154b39c5158Smillert break;
155b39c5158Smillert }
156b39c5158Smillert }
157b39c5158Smillert *slen = last - src;
158b39c5158Smillert *dout = d - dst;
159b39c5158Smillert return code;
160b39c5158Smillert }
161