xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/encengine.c (revision b46d8ef224b95de1dddcd1f01c1ab482f0ab3778)
1b39c5158Smillert /*
2b39c5158Smillert Data structures for encoding transformations.
3b39c5158Smillert 
4b39c5158Smillert Perl works internally in either a native 'byte' encoding or
5b39c5158Smillert in UTF-8 encoded Unicode.  We have no immediate need for a "wchar_t"
6b39c5158Smillert representation. When we do we can use utf8_to_uv().
7b39c5158Smillert 
8b39c5158Smillert Most character encodings are either simple byte mappings or
9b39c5158Smillert variable length multi-byte encodings. UTF-8 can be viewed as a
10b39c5158Smillert rather extreme case of the latter.
11b39c5158Smillert 
12b39c5158Smillert So to solve an important part of perl's encode needs we need to solve the
13b39c5158Smillert "multi-byte -> multi-byte" case. The simple byte forms are then just degenerate
14b39c5158Smillert case. (Where one of multi-bytes will usually be UTF-8.)
15b39c5158Smillert 
16b39c5158Smillert The other type of encoding is a shift encoding where a prefix sequence
17b39c5158Smillert determines what subsequent bytes mean. Such encodings have state.
18b39c5158Smillert 
19b39c5158Smillert We also need to handle case where a character in one encoding has to be
20b39c5158Smillert represented as multiple characters in the other. e.g. letter+diacritic.
21b39c5158Smillert 
22b39c5158Smillert The process can be considered as pseudo perl:
23b39c5158Smillert 
24b39c5158Smillert my $dst = '';
25b39c5158Smillert while (length($src))
26b39c5158Smillert  {
27*b46d8ef2Safresh1   my $size    = src_count($src);
28b39c5158Smillert   my $in_seq  = substr($src,0,$size,'');
29b39c5158Smillert   my $out_seq = $s2d_hash{$in_seq};
30b39c5158Smillert   if (defined $out_seq)
31b39c5158Smillert    {
32b39c5158Smillert     $dst .= $out_seq;
33b39c5158Smillert    }
34b39c5158Smillert   else
35b39c5158Smillert    {
36b39c5158Smillert     # an error condition
37b39c5158Smillert    }
38b39c5158Smillert  }
39b39c5158Smillert return $dst;
40b39c5158Smillert 
41b39c5158Smillert That has the following components:
42b39c5158Smillert  &src_count - a "rule" for how many bytes make up the next character in the
43b39c5158Smillert               source.
44b39c5158Smillert  %s2d_hash  - a mapping from input sequences to output sequences
45b39c5158Smillert 
46b39c5158Smillert The problem with that scheme is that it does not allow the output
47b39c5158Smillert character repertoire to affect the characters considered from the
48b39c5158Smillert input.
49b39c5158Smillert 
50b39c5158Smillert So we use a "trie" representation which can also be considered
51b39c5158Smillert a state machine:
52b39c5158Smillert 
53b39c5158Smillert my $dst   = '';
54b39c5158Smillert my $seq   = \@s2d_seq;
55b39c5158Smillert my $next  = \@s2d_next;
56b39c5158Smillert while (length($src))
57b39c5158Smillert  {
58b39c5158Smillert   my $byte    = $substr($src,0,1,'');
59b39c5158Smillert   my $out_seq = $seq->[$byte];
60b39c5158Smillert   if (defined $out_seq)
61b39c5158Smillert    {
62b39c5158Smillert     $dst .= $out_seq;
63b39c5158Smillert    }
64b39c5158Smillert   else
65b39c5158Smillert    {
66b39c5158Smillert     # an error condition
67b39c5158Smillert    }
68b39c5158Smillert   ($next,$seq) = @$next->[$byte] if $next;
69b39c5158Smillert  }
70b39c5158Smillert return $dst;
71b39c5158Smillert 
72b39c5158Smillert There is now a pair of data structures to represent everything.
73b39c5158Smillert It is valid for output sequence at a particular point to
74b39c5158Smillert be defined but zero length, that just means "don't know yet".
75b39c5158Smillert For the single byte case there is no 'next' so new tables will be the same as
76b39c5158Smillert the original tables. For a multi-byte case a prefix byte will flip to the tables
77b39c5158Smillert for  the next page (adding nothing to the output), then the tables for the page
78b39c5158Smillert will provide the actual output and set tables back to original base page.
79b39c5158Smillert 
80b39c5158Smillert This scheme can also handle shift encodings.
81b39c5158Smillert 
82b39c5158Smillert A slight enhancement to the scheme also allows for look-ahead - if
83b39c5158Smillert we add a flag to re-add the removed byte to the source we could handle
846fb12b70Safresh1   a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS)
85b39c5158Smillert   ab -> a (and take b back please)
86b39c5158Smillert 
87b39c5158Smillert */
88b39c5158Smillert 
89b8851fccSafresh1 #define PERL_NO_GET_CONTEXT
90b39c5158Smillert #include <EXTERN.h>
91b39c5158Smillert #include <perl.h>
92b39c5158Smillert #include "encode.h"
93b39c5158Smillert 
94b39c5158Smillert int
do_encode(const encpage_t * enc,const U8 * src,STRLEN * slen,U8 * dst,STRLEN dlen,STRLEN * dout,int approx,const U8 * term,STRLEN tlen)95b39c5158Smillert do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
96b39c5158Smillert       STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
97b39c5158Smillert {
98b39c5158Smillert     const U8 *s = src;
99b39c5158Smillert     const U8 *send = s + *slen;
100b39c5158Smillert     const U8 *last = s;
101b39c5158Smillert     U8 *d = dst;
102b39c5158Smillert     U8 *dend = d + dlen, *dlast = d;
103b39c5158Smillert     int code = 0;
104*b46d8ef2Safresh1     if (!dst)
105*b46d8ef2Safresh1       return ENCODE_NOSPACE;
106b39c5158Smillert     while (s < send) {
107b39c5158Smillert         const encpage_t *e = enc;
108b39c5158Smillert         U8 byte = *s;
109b39c5158Smillert         while (byte > e->max)
110b39c5158Smillert             e++;
111b39c5158Smillert         if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
112b39c5158Smillert             const U8 *cend = s + (e->slen & 0x7f);
113b39c5158Smillert             if (cend <= send) {
114b39c5158Smillert                 STRLEN n;
115b39c5158Smillert                 if ((n = e->dlen)) {
116b39c5158Smillert                     const U8 *out = e->seq + n * (byte - e->min);
117b39c5158Smillert                     U8 *oend = d + n;
118b39c5158Smillert                     if (dst) {
119b39c5158Smillert                         if (oend <= dend) {
120b39c5158Smillert                             while (d < oend)
121b39c5158Smillert                                 *d++ = *out++;
122b39c5158Smillert                         }
123b39c5158Smillert                         else {
124b39c5158Smillert                             /* Out of space */
125b39c5158Smillert                             code = ENCODE_NOSPACE;
126b39c5158Smillert                             break;
127b39c5158Smillert                         }
128b39c5158Smillert                     }
129b39c5158Smillert                     else
130b39c5158Smillert                         d = oend;
131b39c5158Smillert                 }
132b39c5158Smillert                 enc = e->next;
133b39c5158Smillert                 s++;
134b39c5158Smillert                 if (s == cend) {
135b39c5158Smillert                     if (approx && (e->slen & 0x80))
136b39c5158Smillert                         code = ENCODE_FALLBACK;
137b39c5158Smillert                     last = s;
138b39c5158Smillert                     if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
139b39c5158Smillert                         code = ENCODE_FOUND_TERM;
140b39c5158Smillert                         break;
141b39c5158Smillert                     }
142b39c5158Smillert                     dlast = d;
143b39c5158Smillert                 }
144b39c5158Smillert             }
145b39c5158Smillert             else {
146b39c5158Smillert                 /* partial source character */
147b39c5158Smillert                 code = ENCODE_PARTIAL;
148b39c5158Smillert                 break;
149b39c5158Smillert             }
150b39c5158Smillert         }
151b39c5158Smillert         else {
152b39c5158Smillert             /* Cannot represent */
153b39c5158Smillert             code = ENCODE_NOREP;
154b39c5158Smillert             break;
155b39c5158Smillert         }
156b39c5158Smillert     }
157b39c5158Smillert     *slen = last - src;
158b39c5158Smillert     *dout = d - dst;
159b39c5158Smillert     return code;
160b39c5158Smillert }
161