cpan/Encode/encengine.c

b39c5158Smillert/*
b39c5158SmillertData structures for encoding transformations.
b39c5158Smillert
b39c5158SmillertPerl works internally in either a native 'byte' encoding or
b39c5158Smillertin UTF-8 encoded Unicode.  We have no immediate need for a "wchar_t"
b39c5158Smillertrepresentation. When we do we can use utf8_to_uv().
b39c5158Smillert
b39c5158SmillertMost character encodings are either simple byte mappings or
b39c5158Smillertvariable length multi-byte encodings. UTF-8 can be viewed as a
b39c5158Smillertrather extreme case of the latter.
b39c5158Smillert
So to solve an important part of perl's encode needs we need to solve the
"multi-byte -> multi-byte" case. The simple byte forms are then just a
degenerate case. (Where one of the multi-byte encodings will usually be UTF-8.)
b39c5158Smillert
b39c5158SmillertThe other type of encoding is a shift encoding where a prefix sequence
b39c5158Smillertdetermines what subsequent bytes mean. Such encodings have state.
b39c5158Smillert
We also need to handle the case where a character in one encoding has to be
represented as multiple characters in the other, e.g. letter+diacritic.
b39c5158Smillert
b39c5158SmillertThe process can be considered as pseudo perl:
b39c5158Smillert
b39c5158Smillertmy $dst = '';
b39c5158Smillertwhile (length($src))
b39c5158Smillert {
*b46d8ef2Safresh1  my $size    = src_count($src);
b39c5158Smillert  my $in_seq  = substr($src,0,$size,'');
b39c5158Smillert  my $out_seq = $s2d_hash{$in_seq};
b39c5158Smillert  if (defined $out_seq)
b39c5158Smillert   {
b39c5158Smillert    $dst .= $out_seq;
b39c5158Smillert   }
b39c5158Smillert  else
b39c5158Smillert   {
b39c5158Smillert    # an error condition
b39c5158Smillert   }
b39c5158Smillert }
b39c5158Smillertreturn $dst;
b39c5158Smillert
b39c5158SmillertThat has the following components:
b39c5158Smillert &src_count - a "rule" for how many bytes make up the next character in the
b39c5158Smillert              source.
b39c5158Smillert %s2d_hash  - a mapping from input sequences to output sequences
b39c5158Smillert
b39c5158SmillertThe problem with that scheme is that it does not allow the output
b39c5158Smillertcharacter repertoire to affect the characters considered from the
b39c5158Smillertinput.
b39c5158Smillert
b39c5158SmillertSo we use a "trie" representation which can also be considered
b39c5158Smillerta state machine:
b39c5158Smillert
b39c5158Smillertmy $dst   = '';
b39c5158Smillertmy $seq   = \@s2d_seq;
b39c5158Smillertmy $next  = \@s2d_next;
b39c5158Smillertwhile (length($src))
b39c5158Smillert {
  my $byte    = substr($src,0,1,'');
b39c5158Smillert  my $out_seq = $seq->[$byte];
b39c5158Smillert  if (defined $out_seq)
b39c5158Smillert   {
b39c5158Smillert    $dst .= $out_seq;
b39c5158Smillert   }
b39c5158Smillert  else
b39c5158Smillert   {
b39c5158Smillert    # an error condition
b39c5158Smillert   }
  ($next,$seq) = @{$next->[$byte]} if $next;
b39c5158Smillert }
b39c5158Smillertreturn $dst;
b39c5158Smillert
b39c5158SmillertThere is now a pair of data structures to represent everything.
b39c5158SmillertIt is valid for output sequence at a particular point to
b39c5158Smillertbe defined but zero length, that just means "don't know yet".
b39c5158SmillertFor the single byte case there is no 'next' so new tables will be the same as
b39c5158Smillertthe original tables. For a multi-byte case a prefix byte will flip to the tables
b39c5158Smillertfor  the next page (adding nothing to the output), then the tables for the page
b39c5158Smillertwill provide the actual output and set tables back to original base page.
b39c5158Smillert
b39c5158SmillertThis scheme can also handle shift encodings.
b39c5158Smillert
A slight enhancement to the scheme also allows for look-ahead - if
we add a flag to re-add the removed byte to the source we could handle:
  a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS)
  ab -> a (and take b back please)
b39c5158Smillert
b39c5158Smillert*/
b39c5158Smillert
b8851fccSafresh1#define PERL_NO_GET_CONTEXT
b39c5158Smillert#include <EXTERN.h>
b39c5158Smillert#include <perl.h>
b39c5158Smillert#include "encode.h"
b39c5158Smillert
/*
 * do_encode - drive the source bytes through an encoding state machine.
 *
 *   enc     starting page; a page is an array of encpage_t range entries
 *           that is scanned forward until an entry with max >= byte is found
 *   src     input bytes
 *   slen    in:  number of bytes available at src
 *           out: number of bytes consumed, counted up to the end of the
 *                last complete input sequence
 *   dst     output buffer; must not be NULL
 *   dlen    capacity of dst in bytes
 *   dout    out: number of bytes written to dst
 *   approx  when non-zero, entries whose slen has bit 0x80 set are accepted
 *           and reported as ENCODE_FALLBACK
 *   term    optional terminator byte sequence, or NULL
 *   tlen    length of term
 *
 * Returns 0 when all input was consumed, otherwise one of ENCODE_NOSPACE,
 * ENCODE_PARTIAL, ENCODE_NOREP, ENCODE_FALLBACK or ENCODE_FOUND_TERM.
 *
 * If dst is NULL the function returns ENCODE_NOSPACE immediately and does
 * not write *slen or *dout.
 */
int
do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
      STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
{
    const U8 *s = src;
    const U8 *send = s + *slen;
    const U8 *last = s;
    U8 *d = dst;
    U8 *dend;
    U8 *dlast;
    int code = 0;

    if (!dst)
        return ENCODE_NOSPACE;

    /* Only do arithmetic on dst once it is known to be non-NULL. */
    dend = d + dlen;
    dlast = d;

    while (s < send) {
        const encpage_t *e = enc;
        const U8 byte = *s;
        STRLEN need;
        STRLEN n;

        /* Locate the range entry of the current page that covers byte. */
        while (byte > e->max)
            e++;

        if (byte < e->min || !e->slen || (!approx && (e->slen & 0x80))) {
            /* Cannot represent */
            code = ENCODE_NOREP;
            break;
        }

        /* Low 7 bits: source bytes still required, this byte included. */
        need = e->slen & 0x7f;
        if (need > (STRLEN)(send - s)) {
            /* partial source character */
            code = ENCODE_PARTIAL;
            break;
        }

        n = e->dlen;
        if (n) {
            const U8 *out = e->seq + n * (byte - e->min);
            U8 *oend;

            /* Compare lengths so no pointer past dend is ever formed. */
            if (n > (STRLEN)(dend - d)) {
                /* Out of space */
                code = ENCODE_NOSPACE;
                break;
            }
            oend = d + n;
            while (d < oend)
                *d++ = *out++;
        }

        enc = e->next;
        s++;

        /* need == 1 means this byte completed an input sequence. */
        if (need == 1) {
            if (approx && (e->slen & 0x80))
                code = ENCODE_FALLBACK;
            last = s;
            /* Stop when the output of this sequence equals the terminator. */
            if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
                code = ENCODE_FOUND_TERM;
                break;
            }
            dlast = d;
        }
    }

    *slen = last - src;
    *dout = d - dst;
    return code;
}