ext/Encode/encengine.c

*0Sstevel@tonic-gate/*
*0Sstevel@tonic-gateData structures for encoding transformations.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gatePerl works internally in either a native 'byte' encoding or
*0Sstevel@tonic-gatein UTF-8 encoded Unicode.  We have no immediate need for a "wchar_t"
*0Sstevel@tonic-gaterepresentation. When we do we can use utf8_to_uv().
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateMost character encodings are either simple byte mappings or
*0Sstevel@tonic-gatevariable length multi-byte encodings. UTF-8 can be viewed as a
*0Sstevel@tonic-gaterather extreme case of the latter.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateSo to solve an important part of perl's encode needs we need to solve the
*0Sstevel@tonic-gate"multi-byte -> multi-byte" case. The simple byte forms are then just a
*0Sstevel@tonic-gatedegenerate case. (One of the multi-byte encodings will usually be UTF-8.)
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe other type of encoding is a shift encoding where a prefix sequence
*0Sstevel@tonic-gatedetermines what subsequent bytes mean. Such encodings have state.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateWe also need to handle the case where a character in one encoding has to be
*0Sstevel@tonic-gaterepresented as multiple characters in the other. e.g. letter+diacritic.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe process can be considered as pseudo perl:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gatemy $dst = '';
*0Sstevel@tonic-gatewhile (length($src))
*0Sstevel@tonic-gate {
*0Sstevel@tonic-gate  my $size    = &src_count($src);
*0Sstevel@tonic-gate  my $in_seq  = substr($src,0,$size,'');
*0Sstevel@tonic-gate  my $out_seq = $s2d_hash{$in_seq};
*0Sstevel@tonic-gate  if (defined $out_seq)
*0Sstevel@tonic-gate   {
*0Sstevel@tonic-gate    $dst .= $out_seq;
*0Sstevel@tonic-gate   }
*0Sstevel@tonic-gate  else
*0Sstevel@tonic-gate   {
*0Sstevel@tonic-gate    # an error condition
*0Sstevel@tonic-gate   }
*0Sstevel@tonic-gate }
*0Sstevel@tonic-gatereturn $dst;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThat has the following components:
*0Sstevel@tonic-gate &src_count - a "rule" for how many bytes make up the next character in the
*0Sstevel@tonic-gate              source.
*0Sstevel@tonic-gate %s2d_hash  - a mapping from input sequences to output sequences
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe problem with that scheme is that it does not allow the output
*0Sstevel@tonic-gatecharacter repertoire to affect the characters considered from the
*0Sstevel@tonic-gateinput.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateSo we use a "trie" representation which can also be considered
*0Sstevel@tonic-gatea state machine:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gatemy $dst   = '';
*0Sstevel@tonic-gatemy $seq   = \@s2d_seq;
*0Sstevel@tonic-gatemy $next  = \@s2d_next;
*0Sstevel@tonic-gatewhile (length($src))
*0Sstevel@tonic-gate {
*0Sstevel@tonic-gate  my $byte    = substr($src,0,1,'');
*0Sstevel@tonic-gate  my $out_seq = $seq->[$byte];
*0Sstevel@tonic-gate  if (defined $out_seq)
*0Sstevel@tonic-gate   {
*0Sstevel@tonic-gate    $dst .= $out_seq;
*0Sstevel@tonic-gate   }
*0Sstevel@tonic-gate  else
*0Sstevel@tonic-gate   {
*0Sstevel@tonic-gate    # an error condition
*0Sstevel@tonic-gate   }
*0Sstevel@tonic-gate  ($next,$seq) = @$next->[$byte] if $next;
*0Sstevel@tonic-gate }
*0Sstevel@tonic-gatereturn $dst;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThere is now a pair of data structures to represent everything.
*0Sstevel@tonic-gateIt is valid for output sequence at a particular point to
*0Sstevel@tonic-gatebe defined but zero length, that just means "don't know yet".
*0Sstevel@tonic-gateFor the single byte case there is no 'next' so new tables will be the same as
*0Sstevel@tonic-gatethe original tables. For a multi-byte case a prefix byte will flip to the tables
*0Sstevel@tonic-gatefor  the next page (adding nothing to the output), then the tables for the page
*0Sstevel@tonic-gatewill provide the actual output and set tables back to original base page.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThis scheme can also handle shift encodings.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateA slight enhancement to the scheme also allows for look-ahead - if
*0Sstevel@tonic-gatewe add a flag to re-add the removed byte to the source we could handle
*0Sstevel@tonic-gate  a" -> ä
*0Sstevel@tonic-gate  ab -> a (and take b back please)
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate*/
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate#include <EXTERN.h>
*0Sstevel@tonic-gate#include <perl.h>
*0Sstevel@tonic-gate#define U8 U8
*0Sstevel@tonic-gate#include "encode.h"
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateint
*0Sstevel@tonic-gatedo_encode(encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
*0Sstevel@tonic-gate	  STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
*0Sstevel@tonic-gate{
*0Sstevel@tonic-gate    const U8 *s = src;
*0Sstevel@tonic-gate    const U8 *send = s + *slen;
*0Sstevel@tonic-gate    const U8 *last = s;
*0Sstevel@tonic-gate    U8 *d = dst;
*0Sstevel@tonic-gate    U8 *dend = d + dlen, *dlast = d;
*0Sstevel@tonic-gate    int code = 0;
*0Sstevel@tonic-gate    while (s < send) {
*0Sstevel@tonic-gate	encpage_t *e = enc;
*0Sstevel@tonic-gate	U8 byte = *s;
*0Sstevel@tonic-gate	while (byte > e->max)
*0Sstevel@tonic-gate	    e++;
*0Sstevel@tonic-gate	if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
*0Sstevel@tonic-gate	    const U8 *cend = s + (e->slen & 0x7f);
*0Sstevel@tonic-gate	    if (cend <= send) {
*0Sstevel@tonic-gate		STRLEN n;
*0Sstevel@tonic-gate		if ((n = e->dlen)) {
*0Sstevel@tonic-gate		    const U8 *out = e->seq + n * (byte - e->min);
*0Sstevel@tonic-gate		    U8 *oend = d + n;
*0Sstevel@tonic-gate		    if (dst) {
*0Sstevel@tonic-gate			if (oend <= dend) {
*0Sstevel@tonic-gate			    while (d < oend)
*0Sstevel@tonic-gate				*d++ = *out++;
*0Sstevel@tonic-gate			}
*0Sstevel@tonic-gate			else {
*0Sstevel@tonic-gate			    /* Out of space */
*0Sstevel@tonic-gate			    code = ENCODE_NOSPACE;
*0Sstevel@tonic-gate			    break;
*0Sstevel@tonic-gate			}
*0Sstevel@tonic-gate		    }
*0Sstevel@tonic-gate		    else
*0Sstevel@tonic-gate			d = oend;
*0Sstevel@tonic-gate		}
*0Sstevel@tonic-gate		enc = e->next;
*0Sstevel@tonic-gate		s++;
*0Sstevel@tonic-gate		if (s == cend) {
*0Sstevel@tonic-gate		    if (approx && (e->slen & 0x80))
*0Sstevel@tonic-gate			code = ENCODE_FALLBACK;
*0Sstevel@tonic-gate		    last = s;
*0Sstevel@tonic-gate		    if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
*0Sstevel@tonic-gate		      code = ENCODE_FOUND_TERM;
*0Sstevel@tonic-gate		      break;
*0Sstevel@tonic-gate		    }
*0Sstevel@tonic-gate		    dlast = d;
*0Sstevel@tonic-gate		}
*0Sstevel@tonic-gate	    }
*0Sstevel@tonic-gate	    else {
*0Sstevel@tonic-gate		/* partial source character */
*0Sstevel@tonic-gate		code = ENCODE_PARTIAL;
*0Sstevel@tonic-gate		break;
*0Sstevel@tonic-gate	    }
*0Sstevel@tonic-gate	}
*0Sstevel@tonic-gate	else {
*0Sstevel@tonic-gate	    /* Cannot represent */
*0Sstevel@tonic-gate	    code = ENCODE_NOREP;
*0Sstevel@tonic-gate	    break;
*0Sstevel@tonic-gate	}
*0Sstevel@tonic-gate    }
*0Sstevel@tonic-gate    *slen = last - src;
*0Sstevel@tonic-gate    *dout = d - dst;
*0Sstevel@tonic-gate    return code;
*0Sstevel@tonic-gate}