enc2xs (revision 0:68f95e015346) - OpenGrok cross reference for /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/ext/Encode/bin/enc2xs

*0Sstevel@tonic-gate#!./perl
*0Sstevel@tonic-gateBEGIN {
*0Sstevel@tonic-gate    # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's
*0Sstevel@tonic-gate    # with $ENV{PERL_CORE} set
*0Sstevel@tonic-gate    # In case we need it in future...
*0Sstevel@tonic-gate    require Config; import Config;
*0Sstevel@tonic-gate}
*0Sstevel@tonic-gateuse strict;
*0Sstevel@tonic-gateuse warnings;
*0Sstevel@tonic-gateuse Getopt::Std;
*0Sstevel@tonic-gatemy @orig_ARGV = @ARGV;
*0Sstevel@tonic-gateour $VERSION  = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
*0Sstevel@tonic-gate
# These may get re-ordered.
# RAW is a do_now as inserted by &enter
# AGG is an aggregated do_now, as built up by &process
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateuse constant {
*0Sstevel@tonic-gate  RAW_NEXT => 0,
*0Sstevel@tonic-gate  RAW_IN_LEN => 1,
*0Sstevel@tonic-gate  RAW_OUT_BYTES => 2,
*0Sstevel@tonic-gate  RAW_FALLBACK => 3,
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  AGG_MIN_IN => 0,
*0Sstevel@tonic-gate  AGG_MAX_IN => 1,
*0Sstevel@tonic-gate  AGG_OUT_BYTES => 2,
*0Sstevel@tonic-gate  AGG_NEXT => 3,
*0Sstevel@tonic-gate  AGG_IN_LEN => 4,
*0Sstevel@tonic-gate  AGG_OUT_LEN => 5,
*0Sstevel@tonic-gate  AGG_FALLBACK => 6,
*0Sstevel@tonic-gate};
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate# (See the algorithm in encengine.c - we're building structures for it)
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate# There are two sorts of structures.
*0Sstevel@tonic-gate# "do_now" (an array, two variants of what needs storing) is whatever we need
*0Sstevel@tonic-gate# to do now we've read an input byte.
*0Sstevel@tonic-gate# It's housed in a "do_next" (which is how we got to it), and in turn points
*0Sstevel@tonic-gate# to a "do_next" which contains all the "do_now"s for the next input byte.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate# There will be a "do_next" which is the start state.
*0Sstevel@tonic-gate# For a single byte encoding it's the only "do_next" - each "do_now" points
*0Sstevel@tonic-gate# back to it, and each "do_now" will cause bytes. There is no state.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate# For a multi-byte encoding where all characters in the input are the same
*0Sstevel@tonic-gate# length, then there will be a tree of "do_now"->"do_next"->"do_now"
*0Sstevel@tonic-gate# branching out from the start state, one step for each input byte.
*0Sstevel@tonic-gate# The leaf "do_now"s will all be at the same distance from the start state,
*0Sstevel@tonic-gate# only the leaf "do_now"s cause output bytes, and they in turn point back to
*0Sstevel@tonic-gate# the start state.
*0Sstevel@tonic-gate
# For an encoding where there are variable length input byte sequences, you
# will encounter a leaf "do_now" sooner for the shorter input sequences, but
# as before the leaves will point back to the start state.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate# The system will cope with escape encodings (imagine them as a mostly
*0Sstevel@tonic-gate# self-contained tree for each escape state, and cross links between trees
*0Sstevel@tonic-gate# at the state-switching characters) but so far no input format defines these.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate# The system will also cope with having output "leaves" in the middle of
*0Sstevel@tonic-gate# the bifurcating branches, not just at the extremities, but again no
*0Sstevel@tonic-gate# input format does this yet.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate# There are two variants of the "do_now" structure. The first, smaller variant
*0Sstevel@tonic-gate# is generated by &enter as the input file is read. There is one structure
*0Sstevel@tonic-gate# for each input byte. Say we are mapping a single byte encoding to a
*0Sstevel@tonic-gate# single byte encoding, with  "ABCD" going "abcd". There will be
*0Sstevel@tonic-gate# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
*0Sstevel@tonic-gate
# &process then walks the tree, building aggregate "do_now" structures for
# adjacent bytes where possible. The aggregate is for a contiguous range of
# bytes which each produce the same length of output, each move to the
# same next state, and each have the same fallback flag.
*0Sstevel@tonic-gate# So our 4 RAW "do_now"s above become replaced by a single structure
*0Sstevel@tonic-gate# containing:
*0Sstevel@tonic-gate# ["A", "D", "abcd", 1, ...]
*0Sstevel@tonic-gate# ie, for an input byte $_ in "A".."D", output 1 byte, found as
*0Sstevel@tonic-gate# substr ("abcd", (ord $_ - ord "A") * 1, 1)
*0Sstevel@tonic-gate# which maps very nicely into pointer arithmetic in C for encengine.c
*0Sstevel@tonic-gate
sub encode_U
{
 # Turn one Unicode code point (as found in a mapping table) into its
 # UTF-8 byte sequence.
 my ($code_point) = @_;
 # chr() works on native code points, so translate the table's Unicode
 # value into the native character set first.
 my $octets = chr utf8::unicode_to_native($code_point);
 # Let core perl produce the UTF-8 bytes in place.
 utf8::encode($octets);
 return $octets;
}
*0Sstevel@tonic-gate
sub encode_S
{
 # Single-byte encoder: the character code is the byte; any page
 # argument passed by the caller is ignored.
 my ($ch) = @_;
 return chr($ch);
}
*0Sstevel@tonic-gate
sub encode_D
{
 # Double-byte encoder, most significant byte first: the page number
 # becomes the lead byte and the character code the trail byte.
 my ($ch,$page) = @_;
 return join('', chr($page), chr($ch));
}
*0Sstevel@tonic-gate
sub encode_M
{
 # Multi-byte encoder: a true page number yields page byte + character
 # byte, otherwise only the single character byte is produced.
 my ($ch,$page) = @_;
 return $page ? chr($page) . chr($ch)
              : chr($ch);
}
*0Sstevel@tonic-gate
# Encoder for each type letter found on the first line of a .enc file.
my %encode_types = (U => \&encode_U,
                    S => \&encode_S,
                    D => \&encode_D,
                    M => \&encode_M,
                   );

# Win32 does not expand globs on command line
eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');

my %opt;
# I think these are:
# -Q to disable the duplicate codepoint test
# -S make mapping errors fatal
# -q to remove comments written to output files
# -O to enable the (brute force) substring optimiser
# -o <output> to specify the output file name (else it's the first arg)
# -f <inlist> to give a file with a list of input files (else use the args)
# -n <name> to name the encoding (else use the basename of the input file).
getopts('CM:SQqOo:f:n:',\%opt);

$opt{M} and make_makefile_pl($opt{M}, @ARGV);
$opt{C} and make_configlocal_pm($opt{C}, @ARGV);

# This really should go first, else the die here causes empty (non-erroneous)
# output files to be written.
my @encfiles;
if (exists $opt{'f'}) {
    # -f is followed by name of file containing list of filenames
    my $flist = $opt{'f'};
    open(FLIST,'<',$flist) || die "Cannot open $flist:$!";
    chomp(@encfiles = <FLIST>);
    close(FLIST);
} else {
    @encfiles = @ARGV;
}

# Without -o the first argument names the output file.  It must then be
# removed from the input list as well (unless that list came from -f),
# otherwise the freshly truncated output file is compiled as an input table.
my $cname;
if (exists $opt{'o'}) {
    $cname = $opt{'o'};
} else {
    $cname = shift(@ARGV);
    shift(@encfiles) unless exists $opt{'f'};
}
die "No output file given (use -o <output> or pass it as the first argument)\n"
    unless defined $cname && length $cname;

chmod(0666,$cname) if -f $cname && !-w $cname;
open(C,'>',$cname) || die "Cannot open $cname:$!";

my $dname = $cname;
my $hname = $cname;

my ($doC,$doEnc,$doUcm,$doPet);

if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
 {
  $doC = 1;
  # Companion files: <base>.exh (boot-time registration calls) and
  # <base>.h (extern declarations).  Each one is made writable only if it
  # already exists itself.
  $dname =~ s/(\.[^\.]*)?$/.exh/;
  chmod(0666,$dname) if -f $dname && !-w $dname;
  open(D,'>',$dname) || die "Cannot open $dname:$!";
  $hname =~ s/(\.[^\.]*)?$/.h/;
  chmod(0666,$hname) if -f $hname && !-w $hname;
  open(H,'>',$hname) || die "Cannot open $hname:$!";

  foreach my $fh (\*C,\*D,\*H)
  {
   print $fh <<"END" unless $opt{'q'};
/*
 !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 This file was autogenerated by:
 $^X $0 @orig_ARGV
*/
END
  }

  if ($cname =~ /(\w+)\.xs$/)
   {
    print C "#include <EXTERN.h>\n";
    print C "#include <perl.h>\n";
    print C "#include <XSUB.h>\n";
    print C "#define U8 U8\n";
   }
  print C "#include \"encode.h\"\n\n";

 }
elsif ($cname =~ /\.enc$/)
 {
  $doEnc = 1;
 }
elsif ($cname =~ /\.ucm$/)
 {
  $doUcm = 1;
 }
elsif ($cname =~ /\.pet$/)
 {
  $doPet = 1;
 }

# Compiled encodings, keyed by name: [$e2u, $u2e, $rep, $min_el, $max_el]
my %encoding;
# Output byte strings collected by &addstrings
my %strings;
my $string_acc;
my %strings_in_acc;

# Byte counters reported on STDERR after the C output is written
my $saved = 0;
my $subsave = 0;
my $strings = 0;
*0Sstevel@tonic-gate
sub cmp_name
{
 # Sort comparator (works on sort's $a and $b).  When both names contain a
 # "-<digits>" part, the number after the last such dash is compared
 # numerically first; ties and all other cases fall back to plain string
 # comparison.
 my ($num_a) = $a =~ /^.*-(\d+)/;
 if (defined $num_a)
  {
   my ($num_b) = $b =~ /^.*-(\d+)/;
   if (defined $num_b)
    {
     my $order = $num_a <=> $num_b;
     return $order if $order;
    }
  }
 return $a cmp $b;
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate
# Compile every input table into %encoding.  A ".enc" suffix selects
# compile_enc; every other file is handed to compile_ucm.
foreach my $enc (sort cmp_name @encfiles)
 {
  my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
  $name = $opt{'n'} if exists $opt{'n'};
  # A file name that matches neither suffix leaves both captures undefined;
  # use empty strings so the fallback to compile_ucm runs without
  # "uninitialized value" warnings.
  $name = '' unless defined $name;
  $sfx  = '' unless defined $sfx;
  if (open(E,'<',$enc))
   {
    if ($sfx eq 'enc')
     {
      compile_enc(\*E,lc($name));
     }
    else
     {
      compile_ucm(\*E,lc($name));
     }
    # Release the handle now instead of relying on the next open(E,...)
    close(E);
   }
  else
   {
    warn "Cannot open $enc for $name:$!";
   }
 }
*0Sstevel@tonic-gate
# Output phase: emit whichever form was selected by the output file's suffix.
if ($doC)
 {
  print STDERR "Writing compiled form\n";
  # Pass 1: aggregate each table and collect its output strings.
  foreach my $name (sort cmp_name keys %encoding)
   {
    my ($e2u,$u2e) = @{$encoding{$name}};
    process($name.'_utf8',$e2u);
    addstrings(\*C,$e2u);

    process('utf8_'.$name,$u2e);
    addstrings(\*C,$u2e);
   }
  outbigstring(\*C,"enctable");
  # Pass 2: write the page tables themselves.
  foreach my $name (sort cmp_name keys %encoding)
   {
    my ($e2u,$u2e) = @{$encoding{$name}};
    outtable(\*C,$e2u, "enctable");
    outtable(\*C,$u2e, "enctable");

    # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
   }
  # Pass 3: one encode_t definition per encoding.
  foreach my $enc (sort cmp_name keys %encoding)
   {
    # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
    my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
    #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
    # Count the leading "\x.." escapes in the replacement string.
    my $replen = 0;
    $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
    my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
    my $sym = "${enc}_encoding";
    $sym =~ s/\W+/_/g;
    print C "encode_t $sym = \n";
    # This is to make null encoding work -- dankogai
    # (every false field, including a zero length, is replaced by 1)
    $_ ||= 1 foreach @info;
    # end of null tweak -- dankogai
    print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
   }

  foreach my $enc (sort cmp_name keys %encoding)
   {
    my $sym = "${enc}_encoding";
    $sym =~ s/\W+/_/g;
    print H "extern encode_t $sym;\n";
    print D " Encode_XSEncoding(aTHX_ &$sym);\n";
   }

  if ($cname =~ /(\w+)\.xs$/)
   {
    my $mod = $1;
    print C <<'END';

static void
Encode_XSEncoding(pTHX_ encode_t *enc)
{
 dSP;
 HV *stash = gv_stashpv("Encode::XS", TRUE);
 SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 int i = 0;
 PUSHMARK(sp);
 XPUSHs(sv);
 while (enc->name[i])
  {
   const char *name = enc->name[i++];
   XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
  }
 PUTBACK;
 call_pv("Encode::define_encoding",G_DISCARD);
 SvREFCNT_dec(sv);
}

END

    print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
    print C "BOOT:\n{\n";
    print C "#include \"$dname\"\n";
    print C "}\n";
   }
  # Close in void context is bad, m'kay
  close(D) or warn "Error closing '$dname': $!";
  close(H) or warn "Error closing '$hname': $!";

  # Percentage of the would-be table size that each optimisation removed.
  # The numerator is the number of bytes saved, and an empty table (nothing
  # emitted, nothing saved) must not divide by zero.
  my $perc_saved    = ($strings + $saved)
                    ? $saved/($strings + $saved) * 100     : 0;
  my $perc_subsaved = ($strings + $subsave)
                    ? $subsave/($strings + $subsave) * 100 : 0;
  printf STDERR "%d bytes in string tables\n",$strings;
  printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
    $saved, $perc_saved              if $saved;
  printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
    $subsave, $perc_subsaved         if $subsave;
 }
elsif ($doEnc)
 {
  foreach my $name (sort cmp_name keys %encoding)
   {
    my ($e2u) = @{$encoding{$name}};
    output_enc(\*C,$name,$e2u);
   }
 }
elsif ($doUcm)
 {
  foreach my $name (sort cmp_name keys %encoding)
   {
    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
    output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
   }
 }

# writing half meg files and then not checking to see if you just filled the
# disk is bad, m'kay
close(C) or die "Error closing '$cname': $!";
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate# End of the main program.
*0Sstevel@tonic-gate
# compile_ucm($fh, $name)
#
# Read a .ucm character map from $fh and store the result in
# $encoding{$name} as [$e2u, $u2e, $erep, $min_el, $max_el].
#   $fh    - open handle positioned at the start of the file
#   $name  - default encoding name; replaced by the file's <code_set_name>
#            unless -n was given
# Dies on a mapping line that does not have three whitespace-separated
# fields.  Prints a "Reading ..." progress line to STDOUT.
sub compile_ucm
{
 my ($fh,$name) = @_;
 my $e2u = {};
 my $u2e = {};
 my $cs;
 my %attr;
 # Header: collect the <keyword> value pairs that precede CHARMAP.
 while (<$fh>)
  {
   s/#.*$//;
   last if /^\s*CHARMAP\s*$/i;
   if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
    {
     $attr{$1} = $2;
    }
  }
 if (!defined($cs =  $attr{'code_set_name'}))
  {
   warn "No <code_set_name> in $name\n";
  }
 else
  {
   $name = $cs unless exists $opt{'n'};
  }
 my $erep;
 my $max_el;
 my $min_el;
 if (exists $attr{'subchar'})
  {
   #my @byte;
   #$attr{'subchar'} =~ /^\s*/cg;
   #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
   #$erep = join('',map(chr(hex($_)),@byte));
   # The replacement is kept in its textual "\xNN" form.
   $erep = $attr{'subchar'};
   $erep =~ s/^\s+//; $erep =~ s/\s+$//;
  }
 # $cs is undefined when the header had no <code_set_name>; print an empty
 # pair of parentheses rather than interpolating undef.
 print "Reading $name (", (defined $cs ? $cs : ''), ")\n";
 my $nfb = 0;
 my $hfb = 0;
 # Body: one "<Uxxxx> \xNN... |f" mapping per line, up to END CHARMAP.
 while (<$fh>)
  {
   s/#.*$//;
   last if /^\s*END\s+CHARMAP\s*$/i;
   next if /^\s*$/;
   my (@uni, @byte) = ();
   my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
       or die "Bad line: $_";
   # A Unicode field may hold several code points ("<U0041+0042>" or
   # consecutive <U....> groups); the leading "U" is stripped from each.
   while ($uni =~  m/\G<([U0-9a-fA-F\+]+)>/g){
       push @uni, map { substr($_, 1) } split(/\+/, $1);
   }
   while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
       push @byte, $1;
   }
   if (@uni)
    {
     my $uch =  join('', map { encode_U(hex($_)) } @uni );
     my $ech = join('',map(chr(hex($_)),@byte));
     my $el  = length($ech);
     $max_el = $el if (!defined($max_el) || $el > $max_el);
     $min_el = $el if (!defined($min_el) || $el < $min_el);
     if (length($fb))
      {
       # Drop the first character of the flag field, keeping the digit.
       $fb = substr($fb,1);
       $hfb++;
      }
     else
      {
       $nfb++;
       $fb = '0';
      }
     # $fb is fallback flag
     # 0 - round trip safe
     # 1 - fallback for unicode -> enc
     # 2 - skip sub-char mapping
     # 3 - fallback enc -> unicode
     enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
     enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
    }
   else
    {
     warn $_;
    }
  }
 if ($nfb && $hfb)
  {
   die "$nfb entries without fallback, $hfb entries with\n";
  }
 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate
# compile_enc($fh, $name)
#
# Read a .enc table from $fh and store the result in $encoding{$name} as
# [$e2u, $u2e, $rep, $min_el, $max_el].
#   $fh    - open handle positioned at the start of the file
#   $name  - name under which the encoding is registered
# File layout as consumed here: leading "#" comment lines, a type line
# (a key of %encode_types, or "E", for which nothing is compiled), a
# "<default> <symbol> <pages>" line, then per page one line holding the page
# number in hex followed by 16 lines of 16 four-digit hex values.
# Dies on a malformed file, and on duplicate mappings when -S is given.
sub compile_enc
{
 my ($fh,$name) = @_;
 my $e2u = {};
 my $u2e = {};

 # Skip leading comment lines; the first other line is the type.
 my $type;
 while ($type = <$fh>)
  {
   last if $type !~ /^\s*#/;
  }
 die "No encoding type line found for $name\n" unless defined $type;
 chomp($type);
 return if $type eq 'E';
 # Do the hash lookup once, rather than once per function call. 4% speedup.
 my $type_func = $encode_types{$type};
 die "Unknown encoding type '$type' for $name\n" unless $type_func;
 my $header = <$fh>;
 die "Missing '<default> <symbol> <pages>' line for $name\n"
   unless defined $header;
 my ($def,$sym,$pages) = split(/\s+/,$header);
 # The page loop below is do/while: it always runs once and only stops when
 # the counter reaches exactly zero, so the count must be a positive integer.
 die "Bad page count in header of $name\n"
   unless defined $pages && $pages =~ /^\d+$/ && $pages > 0;
 warn "$type encoded $name\n";
 my $rep = '';
 # Save a defined test by setting these to defined values.
 my $min_el = ~0; # A very big integer
 my $max_el = 0;  # Anything must be longer than 0
 {
  my $v = hex($def);
  $rep = $type_func->($v & 0xFF, ($v >> 8) & 0xffe);
 }
 my $errors;
 my $seen;
 # use -Q to silence the seen test. Makefile.PL uses this by default.
 $seen = {} unless $opt{Q};
 do
  {
   my $line = <$fh>;
   chomp($line);
   my $page = hex($line);
   # $ch is the byte position within the page.  It runs across all 16 lines
   # and therefore has to advance for every value read, mapped or not.
   my $ch = 0;
   my $i = 16;
   do
    {
     # So why is it 1% faster to leave the my here?
     my $line = <$fh>;
     $line =~ s/\r\n$/\n/;
     die "$.:${line}Line should be exactly 65 characters long including
     newline (".length($line).")" unless length ($line) == 65;
     # Split line into groups of 4 hex digits, convert groups to ints
     # This takes 65.35
     # map {hex $_} $line =~ /(....)/g
     # This takes 63.75 (2.5% less time)
     # unpack "n*", pack "H*", $line
     # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
     # Doing it as while ($line =~ /(....)/g) took 74.63
     foreach my $val (unpack "n*", pack "H*", $line)
      {
       if ($val == 0xFFFD)
        {
         # Unmapped slot: skip it, but still step past its byte position so
         # that the remaining values of the page keep their own codes.
         $ch++;
         next;
        }
       my $ech = $type_func->($ch,$page);
       if ($val || (!$ch && !$page))
        {
         my $el  = length($ech);
         $max_el = $el if $el > $max_el;
         $min_el = $el if $el < $min_el;
         my $uch = encode_U($val);
         if ($seen) {
           # We're doing the test.
           # We don't need to read this quickly, so storing it as a scalar,
           # rather than 3 (anon array, plus the 2 scalars it holds) saves
           # RAM and may make us faster on low RAM systems. [see __END__]
           if (exists $seen->{$uch})
             {
               warn sprintf("U%04X is %02X%02X and %04X\n",
                            $val,$page,$ch,$seen->{$uch});
               $errors++;
             }
           else
             {
               $seen->{$uch} = $page << 8 | $ch;
             }
         }
         # Passing 2 extra args each time is 3.6% slower!
         # Even with having to add $fallback ||= 0 later
         enter_fb0($e2u,$ech,$uch);
         enter_fb0($u2e,$uch,$ech);
        }
       else
        {
         # No character at this position
         # enter($e2u,$ech,undef,$e2u);
        }
       $ch++;
      }
    } while --$i;
  } while --$pages;
 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
   if $min_el > $max_el;
 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
}
*0Sstevel@tonic-gate
# enter($current, $inbytes, $outbytes, $next, $fallback)
#
# Record one mapping in the RAW tree rooted at $current: walk (creating as
# needed) one "do_now" per byte of $inbytes, and on the last byte store
# $outbytes and the state to move to.
#   $current   - hash ref of the state to start from
#   $inbytes   - input byte sequence (must not be empty)
#   $outbytes  - bytes to emit once the whole sequence has been read
#   $next      - state to shift to afterwards; defaults to $current
#   $fallback  - fallback flag stored in every "do_now" created here
# An empty $inbytes is reported with a warning and ignored.
sub enter {
  my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
  # state we shift to after this (multibyte) input character defaults to same
  # as current state.
  $next ||= $current;
  # Making sure it is defined seems to be faster than {no warnings;} in
  # &process, or passing it in as 0 explicitly.
  # XXX $fallback ||= 0;

  # Start at the beginning and work forwards through the string to zero.
  # effectively we are removing 1 character from the front each time
  # but we don't actually edit the string. [this alone seems to be 14% speedup]
  # Hence -$pos is the length of the remaining string.
  my $pos = -length $inbytes;
  # The loop below only ends when ++$pos reaches zero.  Starting from zero
  # (no input bytes) it would count upwards for ever, so reject that here.
  unless ($pos) {
    warn "enter: ignoring mapping with an empty input sequence\n";
    return;
  }
  while (1) {
    my $byte = substr $inbytes, $pos, 1;
    #  RAW_NEXT => 0,
    #  RAW_IN_LEN => 1,
    #  RAW_OUT_BYTES => 2,
    #  RAW_FALLBACK => 3,
    # to unicode an array would seem to be better, because the pages are dense.
    # from unicode can be very sparse, favouring a hash.
    # hash using the bytes (all length 1) as keys rather than ord value,
    # as it's easier to sort these in &process.

    # It's faster to always add $fallback even if it's undef, rather than
    # choosing between 3 and 4 element array.
    my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
    # When $pos was -1 we were at the last input character.
    unless (++$pos) {
      $do_now->[RAW_OUT_BYTES] = $outbytes;
      $do_now->[RAW_NEXT] = $next;
      return;
    }
    # Tail recursion. The intermediate state may not have a name yet.
    $current = $do_now->[RAW_NEXT];
  }
}
*0Sstevel@tonic-gate
# This is purely for optimisation. It's just &enter hard coded for $fallback
# of 0, using only a 3 entry array ref to save memory for every entry.
#
# enter_fb0($current, $inbytes, $outbytes, $next)
#   $current   - hash ref of the state to start from
#   $inbytes   - input byte sequence (must not be empty)
#   $outbytes  - bytes to emit once the whole sequence has been read
#   $next      - state to shift to afterwards; defaults to $current
# An empty $inbytes is reported with a warning and ignored.
sub enter_fb0 {
  my ($current,$inbytes,$outbytes,$next) = @_;
  $next ||= $current;

  # -$pos is the number of input bytes still to be consumed.
  my $pos = -length $inbytes;
  # The loop below only ends when ++$pos reaches zero.  Starting from zero
  # (no input bytes) it would count upwards for ever, so reject that here.
  unless ($pos) {
    warn "enter_fb0: ignoring mapping with an empty input sequence\n";
    return;
  }
  while (1) {
    my $byte = substr $inbytes, $pos, 1;
    # No fallback slot: &process treats the missing element as 0.
    my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
    # When $pos was -1 we were at the last input character.
    unless (++$pos) {
      $do_now->[RAW_OUT_BYTES] = $outbytes;
      $do_now->[RAW_NEXT] = $next;
      return;
    }
    $current = $do_now->[RAW_NEXT];
  }
}
*0Sstevel@tonic-gate
# process($name, $table)
#
# Turn the RAW "do_now" entries of $table into AGG entries: runs of
# consecutive input bytes that share next state, input length, output length
# and fallback flag are merged into one entry whose output bytes are
# concatenated.  Gives the table its C name, recurses into every state
# reachable from it that has no name yet, and stores the result in
# $table->{Entries}.
sub process
{
  my ($cname,$table) = @_;
  $cname =~ s/\W+/_/g;
  $table->{Cname} = $cname;
  my $raw = $table->{Raw};
  my @entries;
  # The aggregate currently being extended, plus copies of the fields that
  # decide whether the next byte may join it.
  my $run;
  my $run_max_in = 0;
  my ($run_next, $run_in_len, $run_out_len, $run_fallback);
  foreach my $key (sort keys %$raw) {
    # RAW layout: RAW_NEXT, RAW_IN_LEN, RAW_OUT_BYTES, RAW_FALLBACK
    my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
    # From here on the input byte is handled as a number, not a 1 byte string
    my $byte = ord $key;
    $fallback ||= 0;
    # The ++ is harmless when the test fails: the else branch below resets
    # $run_max_in.  References in numeric context compare by address.
    my $extends_run = $run
                   && $byte == ++$run_max_in
                   && $run_next == $next
                   && $run_in_len == $in_len
                   && $run_out_len == length $out_bytes
                   && $run_fallback == $fallback;
    if ($extends_run) {
      # Append this byte to the current aggregate.
      $run->[AGG_MAX_IN] = $byte;
      $run->[AGG_OUT_BYTES] .= $out_bytes;
    } else {
      # Start a new aggregate and remember its deciding fields.
      $run_max_in   = $byte;
      $run_next     = $next;
      $run_in_len   = $in_len;
      $run_out_len  = length $out_bytes;
      $run_fallback = $fallback;
      # AGG layout: AGG_MIN_IN, AGG_MAX_IN, AGG_OUT_BYTES, AGG_NEXT,
      #             AGG_IN_LEN, AGG_OUT_LEN, AGG_FALLBACK
      $run = [$byte, $run_max_in, $out_bytes, $run_next,
              $run_in_len, $run_out_len, $run_fallback];
      push @entries, $run;
    }
    if (!exists $next->{Cname}) {
      # Not named yet, so not processed yet: descend into it.
      process(sprintf("%s_%02x",$cname,$byte),$next);
    } elsif ($next != $table) {
      # Already named and a different table: it needs a forward declaration.
      $next->{'Forward'} = 1;
    }
  }
  # encengine.c rules say that last entry must be for 255
  push @entries, [1+$run_max_in, 255,undef,$table,0,0] if $run_max_in < 255;
  $table->{'Entries'} = \@entries;
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate
# addstrings($fh, $a)
#
# Walks the table $a and every table reachable from it, recording each
# non-empty output byte string as a key of %strings (values are unused).
# A table flagged 'Forward' additionally gets a forward declaration
# written to $fh.  'DoneStrings' marks visited tables so that cyclic
# references do not recurse forever.
sub addstrings
{
 my ($fh,$a) = @_;
 my $cname = $a->{'Cname'};

 # Collect the output strings of this table.
 for my $ent (@{$a->{'Entries'}})
  {
   $strings{$ent->[AGG_OUT_BYTES]} = undef if $ent->[AGG_OUT_LEN];
  }

 # Emit a forward declaration when another table refers to this one.
 if ($a->{'Forward'})
  {
   my $storage = ($^O eq 'MacOS') ? 'extern' : 'static';
   my $count   = scalar(@{$a->{'Entries'}});
   print $fh "$storage encpage_t $cname\[$count];\n";
  }

 # Mark before descending: entries may point back at this table.
 $a->{'DoneStrings'} = 1;
 for my $ent (@{$a->{'Entries'}})
  {
   my $next = $ent->[AGG_NEXT];
   addstrings($fh,$next) unless $next->{'DoneStrings'};
  }
}
*0Sstevel@tonic-gate
# outbigstring($fh, $name)
#
# Packs every key of %strings into one byte string ($string_acc), records
# the offset of each key in %strings_in_acc, and writes the result to $fh
# as the C array "static const U8 $name[...]".
#
# Updates the file-level counters $saved (bytes saved because a string was
# already contained in the accumulator), $subsave (bytes saved by the -O
# overlap optimiser) and $strings (final accumulator length).
sub outbigstring
{
  my ($fh,$name) = @_;

  $string_acc = '';

  # Make the big string in the string accumulator. Longest first, on the hope
  # that this makes it more likely that we find the short strings later on.
  # Not sure if it helps sorting strings of the same length lexically.
  foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
    my $index = index $string_acc, $s;
    if ($index >= 0) {
      # Already present as a substring: just remember where.
      $saved += length($s);
      $strings_in_acc{$s} = $index;
    } else {
    OPTIMISER: {
        if ($opt{'O'}) {
          my $sublength = length $s;
          while (--$sublength > 0) {
            # progressively lop characters off the end, to see if the start of
            # the new string overlaps the end of the accumulator.
            if (substr ($string_acc, -$sublength)
                eq substr ($s, 0, $sublength)) {
              $subsave += $sublength;
              $strings_in_acc{$s} = length ($string_acc) - $sublength;
              # append the last bit on the end.
              $string_acc .= substr ($s, $sublength);
              last OPTIMISER;
            }
            # or if the end of the new string overlaps the start of the
            # accumulator
            next unless substr ($string_acc, 0, $sublength)
              eq substr ($s, -$sublength);
            # well, the first $sublength characters of the accumulator match
            # the tail of the new string.  As we're prepending to the
            # accumulator, we need to shift all our existing offsets forwards.
            $_ += $sublength foreach values %strings_in_acc;
            $subsave += $sublength;
            $strings_in_acc{$s} = 0;
            # prepend the first bit on the start.
            $string_acc = substr ($s, 0, -$sublength) . $string_acc;
            last OPTIMISER;
          }
        }
        # Optimiser (if it ran) found nothing, so just going have to tack the
        # whole thing on the end.
        $strings_in_acc{$s} = length $string_acc;
        $string_acc .= $s;
      };
    }
  }

  $strings = length $string_acc;
  my $definition = "\nstatic const U8 $name\[$strings] = { " .
    join(',',unpack "C*",$string_acc);
  # We have a single long line. Split it at convenient commas.
  print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
  # pos() is still undef if the definition was too short for the split above
  # to match even once; start from 0 then instead of handing undef to substr.
  print $fh substr ($definition, pos($definition) || 0), " };\n";
}
*0Sstevel@tonic-gate
# findstring($name, $s)
#
# Returns the C expression "$name + OFFSET" locating the byte string $s,
# where OFFSET is looked up in %strings_in_acc.  Dies, listing the bytes
# of $s in decimal, if $s has no recorded offset.
sub findstring {
  my ($name,$s) = @_;
  if (defined(my $where = $strings_in_acc{$s})) {
    return "$name + $where";
  }
  my $bytes = join(',', unpack("C*", $s));
  die "Can't find string $bytes in accumulator";
}
*0Sstevel@tonic-gate
# outtable($fh, $a, $bigname)
#
# Writes the C definition "static encpage_t <Cname>[N] = { ... };" for the
# table $a to $fh, after first writing every not-yet-written table that its
# entries point to.  $bigname is the name of the big string array; entries
# with output bytes refer to it through findstring().
sub outtable
{
 my ($fh,$a,$bigname) = @_;
 my $name = $a->{'Cname'};
 # Mark before recursing: entries may refer back to this table.
 $a->{'Done'} = 1;
 foreach my $b (@{$a->{'Entries'}})
  {
   my $t = $b->[AGG_NEXT];
   outtable($fh,$t,$bigname) unless $t->{'Done'};
  }
 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
 foreach my $b (@{$a->{'Entries'}})
  {
   # Slots follow the AGG_* layout; the fallback slot is not emitted.
   my ($sc,$ec,$out,$t,$in_len,$out_len) = @$b;
   # $in_len |= 0x80 if $fb; # what the heck was on your mind, Nick?  -- Dan
   print  $fh "{";
   if ($out_len)
    {
     # print, not printf: the string is data and must never be treated as
     # a format.
     print  $fh findstring($bigname,$out);
    }
   else
    {
     print  $fh "0";
    }
   print  $fh ",",$t->{Cname};
   # The lengths are passed as arguments rather than interpolated into the
   # format string.
   printf $fh ",0x%02x,0x%02x,%s,%s},\n",$sc,$ec,$out_len,$in_len;
  }
 print $fh "};\n";
}
*0Sstevel@tonic-gate
# output_enc($fh, $name, $a)
#
# Not usable at present: it dies unconditionally.  The loop after the die
# is unreachable.
sub output_enc
{
 my ($fh,$name,$a) = @_;
 die "Changed - fix me for new structure";
 # Never reached.
 for my $key (sort keys %$a)
  {
   my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$key}};
  }
}
*0Sstevel@tonic-gate
# decode_U($s)
#
# Placeholder: performs no decoding and simply hands back its first
# argument.
sub decode_U
{
 my $s = shift;
 return $s;
}
*0Sstevel@tonic-gate
# Character names indexed by code point; filled in by char_names().
my @uname;

# char_names()
#
# Loads unicore/Name.pl (via "do", so it is searched for in @INC) and fills
# @uname from the table it returns.  Each table line is expected to look
# like "START<TAB>END<TAB>NAME" with hexadecimal START and an optional
# hexadecimal END.  Processing stops at the first entry whose START is
# 0x10000 or above.
#
# Dies if the file cannot be read or compiled.
sub char_names
{
 my $table = do "unicore/Name.pl";
 # "do FILE" reports compile errors in $@ and read errors in $!.
 die "char_names: unicore/Name.pl: " . ($@ || $!) . "\n"
   unless defined $table;
 pos($table) = 0;
 while ($table =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
  {
   my $name  = $3;
   my $first = hex($1);
   last if $first >= 0x10000;
   # An empty END column means a single code point.
   my $last = length($2) ? hex($2) : $first;
   for (my $i = $first; $i <= $last; $i++)
    {
     $uname[$i] = $name;
    }
  }
}
*0Sstevel@tonic-gate
# output_ucm_page($cmap, $a, $t, $pre)
#
# Recursively walks the raw table $t and appends one UCM line
# ("<UXXXX> \xNN... |F # name\n") to the array referenced by $cmap for
# every entry that yields output bytes.
#
#   $cmap - array ref collecting the generated lines
#   $a    - the top-level table; an entry whose next table is $a or $t is
#           treated as a leaf
#   $t    - the table to walk on this call
#   $pre  - code point bits accumulated from the bytes consumed so far
#           (0 on the outermost call)
#
# The 6-bit shift and 0x3F mask presumably reflect UTF-8 input bytes; the
# caller is not visible from here, so confirm before relying on it.
sub output_ucm_page
{
  my ($cmap,$a,$t,$pre) = @_;
  # warn sprintf("Page %x\n",$pre);
  my $raw = $t->{Raw};
  foreach my $key (sort keys %$raw) {
    # Raw entries are laid out as RAW_NEXT, RAW_IN_LEN, RAW_OUT_BYTES,
    # RAW_FALLBACK.
    my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
    my $u = ord $key;
    $fallback ||= 0;

    if ($next != $a && $next != $t) {
      # More input bytes follow: fold the low 6 bits of this byte into the
      # prefix and descend, keeping the result within 16 bits.
      output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
    } elsif (length $out_bytes) {
      if ($pre) {
        $u = $pre|($u &0x3f);
      }
      my $s = sprintf "<U%04X> ",$u;
      # One \xNN escape per output byte, formatted in a single sprintf.
      $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
      # Code points with no entry in @uname get an empty comment rather
      # than an "uninitialized value" warning.
      my $char_name = defined $uname[$u] ? $uname[$u] : '';
      $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$char_name;
      push(@$cmap,$s);
    } else {
      # Leaf entry without output bytes: report it for diagnosis.
      warn join(',',$u, @{$raw->{$key}},$a,$t);
    }
  }
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gatesub output_ucm
*0Sstevel@tonic-gate{
*0Sstevel@tonic-gate my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
*0Sstevel@tonic-gate print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
*0Sstevel@tonic-gate print $fh "<code_set_name> \"$name\"\n";
*0Sstevel@tonic-gate char_names();
*0Sstevel@tonic-gate if (defined $min_el)
*0Sstevel@tonic-gate  {
*0Sstevel@tonic-gate   print $fh "<mb_cur_min> $min_el\n";
*0Sstevel@tonic-gate  }
*0Sstevel@tonic-gate if (defined $max_el)
*0Sstevel@tonic-gate  {
*0Sstevel@tonic-gate   print $fh "<mb_cur_max> $max_el\n";
*0Sstevel@tonic-gate  }
*0Sstevel@tonic-gate if (defined $rep)
*0Sstevel@tonic-gate  {
*0Sstevel@tonic-gate   print $fh "<subchar> ";
*0Sstevel@tonic-gate   foreach my $c (split(//,$rep))
*0Sstevel@tonic-gate    {
*0Sstevel@tonic-gate     printf $fh "\\x%02X",ord($c);
*0Sstevel@tonic-gate    }
*0Sstevel@tonic-gate   print $fh "\n";
*0Sstevel@tonic-gate  }
*0Sstevel@tonic-gate my @cmap;
*0Sstevel@tonic-gate output_ucm_page(\@cmap,$h,$h,0);
*0Sstevel@tonic-gate print $fh "#\nCHARMAP\n";
*0Sstevel@tonic-gate foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
*0Sstevel@tonic-gate  {
*0Sstevel@tonic-gate   print $fh $line;
*0Sstevel@tonic-gate  }
*0Sstevel@tonic-gate print $fh "END CHARMAP\n";
*0Sstevel@tonic-gate}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateuse vars qw(
*0Sstevel@tonic-gate    $_Enc2xs
*0Sstevel@tonic-gate    $_Version
*0Sstevel@tonic-gate    $_Inc
*0Sstevel@tonic-gate    $_E2X
*0Sstevel@tonic-gate    $_Name
*0Sstevel@tonic-gate    $_TableFiles
*0Sstevel@tonic-gate    $_Now
*0Sstevel@tonic-gate);
*0Sstevel@tonic-gate
# find_e2x()
#
# Searches every @INC directory except '.' for plain files ending in .e2x.
# For each directory holding such a file, the mtime of the first one seen
# is remembered.  The directory with the smallest remembered mtime is
# stored in $_E2X and returned.  The list of candidate directories is
# printed with warn.
sub find_e2x{
    eval { require File::Find; };
    my @search_dirs = grep { $_ ne '.' } @INC;   # skip current dir
    my %mtime_of;
    my $wanted = sub {
        my @st = lstat($_) or return;
        -f _ or return;
        if (/^.*\.e2x$/o){
            no warnings 'once';
            $mtime_of{$File::Find::dir} ||= $st[9];   # slot 9 is mtime
        }
        return;
    };
    File::Find::find($wanted, @search_dirs);
    warn join("\n", keys %mtime_of), "\n";
    for my $dir (sort {$mtime_of{$a} <=> $mtime_of{$b}} keys %mtime_of){
        # Only the first directory in sort order is ever used.
        # warn "$dir => ", scalar localtime($mtime_of{$dir});
        return $_E2X = $dir;
    }
}
*0Sstevel@tonic-gate
# make_makefile_pl($name, @table_files)
#
# Implements "enc2xs -M": sets the package globals that the .e2x templates
# refer to, expands each template into the corresponding file of a new
# module skeleton, then exits.  Dies if Encode cannot be loaded.
sub make_makefile_pl
{
    eval { require Encode; };
    die "You need to install Encode to use enc2xs -M\nerror: $@\n" if $@;

    # Package globals ("use vars"), used for variable expansion in the
    # templates.
    my ($modname, @tables) = @_;
    $_Enc2xs     = $0;
    $_Version    = $VERSION;
    $_E2X        = find_e2x();
    $_Name       = $modname;
    $_TableFiles = join(",", map {qq('$_')} @tables);
    $_Now        = scalar localtime();

    eval { require File::Spec; };
    # Template file in $_E2X => file to generate.
    my @jobs = (
        ["Makefile_PL.e2x", "Makefile.PL"],
        ["_PM.e2x",         "$_Name.pm"],
        ["_T.e2x",          "t/$_Name.t"],
        ["README.e2x",      "README"],
        ["Changes.e2x",     "Changes"],
    );
    for my $job (@jobs){
        my ($template, $target) = @$job;
        _print_expand(File::Spec->catfile($_E2X,$template), $target);
    }
    exit;
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateuse vars qw(
*0Sstevel@tonic-gate	    $_ModLines
*0Sstevel@tonic-gate	    $_LocalVer
*0Sstevel@tonic-gate	    );
*0Sstevel@tonic-gate
# make_configlocal_pm()
#
# Implements "enc2xs -C": loads every not-yet-loaded Encode/*.pm found in
# the @INC directories and notes which module first provides each encoding
# that is neither built in nor already listed in %Encode::Config::ExtModule.
# It then writes ConfigLocal.pm next to the loaded Encode.pm (overwriting
# any existing file) and exits.  Dies if Encode or any discovered module
# fails to load.
sub make_configlocal_pm
{
    eval { require Encode; };
    die "Unable to require Encode: $@\n" if $@;
    eval { require File::Spec; };

    my %in_core = ('ascii' => 1, 'iso-8859-1' => 1, 'utf8' => 1);
    my %local_mod;
    foreach my $libdir (@INC){
        my $encdir = File::Spec->catfile($libdir, "Encode");
        next unless -d $encdir;
        opendir(my $dh, $encdir) or die "$encdir:$!";
        warn "Checking $encdir...\n";
        foreach my $file (grep /\.pm$/o, readdir($dh)){
            next unless -f File::Spec->catfile($encdir, "$file");
            my $relpath = "Encode/$file";
            next if $INC{$relpath};          # already loaded
            warn "require $relpath;\n";
            eval { require $relpath; };
            die "Can't require $relpath: $@\n" if $@;

            # Encode/Foo.pm -> Encode::Foo
            (my $module = $relpath) =~ s/\.pm$//o;
            $module =~ s,/,::,og;

            foreach my $enc (Encode->encodings()){
                no warnings 'once';
                next if $in_core{$enc};
                next if $Encode::Config::ExtModule{$enc};
                # The first module seen for an encoding wins.
                $local_mod{$enc} ||= $module;
            }
        }
    }

    # Package globals ("use vars"), used for variable expansion in the
    # template.
    $_ModLines = join '',
        map { qq(\$Encode::ExtModule{'$_'} =\t"$local_mod{$_}";\n) }
        sort keys %local_mod;
    warn $_ModLines;
    $_LocalVer = _mkversion();
    $_E2X = find_e2x();
    ($_Inc = $INC{"Encode.pm"}) =~ s/\.pm$//o;
    my $template = File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x");
    my $target   = File::Spec->catfile($_Inc,"ConfigLocal.pm");
    _print_expand($template, $target, 1);
    exit;
}
*0Sstevel@tonic-gate
# _mkversion()
#
# Returns a version string built from the current local time, in the form
# "vYYYY.MMDD.HHMM" (each group zero-padded to four digits).
sub _mkversion{
    my @now = localtime();
    my ($minute, $hour, $mday) = @now[1, 2, 3];
    my $month = $now[4] + 1;       # localtime months are 0-based
    my $year  = $now[5] + 1900;    # localtime years count from 1900
    return sprintf("v%04d.%04d.%04d",
                   $year, $month * 100 + $mday, $hour * 100 + $minute);
}
*0Sstevel@tonic-gate
# _print_expand($src, $dst, $clobber)
#
# Copies the template file $src to $dst.  Until a line starting with
# "#### END_OF_HEADER" is seen, every "$_Name_"-style token (a variable
# name beginning with "$_" and an upper-case letter, followed by "_") is
# replaced by the value of the variable of that name.  The marker line is
# dropped and everything after it is copied verbatim.
#
#   $src     - template to read
#   $dst     - file to create; a missing parent directory is created
#              (one level only)
#   $clobber - when false, an existing $dst is left alone and the call
#              returns after a warning
#
# Dies if File::Basename is unavailable, or if the template cannot be
# read, the directory cannot be created, or the output cannot be written.
sub _print_expand{
    eval { require File::Basename; };
    $@ and die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
    File::Basename->import();
    my ($src, $dst, $clobber) = @_;
    if (!$clobber and -e $dst){
        warn "$dst exists. skipping\n";
        return;
    }
    warn "Generating $dst...\n";
    # Three-argument open: the paths are data and must never be parsed for
    # a mode or a pipe.
    open my $in, '<', $src or die "$src : $!";
    if ((my $d = File::Basename::dirname($dst)) ne '.'){
        -d $d or mkdir $d, 0755 or die  "mkdir $d : $!";
    }
    open my $out, '>', $dst or die "$dst : $!";
    my $asis = 0;
    while (my $line = <$in>){
        if ($line =~ /^#### END_OF_HEADER/){
            $asis = 1; next;
        }
        # NOTE: /ee string-evals text taken from the template, so templates
        # must come from a trusted location.
        $line =~ s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
        print $out $line;
    }
    close $in;
    # Buffered write errors only surface at close.
    close $out or die "$dst : $!";
    return;
}
*0Sstevel@tonic-gate__END__
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 NAME
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateenc2xs -- Perl Encode Module Generator
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 SYNOPSIS
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  enc2xs -[options]
*0Sstevel@tonic-gate  enc2xs -M ModName mapfiles...
*0Sstevel@tonic-gate  enc2xs -C
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 DESCRIPTION
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateF<enc2xs> builds a Perl extension for use by Encode from either
*0Sstevel@tonic-gateUnicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
*0Sstevel@tonic-gateBesides being used internally during the build process of the Encode
*0Sstevel@tonic-gatemodule, you can use F<enc2xs> to add your own encoding to perl.
*0Sstevel@tonic-gateNo knowledge of XS is necessary.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 Quick Guide
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateIf you want to know as little about Perl as possible but need to
*0Sstevel@tonic-gateadd a new encoding, just read this chapter and forget the rest.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=over 4
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item 0.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateHave a .ucm file ready.  You can get it from somewhere or you can write
*0Sstevel@tonic-gateyour own from scratch or you can grab one from the Encode distribution
*0Sstevel@tonic-gateand customize it.  For the UCM format, see the next Chapter.  In the
*0Sstevel@tonic-gateexample below, I'll call my theoretical encoding myascii, defined
*0Sstevel@tonic-gatein I<my.ucm>.  C<$> is a shell prompt.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  $ ls -F
*0Sstevel@tonic-gate  my.ucm
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item 1.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateIssue a command as follows;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  $ enc2xs -M My my.ucm
*0Sstevel@tonic-gate  generating Makefile.PL
*0Sstevel@tonic-gate  generating My.pm
*0Sstevel@tonic-gate  generating README
*0Sstevel@tonic-gate  generating Changes
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateNow take a look at your current directory.  It should look like this.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  $ ls -F
*0Sstevel@tonic-gate  Makefile.PL   My.pm         my.ucm        t/
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe following files were created.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  Makefile.PL - MakeMaker script
*0Sstevel@tonic-gate  My.pm       - Encode submodule
*0Sstevel@tonic-gate  t/My.t      - test file
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=over 4
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item 1.1.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateIf you want *.ucm installed together with the modules, do as follows;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  $ mkdir Encode
*0Sstevel@tonic-gate  $ mv *.ucm Encode
*0Sstevel@tonic-gate  $ enc2xs -M My Encode/*ucm
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=back
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item 2.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateEdit the files generated.  You don't have to if you have no time AND no
*0Sstevel@tonic-gateintention to give it to someone else.  But it is a good idea to edit
*0Sstevel@tonic-gatethe pod and to add more tests.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item 3.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateNow issue a command all Perl Mongers love:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  $ perl Makefile.PL
*0Sstevel@tonic-gate  Writing Makefile for Encode::My
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item 4.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateNow all you have to do is make.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  $ make
*0Sstevel@tonic-gate  cp My.pm blib/lib/Encode/My.pm
*0Sstevel@tonic-gate  /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
*0Sstevel@tonic-gate    -o encode_t.c -f encode_t.fnm
*0Sstevel@tonic-gate  Reading myascii (myascii)
*0Sstevel@tonic-gate  Writing compiled form
*0Sstevel@tonic-gate  128 bytes in string tables
*0Sstevel@tonic-gate  384 bytes (25%) saved spotting duplicates
*0Sstevel@tonic-gate  1 bytes (99.2%) saved using substrings
*0Sstevel@tonic-gate  ....
*0Sstevel@tonic-gate  chmod 644 blib/arch/auto/Encode/My/My.bs
*0Sstevel@tonic-gate  $
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe time it takes varies depending on how fast your machine is and
*0Sstevel@tonic-gatehow large your encoding is.  Unless you are working on something big
*0Sstevel@tonic-gatelike euc-tw, it won't take too long.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item 5.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateYou can "make install" already but you should test first.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  $ make test
*0Sstevel@tonic-gate  PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
*0Sstevel@tonic-gate    -e 'use Test::Harness  qw(&runtests $verbose); \
*0Sstevel@tonic-gate    $verbose=0; runtests @ARGV;' t/*.t
*0Sstevel@tonic-gate  t/My....ok
*0Sstevel@tonic-gate  All tests successful.
*0Sstevel@tonic-gate  Files=1, Tests=2,  0 wallclock secs
*0Sstevel@tonic-gate   ( 0.09 cusr + 0.01 csys = 0.09 CPU)
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item 6.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateIf you are content with the test result, just "make install"
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item 7.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateIf you want to add your encoding to Encode's demand-loading list
*0Sstevel@tonic-gate(so you don't have to "use Encode::YourEncoding"), run
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  enc2xs -C
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateto update Encode::ConfigLocal, a module that controls local settings.
*0Sstevel@tonic-gateAfter that, "use Encode;" is enough to load your encodings on demand.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=back
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 The Unicode Character Map
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateEncode uses the Unicode Character Map (UCM) format for source character
*0Sstevel@tonic-gatemappings.  This format is used by IBM's ICU package and was adopted
*0Sstevel@tonic-gateby Nick Ing-Simmons for use with the Encode module.  Since UCM is
*0Sstevel@tonic-gatemore flexible than Tcl's Encoding Map and far more user-friendly,
this is the recommended format for Encode now.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateA UCM file looks like this.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  #
*0Sstevel@tonic-gate  # Comments
*0Sstevel@tonic-gate  #
*0Sstevel@tonic-gate  <code_set_name> "US-ascii" # Required
*0Sstevel@tonic-gate  <code_set_alias> "ascii"   # Optional
*0Sstevel@tonic-gate  <mb_cur_min> 1             # Required; usually 1
*0Sstevel@tonic-gate  <mb_cur_max> 1             # Max. # of bytes/char
*0Sstevel@tonic-gate  <subchar> \x3F             # Substitution char
*0Sstevel@tonic-gate  #
*0Sstevel@tonic-gate  CHARMAP
*0Sstevel@tonic-gate  <U0000> \x00 |0 # <control>
*0Sstevel@tonic-gate  <U0001> \x01 |0 # <control>
*0Sstevel@tonic-gate  <U0002> \x02 |0 # <control>
*0Sstevel@tonic-gate  ....
*0Sstevel@tonic-gate  <U007C> \x7C |0 # VERTICAL LINE
*0Sstevel@tonic-gate  <U007D> \x7D |0 # RIGHT CURLY BRACKET
*0Sstevel@tonic-gate  <U007E> \x7E |0 # TILDE
*0Sstevel@tonic-gate  <U007F> \x7F |0 # <control>
*0Sstevel@tonic-gate  END CHARMAP
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=over 4
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateAnything that follows C<#> is treated as a comment.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe header section continues until a line containing the word
*0Sstevel@tonic-gateCHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
*0Sstevel@tonic-gatepair per line.  Strings used as values must be quoted. Barewords are
*0Sstevel@tonic-gatetreated as numbers.  I<\xXX> represents a byte.
*0Sstevel@tonic-gate
Most of the keywords are self-explanatory. I<subchar> means
substitution character, not subcharacter.  When you encode a Unicode
sequence to this encoding but no matching character is found, the byte
sequence defined here will be used.  For most cases, the value here is
\x3F; in ASCII, this is a question mark.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateCHARMAP starts the character map section.  Each line has a form as
*0Sstevel@tonic-gatefollows:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  <UXXXX> \xXX.. |0 # comment
*0Sstevel@tonic-gate    ^     ^      ^
*0Sstevel@tonic-gate    |     |      +- Fallback flag
*0Sstevel@tonic-gate    |     +-------- Encoded byte sequence
*0Sstevel@tonic-gate    +-------------- Unicode Character ID in hex
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe format is roughly the same as a header section except for the
*0Sstevel@tonic-gatefallback flag: | followed by 0..3.   The meaning of the possible
*0Sstevel@tonic-gatevalues is as follows:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=over 4
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item |0
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateRound trip safe.  A character decoded to Unicode encodes back to the
*0Sstevel@tonic-gatesame byte sequence.  Most characters have this flag.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item |1
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateFallback for unicode -> encoding.  When seen, enc2xs adds this
*0Sstevel@tonic-gatecharacter for the encode map only.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item |2
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateSkip sub-char mapping should there be no code point.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item |3
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateFallback for encoding -> unicode.  When seen, enc2xs adds this
*0Sstevel@tonic-gatecharacter for the decode map only.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=back
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
And finally, END CHARMAP ends the section.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=back
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateWhen you are manually creating a UCM file, you should copy ascii.ucm
*0Sstevel@tonic-gateor an existing encoding which is close to yours, rather than write
*0Sstevel@tonic-gateyour own from scratch.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateWhen you do so, make sure you leave at least B<U0000> to B<U0020> as
*0Sstevel@tonic-gateis, unless your environment is EBCDIC.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateB<CAVEAT>: not all features in UCM are implemented.  For example,
*0Sstevel@tonic-gateicu:state is not used.  Because of that, you need to write a perl
*0Sstevel@tonic-gatemodule if you want to support algorithmical encodings, notably
*0Sstevel@tonic-gatethe ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
*0Sstevel@tonic-gateL<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 Coping with duplicate mappings
*0Sstevel@tonic-gate
When you create a map, you SHOULD make your mappings round-trip safe.
That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
$data> holds for all characters that are marked as C<|0>.  Here is
how to make sure:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=over 4
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateSort your map in Unicode order.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateWhen you have a duplicate entry, mark either one with '|1' or '|3'.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateAnd make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=back
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateHere is an example from big5-eten.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  <U2550> \xF9\xF9 |0
*0Sstevel@tonic-gate  <U2550> \xA2\xA4 |3
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateInternally Encoding -> Unicode and Unicode -> Encoding Map looks like
*0Sstevel@tonic-gatethis;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  E to U               U to E
*0Sstevel@tonic-gate  --------------------------------------
*0Sstevel@tonic-gate  \xF9\xF9 => U2550    U2550 => \xF9\xF9
*0Sstevel@tonic-gate  \xA2\xA4 => U2550
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateSo it is round-trip safe for \xF9\xF9.  But if the line above is upside
*0Sstevel@tonic-gatedown, here is what happens.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate  E to U               U to E
*0Sstevel@tonic-gate  --------------------------------------
*0Sstevel@tonic-gate  \xA2\xA4 => U2550    U2550 => \xF9\xF9
*0Sstevel@tonic-gate  (\xF9\xF9 => U2550 is now overwritten!)
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe Encode package comes with F<ucmlint>, a crude but sufficient
*0Sstevel@tonic-gateutility to check the integrity of a UCM file.  Check under the
*0Sstevel@tonic-gateEncode/bin directory for this.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 Bookmarks
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=over 4
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateICU Home Page
*0Sstevel@tonic-gateL<http://oss.software.ibm.com/icu/>
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateICU Character Mapping Tables
*0Sstevel@tonic-gateL<http://oss.software.ibm.com/icu/charset/>
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=item *
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateICU:Conversion Data
*0Sstevel@tonic-gateL<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=back
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 SEE ALSO
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateL<Encode>,
*0Sstevel@tonic-gateL<perlmod>,
*0Sstevel@tonic-gateL<perlpod>
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate# -Q to disable the duplicate codepoint test
*0Sstevel@tonic-gate# -S make mapping errors fatal
*0Sstevel@tonic-gate# -q to remove comments written to output files
*0Sstevel@tonic-gate# -O to enable the (brute force) substring optimiser
*0Sstevel@tonic-gate# -o <output> to specify the output file name (else it's the first arg)
*0Sstevel@tonic-gate# -f <inlist> to give a file with a list of input files (else use the args)
# -n <name> to name the encoding (else use the basename of the input file).
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateWith %seen holding array refs:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate      865.66 real        28.80 user         8.79 sys
*0Sstevel@tonic-gate      7904  maximum resident set size
*0Sstevel@tonic-gate      1356  average shared memory size
*0Sstevel@tonic-gate     18566  average unshared data size
*0Sstevel@tonic-gate       229  average unshared stack size
*0Sstevel@tonic-gate     46080  page reclaims
*0Sstevel@tonic-gate     33373  page faults
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateWith %seen holding simple scalars:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate      342.16 real        27.11 user         3.54 sys
*0Sstevel@tonic-gate      8388  maximum resident set size
*0Sstevel@tonic-gate      1394  average shared memory size
*0Sstevel@tonic-gate     14969  average unshared data size
*0Sstevel@tonic-gate       236  average unshared stack size
*0Sstevel@tonic-gate     28159  page reclaims
*0Sstevel@tonic-gate      9839  page faults
*0Sstevel@tonic-gate
Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
how %seen is storing things it's seen. So it is pathologically bad on a 16M
RAM machine, but it's going to help even on modern machines.
Swapping is bad, m'kay :-)