xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/ext/Encode/bin/enc2xs (revision 0:68f95e015346)
1*0Sstevel@tonic-gate#!./perl
2*0Sstevel@tonic-gateBEGIN {
3*0Sstevel@tonic-gate    # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's
4*0Sstevel@tonic-gate    # with $ENV{PERL_CORE} set
5*0Sstevel@tonic-gate    # In case we need it in future...
6*0Sstevel@tonic-gate    require Config; import Config;
7*0Sstevel@tonic-gate}
8*0Sstevel@tonic-gateuse strict;
9*0Sstevel@tonic-gateuse warnings;
10*0Sstevel@tonic-gateuse Getopt::Std;
11*0Sstevel@tonic-gatemy @orig_ARGV = @ARGV;
12*0Sstevel@tonic-gateour $VERSION  = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
13*0Sstevel@tonic-gate
14*0Sstevel@tonic-gate# These may get re-ordered.
15*0Sstevel@tonic-gate# RAW is a do_now as inserted by &enter
16*0Sstevel@tonic-gate# AGG is an aggreagated do_now, as built up by &process
17*0Sstevel@tonic-gate
18*0Sstevel@tonic-gateuse constant {
19*0Sstevel@tonic-gate  RAW_NEXT => 0,
20*0Sstevel@tonic-gate  RAW_IN_LEN => 1,
21*0Sstevel@tonic-gate  RAW_OUT_BYTES => 2,
22*0Sstevel@tonic-gate  RAW_FALLBACK => 3,
23*0Sstevel@tonic-gate
24*0Sstevel@tonic-gate  AGG_MIN_IN => 0,
25*0Sstevel@tonic-gate  AGG_MAX_IN => 1,
26*0Sstevel@tonic-gate  AGG_OUT_BYTES => 2,
27*0Sstevel@tonic-gate  AGG_NEXT => 3,
28*0Sstevel@tonic-gate  AGG_IN_LEN => 4,
29*0Sstevel@tonic-gate  AGG_OUT_LEN => 5,
30*0Sstevel@tonic-gate  AGG_FALLBACK => 6,
31*0Sstevel@tonic-gate};
32*0Sstevel@tonic-gate
33*0Sstevel@tonic-gate# (See the algorithm in encengine.c - we're building structures for it)
34*0Sstevel@tonic-gate
35*0Sstevel@tonic-gate# There are two sorts of structures.
36*0Sstevel@tonic-gate# "do_now" (an array, two variants of what needs storing) is whatever we need
37*0Sstevel@tonic-gate# to do now we've read an input byte.
38*0Sstevel@tonic-gate# It's housed in a "do_next" (which is how we got to it), and in turn points
39*0Sstevel@tonic-gate# to a "do_next" which contains all the "do_now"s for the next input byte.
40*0Sstevel@tonic-gate
41*0Sstevel@tonic-gate# There will be a "do_next" which is the start state.
42*0Sstevel@tonic-gate# For a single byte encoding it's the only "do_next" - each "do_now" points
43*0Sstevel@tonic-gate# back to it, and each "do_now" will cause bytes. There is no state.
44*0Sstevel@tonic-gate
45*0Sstevel@tonic-gate# For a multi-byte encoding where all characters in the input are the same
46*0Sstevel@tonic-gate# length, then there will be a tree of "do_now"->"do_next"->"do_now"
47*0Sstevel@tonic-gate# branching out from the start state, one step for each input byte.
48*0Sstevel@tonic-gate# The leaf "do_now"s will all be at the same distance from the start state,
49*0Sstevel@tonic-gate# only the leaf "do_now"s cause output bytes, and they in turn point back to
50*0Sstevel@tonic-gate# the start state.
51*0Sstevel@tonic-gate
52*0Sstevel@tonic-gate# For an encoding where there are varaible length input byte sequences, you
53*0Sstevel@tonic-gate# will encounter a leaf "do_now" sooner for the shorter input sequences, but
54*0Sstevel@tonic-gate# as before the leaves will point back to the start state.
55*0Sstevel@tonic-gate
56*0Sstevel@tonic-gate# The system will cope with escape encodings (imagine them as a mostly
57*0Sstevel@tonic-gate# self-contained tree for each escape state, and cross links between trees
58*0Sstevel@tonic-gate# at the state-switching characters) but so far no input format defines these.
59*0Sstevel@tonic-gate
60*0Sstevel@tonic-gate# The system will also cope with having output "leaves" in the middle of
61*0Sstevel@tonic-gate# the bifurcating branches, not just at the extremities, but again no
62*0Sstevel@tonic-gate# input format does this yet.
63*0Sstevel@tonic-gate
64*0Sstevel@tonic-gate# There are two variants of the "do_now" structure. The first, smaller variant
65*0Sstevel@tonic-gate# is generated by &enter as the input file is read. There is one structure
66*0Sstevel@tonic-gate# for each input byte. Say we are mapping a single byte encoding to a
67*0Sstevel@tonic-gate# single byte encoding, with  "ABCD" going "abcd". There will be
68*0Sstevel@tonic-gate# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
69*0Sstevel@tonic-gate
70*0Sstevel@tonic-gate# &process then walks the tree, building aggregate "do_now" structres for
71*0Sstevel@tonic-gate# adjacent bytes where possible. The aggregate is for a contiguous range of
72*0Sstevel@tonic-gate# bytes which each produce the same length of output, each move to the
73*0Sstevel@tonic-gate# same next state, and each have the same fallback flag.
74*0Sstevel@tonic-gate# So our 4 RAW "do_now"s above become replaced by a single structure
75*0Sstevel@tonic-gate# containing:
76*0Sstevel@tonic-gate# ["A", "D", "abcd", 1, ...]
77*0Sstevel@tonic-gate# ie, for an input byte $_ in "A".."D", output 1 byte, found as
78*0Sstevel@tonic-gate# substr ("abcd", (ord $_ - ord "A") * 1, 1)
79*0Sstevel@tonic-gate# which maps very nicely into pointer arithmetic in C for encengine.c
80*0Sstevel@tonic-gate
81*0Sstevel@tonic-gatesub encode_U
82*0Sstevel@tonic-gate{
83*0Sstevel@tonic-gate # UTF-8 encode long hand - only covers part of perl's range
84*0Sstevel@tonic-gate ## my $uv = shift;
85*0Sstevel@tonic-gate # chr() works in native space so convert value from table
86*0Sstevel@tonic-gate # into that space before using chr().
87*0Sstevel@tonic-gate my $ch = chr(utf8::unicode_to_native($_[0]));
88*0Sstevel@tonic-gate # Now get core perl to encode that the way it likes.
89*0Sstevel@tonic-gate utf8::encode($ch);
90*0Sstevel@tonic-gate return $ch;
91*0Sstevel@tonic-gate}
92*0Sstevel@tonic-gate
93*0Sstevel@tonic-gatesub encode_S
94*0Sstevel@tonic-gate{
95*0Sstevel@tonic-gate # encode single byte
96*0Sstevel@tonic-gate ## my ($ch,$page) = @_; return chr($ch);
97*0Sstevel@tonic-gate return chr $_[0];
98*0Sstevel@tonic-gate}
99*0Sstevel@tonic-gate
100*0Sstevel@tonic-gatesub encode_D
101*0Sstevel@tonic-gate{
102*0Sstevel@tonic-gate # encode double byte MS byte first
103*0Sstevel@tonic-gate ## my ($ch,$page) = @_; return chr($page).chr($ch);
104*0Sstevel@tonic-gate return chr ($_[1]) . chr $_[0];
105*0Sstevel@tonic-gate}
106*0Sstevel@tonic-gate
107*0Sstevel@tonic-gatesub encode_M
108*0Sstevel@tonic-gate{
109*0Sstevel@tonic-gate # encode Multi-byte - single for 0..255 otherwise double
110*0Sstevel@tonic-gate ## my ($ch,$page) = @_;
111*0Sstevel@tonic-gate ## return &encode_D if $page;
112*0Sstevel@tonic-gate ## return &encode_S;
113*0Sstevel@tonic-gate return chr ($_[1]) . chr $_[0] if $_[1];
114*0Sstevel@tonic-gate return chr $_[0];
115*0Sstevel@tonic-gate}
116*0Sstevel@tonic-gate
117*0Sstevel@tonic-gatemy %encode_types = (U => \&encode_U,
118*0Sstevel@tonic-gate                    S => \&encode_S,
119*0Sstevel@tonic-gate                    D => \&encode_D,
120*0Sstevel@tonic-gate                    M => \&encode_M,
121*0Sstevel@tonic-gate                   );
122*0Sstevel@tonic-gate
123*0Sstevel@tonic-gate# Win32 does not expand globs on command line
124*0Sstevel@tonic-gateeval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
125*0Sstevel@tonic-gate
126*0Sstevel@tonic-gatemy %opt;
127*0Sstevel@tonic-gate# I think these are:
128*0Sstevel@tonic-gate# -Q to disable the duplicate codepoint test
129*0Sstevel@tonic-gate# -S make mapping errors fatal
130*0Sstevel@tonic-gate# -q to remove comments written to output files
131*0Sstevel@tonic-gate# -O to enable the (brute force) substring optimiser
132*0Sstevel@tonic-gate# -o <output> to specify the output file name (else it's the first arg)
133*0Sstevel@tonic-gate# -f <inlist> to give a file with a list of input files (else use the args)
134*0Sstevel@tonic-gate# -n <name> to name the encoding (else use the basename of the input file.
135*0Sstevel@tonic-gategetopts('CM:SQqOo:f:n:',\%opt);
136*0Sstevel@tonic-gate
137*0Sstevel@tonic-gate$opt{M} and make_makefile_pl($opt{M}, @ARGV);
138*0Sstevel@tonic-gate$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
139*0Sstevel@tonic-gate
140*0Sstevel@tonic-gate# This really should go first, else the die here causes empty (non-erroneous)
141*0Sstevel@tonic-gate# output files to be written.
142*0Sstevel@tonic-gatemy @encfiles;
143*0Sstevel@tonic-gateif (exists $opt{'f'}) {
144*0Sstevel@tonic-gate    # -F is followed by name of file containing list of filenames
145*0Sstevel@tonic-gate    my $flist = $opt{'f'};
146*0Sstevel@tonic-gate    open(FLIST,$flist) || die "Cannot open $flist:$!";
147*0Sstevel@tonic-gate    chomp(@encfiles = <FLIST>);
148*0Sstevel@tonic-gate    close(FLIST);
149*0Sstevel@tonic-gate} else {
150*0Sstevel@tonic-gate    @encfiles = @ARGV;
151*0Sstevel@tonic-gate}
152*0Sstevel@tonic-gate
153*0Sstevel@tonic-gatemy $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
154*0Sstevel@tonic-gatechmod(0666,$cname) if -f $cname && !-w $cname;
155*0Sstevel@tonic-gateopen(C,">$cname") || die "Cannot open $cname:$!";
156*0Sstevel@tonic-gate
157*0Sstevel@tonic-gatemy $dname = $cname;
158*0Sstevel@tonic-gatemy $hname = $cname;
159*0Sstevel@tonic-gate
160*0Sstevel@tonic-gatemy ($doC,$doEnc,$doUcm,$doPet);
161*0Sstevel@tonic-gate
162*0Sstevel@tonic-gateif ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
163*0Sstevel@tonic-gate {
164*0Sstevel@tonic-gate  $doC = 1;
165*0Sstevel@tonic-gate  $dname =~ s/(\.[^\.]*)?$/.exh/;
166*0Sstevel@tonic-gate  chmod(0666,$dname) if -f $cname && !-w $dname;
167*0Sstevel@tonic-gate  open(D,">$dname") || die "Cannot open $dname:$!";
168*0Sstevel@tonic-gate  $hname =~ s/(\.[^\.]*)?$/.h/;
169*0Sstevel@tonic-gate  chmod(0666,$hname) if -f $cname && !-w $hname;
170*0Sstevel@tonic-gate  open(H,">$hname") || die "Cannot open $hname:$!";
171*0Sstevel@tonic-gate
172*0Sstevel@tonic-gate  foreach my $fh (\*C,\*D,\*H)
173*0Sstevel@tonic-gate  {
174*0Sstevel@tonic-gate   print $fh <<"END" unless $opt{'q'};
175*0Sstevel@tonic-gate/*
176*0Sstevel@tonic-gate !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
177*0Sstevel@tonic-gate This file was autogenerated by:
178*0Sstevel@tonic-gate $^X $0 @orig_ARGV
179*0Sstevel@tonic-gate*/
180*0Sstevel@tonic-gateEND
181*0Sstevel@tonic-gate  }
182*0Sstevel@tonic-gate
183*0Sstevel@tonic-gate  if ($cname =~ /(\w+)\.xs$/)
184*0Sstevel@tonic-gate   {
185*0Sstevel@tonic-gate    print C "#include <EXTERN.h>\n";
186*0Sstevel@tonic-gate    print C "#include <perl.h>\n";
187*0Sstevel@tonic-gate    print C "#include <XSUB.h>\n";
188*0Sstevel@tonic-gate    print C "#define U8 U8\n";
189*0Sstevel@tonic-gate   }
190*0Sstevel@tonic-gate  print C "#include \"encode.h\"\n\n";
191*0Sstevel@tonic-gate
192*0Sstevel@tonic-gate }
193*0Sstevel@tonic-gateelsif ($cname =~ /\.enc$/)
194*0Sstevel@tonic-gate {
195*0Sstevel@tonic-gate  $doEnc = 1;
196*0Sstevel@tonic-gate }
197*0Sstevel@tonic-gateelsif ($cname =~ /\.ucm$/)
198*0Sstevel@tonic-gate {
199*0Sstevel@tonic-gate  $doUcm = 1;
200*0Sstevel@tonic-gate }
201*0Sstevel@tonic-gateelsif ($cname =~ /\.pet$/)
202*0Sstevel@tonic-gate {
203*0Sstevel@tonic-gate  $doPet = 1;
204*0Sstevel@tonic-gate }
205*0Sstevel@tonic-gate
206*0Sstevel@tonic-gatemy %encoding;
207*0Sstevel@tonic-gatemy %strings;
208*0Sstevel@tonic-gatemy $string_acc;
209*0Sstevel@tonic-gatemy %strings_in_acc;
210*0Sstevel@tonic-gate
211*0Sstevel@tonic-gatemy $saved = 0;
212*0Sstevel@tonic-gatemy $subsave = 0;
213*0Sstevel@tonic-gatemy $strings = 0;
214*0Sstevel@tonic-gate
215*0Sstevel@tonic-gatesub cmp_name
216*0Sstevel@tonic-gate{
217*0Sstevel@tonic-gate if ($a =~ /^.*-(\d+)/)
218*0Sstevel@tonic-gate  {
219*0Sstevel@tonic-gate   my $an = $1;
220*0Sstevel@tonic-gate   if ($b =~ /^.*-(\d+)/)
221*0Sstevel@tonic-gate    {
222*0Sstevel@tonic-gate     my $r = $an <=> $1;
223*0Sstevel@tonic-gate     return $r if $r;
224*0Sstevel@tonic-gate    }
225*0Sstevel@tonic-gate  }
226*0Sstevel@tonic-gate return $a cmp $b;
227*0Sstevel@tonic-gate}
228*0Sstevel@tonic-gate
229*0Sstevel@tonic-gate
230*0Sstevel@tonic-gateforeach my $enc (sort cmp_name @encfiles)
231*0Sstevel@tonic-gate {
232*0Sstevel@tonic-gate  my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
233*0Sstevel@tonic-gate  $name = $opt{'n'} if exists $opt{'n'};
234*0Sstevel@tonic-gate  if (open(E,$enc))
235*0Sstevel@tonic-gate   {
236*0Sstevel@tonic-gate    if ($sfx eq 'enc')
237*0Sstevel@tonic-gate     {
238*0Sstevel@tonic-gate      compile_enc(\*E,lc($name));
239*0Sstevel@tonic-gate     }
240*0Sstevel@tonic-gate    else
241*0Sstevel@tonic-gate     {
242*0Sstevel@tonic-gate      compile_ucm(\*E,lc($name));
243*0Sstevel@tonic-gate     }
244*0Sstevel@tonic-gate   }
245*0Sstevel@tonic-gate  else
246*0Sstevel@tonic-gate   {
247*0Sstevel@tonic-gate    warn "Cannot open $enc for $name:$!";
248*0Sstevel@tonic-gate   }
249*0Sstevel@tonic-gate }
250*0Sstevel@tonic-gate
251*0Sstevel@tonic-gateif ($doC)
252*0Sstevel@tonic-gate {
253*0Sstevel@tonic-gate  print STDERR "Writing compiled form\n";
254*0Sstevel@tonic-gate  foreach my $name (sort cmp_name keys %encoding)
255*0Sstevel@tonic-gate   {
256*0Sstevel@tonic-gate    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
257*0Sstevel@tonic-gate    process($name.'_utf8',$e2u);
258*0Sstevel@tonic-gate    addstrings(\*C,$e2u);
259*0Sstevel@tonic-gate
260*0Sstevel@tonic-gate    process('utf8_'.$name,$u2e);
261*0Sstevel@tonic-gate    addstrings(\*C,$u2e);
262*0Sstevel@tonic-gate   }
263*0Sstevel@tonic-gate  outbigstring(\*C,"enctable");
264*0Sstevel@tonic-gate  foreach my $name (sort cmp_name keys %encoding)
265*0Sstevel@tonic-gate   {
266*0Sstevel@tonic-gate    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
267*0Sstevel@tonic-gate    outtable(\*C,$e2u, "enctable");
268*0Sstevel@tonic-gate    outtable(\*C,$u2e, "enctable");
269*0Sstevel@tonic-gate
270*0Sstevel@tonic-gate    # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
271*0Sstevel@tonic-gate   }
272*0Sstevel@tonic-gate  foreach my $enc (sort cmp_name keys %encoding)
273*0Sstevel@tonic-gate   {
274*0Sstevel@tonic-gate    # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
275*0Sstevel@tonic-gate    my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
276*0Sstevel@tonic-gate    #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
277*0Sstevel@tonic-gate    my $replen = 0;
278*0Sstevel@tonic-gate    $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
279*0Sstevel@tonic-gate    my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
280*0Sstevel@tonic-gate    my $sym = "${enc}_encoding";
281*0Sstevel@tonic-gate    $sym =~ s/\W+/_/g;
282*0Sstevel@tonic-gate    print C "encode_t $sym = \n";
283*0Sstevel@tonic-gate    # This is to make null encoding work -- dankogai
284*0Sstevel@tonic-gate    for (my $i = (scalar @info) - 1;  $i >= 0; --$i){
285*0Sstevel@tonic-gate	$info[$i] ||= 1;
286*0Sstevel@tonic-gate    }
287*0Sstevel@tonic-gate    # end of null tweak -- dankogai
288*0Sstevel@tonic-gate    print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
289*0Sstevel@tonic-gate   }
290*0Sstevel@tonic-gate
291*0Sstevel@tonic-gate  foreach my $enc (sort cmp_name keys %encoding)
292*0Sstevel@tonic-gate   {
293*0Sstevel@tonic-gate    my $sym = "${enc}_encoding";
294*0Sstevel@tonic-gate    $sym =~ s/\W+/_/g;
295*0Sstevel@tonic-gate    print H "extern encode_t $sym;\n";
296*0Sstevel@tonic-gate    print D " Encode_XSEncoding(aTHX_ &$sym);\n";
297*0Sstevel@tonic-gate   }
298*0Sstevel@tonic-gate
299*0Sstevel@tonic-gate  if ($cname =~ /(\w+)\.xs$/)
300*0Sstevel@tonic-gate   {
301*0Sstevel@tonic-gate    my $mod = $1;
302*0Sstevel@tonic-gate    print C <<'END';
303*0Sstevel@tonic-gate
304*0Sstevel@tonic-gatestatic void
305*0Sstevel@tonic-gateEncode_XSEncoding(pTHX_ encode_t *enc)
306*0Sstevel@tonic-gate{
307*0Sstevel@tonic-gate dSP;
308*0Sstevel@tonic-gate HV *stash = gv_stashpv("Encode::XS", TRUE);
309*0Sstevel@tonic-gate SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
310*0Sstevel@tonic-gate int i = 0;
311*0Sstevel@tonic-gate PUSHMARK(sp);
312*0Sstevel@tonic-gate XPUSHs(sv);
313*0Sstevel@tonic-gate while (enc->name[i])
314*0Sstevel@tonic-gate  {
315*0Sstevel@tonic-gate   const char *name = enc->name[i++];
316*0Sstevel@tonic-gate   XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
317*0Sstevel@tonic-gate  }
318*0Sstevel@tonic-gate PUTBACK;
319*0Sstevel@tonic-gate call_pv("Encode::define_encoding",G_DISCARD);
320*0Sstevel@tonic-gate SvREFCNT_dec(sv);
321*0Sstevel@tonic-gate}
322*0Sstevel@tonic-gate
323*0Sstevel@tonic-gateEND
324*0Sstevel@tonic-gate
325*0Sstevel@tonic-gate    print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
326*0Sstevel@tonic-gate    print C "BOOT:\n{\n";
327*0Sstevel@tonic-gate    print C "#include \"$dname\"\n";
328*0Sstevel@tonic-gate    print C "}\n";
329*0Sstevel@tonic-gate   }
330*0Sstevel@tonic-gate  # Close in void context is bad, m'kay
331*0Sstevel@tonic-gate  close(D) or warn "Error closing '$dname': $!";
332*0Sstevel@tonic-gate  close(H) or warn "Error closing '$hname': $!";
333*0Sstevel@tonic-gate
334*0Sstevel@tonic-gate  my $perc_saved    = $strings/($strings + $saved) * 100;
335*0Sstevel@tonic-gate  my $perc_subsaved = $strings/($strings + $subsave) * 100;
336*0Sstevel@tonic-gate  printf STDERR "%d bytes in string tables\n",$strings;
337*0Sstevel@tonic-gate  printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
338*0Sstevel@tonic-gate    $saved, $perc_saved              if $saved;
339*0Sstevel@tonic-gate  printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
340*0Sstevel@tonic-gate    $subsave, $perc_subsaved         if $subsave;
341*0Sstevel@tonic-gate }
342*0Sstevel@tonic-gateelsif ($doEnc)
343*0Sstevel@tonic-gate {
344*0Sstevel@tonic-gate  foreach my $name (sort cmp_name keys %encoding)
345*0Sstevel@tonic-gate   {
346*0Sstevel@tonic-gate    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
347*0Sstevel@tonic-gate    output_enc(\*C,$name,$e2u);
348*0Sstevel@tonic-gate   }
349*0Sstevel@tonic-gate }
350*0Sstevel@tonic-gateelsif ($doUcm)
351*0Sstevel@tonic-gate {
352*0Sstevel@tonic-gate  foreach my $name (sort cmp_name keys %encoding)
353*0Sstevel@tonic-gate   {
354*0Sstevel@tonic-gate    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
355*0Sstevel@tonic-gate    output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
356*0Sstevel@tonic-gate   }
357*0Sstevel@tonic-gate }
358*0Sstevel@tonic-gate
359*0Sstevel@tonic-gate# writing half meg files and then not checking to see if you just filled the
360*0Sstevel@tonic-gate# disk is bad, m'kay
361*0Sstevel@tonic-gateclose(C) or die "Error closing '$cname': $!";
362*0Sstevel@tonic-gate
363*0Sstevel@tonic-gate# End of the main program.
364*0Sstevel@tonic-gate
365*0Sstevel@tonic-gatesub compile_ucm
366*0Sstevel@tonic-gate{
367*0Sstevel@tonic-gate my ($fh,$name) = @_;
368*0Sstevel@tonic-gate my $e2u = {};
369*0Sstevel@tonic-gate my $u2e = {};
370*0Sstevel@tonic-gate my $cs;
371*0Sstevel@tonic-gate my %attr;
372*0Sstevel@tonic-gate while (<$fh>)
373*0Sstevel@tonic-gate  {
374*0Sstevel@tonic-gate   s/#.*$//;
375*0Sstevel@tonic-gate   last if /^\s*CHARMAP\s*$/i;
376*0Sstevel@tonic-gate   if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
377*0Sstevel@tonic-gate    {
378*0Sstevel@tonic-gate     $attr{$1} = $2;
379*0Sstevel@tonic-gate    }
380*0Sstevel@tonic-gate  }
381*0Sstevel@tonic-gate if (!defined($cs =  $attr{'code_set_name'}))
382*0Sstevel@tonic-gate  {
383*0Sstevel@tonic-gate   warn "No <code_set_name> in $name\n";
384*0Sstevel@tonic-gate  }
385*0Sstevel@tonic-gate else
386*0Sstevel@tonic-gate  {
387*0Sstevel@tonic-gate   $name = $cs unless exists $opt{'n'};
388*0Sstevel@tonic-gate  }
389*0Sstevel@tonic-gate my $erep;
390*0Sstevel@tonic-gate my $urep;
391*0Sstevel@tonic-gate my $max_el;
392*0Sstevel@tonic-gate my $min_el;
393*0Sstevel@tonic-gate if (exists $attr{'subchar'})
394*0Sstevel@tonic-gate  {
395*0Sstevel@tonic-gate   #my @byte;
396*0Sstevel@tonic-gate   #$attr{'subchar'} =~ /^\s*/cg;
397*0Sstevel@tonic-gate   #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
398*0Sstevel@tonic-gate   #$erep = join('',map(chr(hex($_)),@byte));
399*0Sstevel@tonic-gate   $erep = $attr{'subchar'};
400*0Sstevel@tonic-gate   $erep =~ s/^\s+//; $erep =~ s/\s+$//;
401*0Sstevel@tonic-gate  }
402*0Sstevel@tonic-gate print "Reading $name ($cs)\n";
403*0Sstevel@tonic-gate my $nfb = 0;
404*0Sstevel@tonic-gate my $hfb = 0;
405*0Sstevel@tonic-gate while (<$fh>)
406*0Sstevel@tonic-gate  {
407*0Sstevel@tonic-gate   s/#.*$//;
408*0Sstevel@tonic-gate   last if /^\s*END\s+CHARMAP\s*$/i;
409*0Sstevel@tonic-gate   next if /^\s*$/;
410*0Sstevel@tonic-gate   my (@uni, @byte) = ();
411*0Sstevel@tonic-gate   my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
412*0Sstevel@tonic-gate       or die "Bad line: $_";
413*0Sstevel@tonic-gate   while ($uni =~  m/\G<([U0-9a-fA-F\+]+)>/g){
414*0Sstevel@tonic-gate       push @uni, map { substr($_, 1) } split(/\+/, $1);
415*0Sstevel@tonic-gate   }
416*0Sstevel@tonic-gate   while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
417*0Sstevel@tonic-gate       push @byte, $1;
418*0Sstevel@tonic-gate   }
419*0Sstevel@tonic-gate   if (@uni)
420*0Sstevel@tonic-gate    {
421*0Sstevel@tonic-gate     my $uch =  join('', map { encode_U(hex($_)) } @uni );
422*0Sstevel@tonic-gate     my $ech = join('',map(chr(hex($_)),@byte));
423*0Sstevel@tonic-gate     my $el  = length($ech);
424*0Sstevel@tonic-gate     $max_el = $el if (!defined($max_el) || $el > $max_el);
425*0Sstevel@tonic-gate     $min_el = $el if (!defined($min_el) || $el < $min_el);
426*0Sstevel@tonic-gate     if (length($fb))
427*0Sstevel@tonic-gate      {
428*0Sstevel@tonic-gate       $fb = substr($fb,1);
429*0Sstevel@tonic-gate       $hfb++;
430*0Sstevel@tonic-gate      }
431*0Sstevel@tonic-gate     else
432*0Sstevel@tonic-gate      {
433*0Sstevel@tonic-gate       $nfb++;
434*0Sstevel@tonic-gate       $fb = '0';
435*0Sstevel@tonic-gate      }
436*0Sstevel@tonic-gate     # $fb is fallback flag
437*0Sstevel@tonic-gate     # 0 - round trip safe
438*0Sstevel@tonic-gate     # 1 - fallback for unicode -> enc
439*0Sstevel@tonic-gate     # 2 - skip sub-char mapping
440*0Sstevel@tonic-gate     # 3 - fallback enc -> unicode
441*0Sstevel@tonic-gate     enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
442*0Sstevel@tonic-gate     enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
443*0Sstevel@tonic-gate    }
444*0Sstevel@tonic-gate   else
445*0Sstevel@tonic-gate    {
446*0Sstevel@tonic-gate     warn $_;
447*0Sstevel@tonic-gate    }
448*0Sstevel@tonic-gate  }
449*0Sstevel@tonic-gate if ($nfb && $hfb)
450*0Sstevel@tonic-gate  {
451*0Sstevel@tonic-gate   die "$nfb entries without fallback, $hfb entries with\n";
452*0Sstevel@tonic-gate  }
453*0Sstevel@tonic-gate $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
454*0Sstevel@tonic-gate}
455*0Sstevel@tonic-gate
456*0Sstevel@tonic-gate
457*0Sstevel@tonic-gate
458*0Sstevel@tonic-gatesub compile_enc
459*0Sstevel@tonic-gate{
460*0Sstevel@tonic-gate my ($fh,$name) = @_;
461*0Sstevel@tonic-gate my $e2u = {};
462*0Sstevel@tonic-gate my $u2e = {};
463*0Sstevel@tonic-gate
464*0Sstevel@tonic-gate my $type;
465*0Sstevel@tonic-gate while ($type = <$fh>)
466*0Sstevel@tonic-gate  {
467*0Sstevel@tonic-gate   last if $type !~ /^\s*#/;
468*0Sstevel@tonic-gate  }
469*0Sstevel@tonic-gate chomp($type);
470*0Sstevel@tonic-gate return if $type eq 'E';
471*0Sstevel@tonic-gate # Do the hash lookup once, rather than once per function call. 4% speedup.
472*0Sstevel@tonic-gate my $type_func = $encode_types{$type};
473*0Sstevel@tonic-gate my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
474*0Sstevel@tonic-gate warn "$type encoded $name\n";
475*0Sstevel@tonic-gate my $rep = '';
476*0Sstevel@tonic-gate # Save a defined test by setting these to defined values.
477*0Sstevel@tonic-gate my $min_el = ~0; # A very big integer
478*0Sstevel@tonic-gate my $max_el = 0;  # Anything must be longer than 0
479*0Sstevel@tonic-gate {
480*0Sstevel@tonic-gate  my $v = hex($def);
481*0Sstevel@tonic-gate  $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
482*0Sstevel@tonic-gate }
483*0Sstevel@tonic-gate my $errors;
484*0Sstevel@tonic-gate my $seen;
485*0Sstevel@tonic-gate # use -Q to silence the seen test. Makefile.PL uses this by default.
486*0Sstevel@tonic-gate $seen = {} unless $opt{Q};
487*0Sstevel@tonic-gate do
488*0Sstevel@tonic-gate  {
489*0Sstevel@tonic-gate   my $line = <$fh>;
490*0Sstevel@tonic-gate   chomp($line);
491*0Sstevel@tonic-gate   my $page = hex($line);
492*0Sstevel@tonic-gate   my $ch = 0;
493*0Sstevel@tonic-gate   my $i = 16;
494*0Sstevel@tonic-gate   do
495*0Sstevel@tonic-gate    {
496*0Sstevel@tonic-gate     # So why is it 1% faster to leave the my here?
497*0Sstevel@tonic-gate     my $line = <$fh>;
498*0Sstevel@tonic-gate     $line =~ s/\r\n$/\n/;
499*0Sstevel@tonic-gate     die "$.:${line}Line should be exactly 65 characters long including
500*0Sstevel@tonic-gate     newline (".length($line).")" unless length ($line) == 65;
501*0Sstevel@tonic-gate     # Split line into groups of 4 hex digits, convert groups to ints
502*0Sstevel@tonic-gate     # This takes 65.35
503*0Sstevel@tonic-gate     # map {hex $_} $line =~ /(....)/g
504*0Sstevel@tonic-gate     # This takes 63.75 (2.5% less time)
505*0Sstevel@tonic-gate     # unpack "n*", pack "H*", $line
506*0Sstevel@tonic-gate     # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
507*0Sstevel@tonic-gate     # Doing it as while ($line =~ /(....)/g) took 74.63
508*0Sstevel@tonic-gate     foreach my $val (unpack "n*", pack "H*", $line)
509*0Sstevel@tonic-gate      {
510*0Sstevel@tonic-gate       next if $val == 0xFFFD;
511*0Sstevel@tonic-gate       my $ech = &$type_func($ch,$page);
512*0Sstevel@tonic-gate       if ($val || (!$ch && !$page))
513*0Sstevel@tonic-gate        {
514*0Sstevel@tonic-gate         my $el  = length($ech);
515*0Sstevel@tonic-gate         $max_el = $el if $el > $max_el;
516*0Sstevel@tonic-gate         $min_el = $el if $el < $min_el;
517*0Sstevel@tonic-gate         my $uch = encode_U($val);
518*0Sstevel@tonic-gate         if ($seen) {
519*0Sstevel@tonic-gate           # We're doing the test.
520*0Sstevel@tonic-gate           # We don't need to read this quickly, so storing it as a scalar,
521*0Sstevel@tonic-gate           # rather than 3 (anon array, plus the 2 scalars it holds) saves
522*0Sstevel@tonic-gate           # RAM and may make us faster on low RAM systems. [see __END__]
523*0Sstevel@tonic-gate           if (exists $seen->{$uch})
524*0Sstevel@tonic-gate             {
525*0Sstevel@tonic-gate               warn sprintf("U%04X is %02X%02X and %04X\n",
526*0Sstevel@tonic-gate                            $val,$page,$ch,$seen->{$uch});
527*0Sstevel@tonic-gate               $errors++;
528*0Sstevel@tonic-gate             }
529*0Sstevel@tonic-gate           else
530*0Sstevel@tonic-gate             {
531*0Sstevel@tonic-gate               $seen->{$uch} = $page << 8 | $ch;
532*0Sstevel@tonic-gate             }
533*0Sstevel@tonic-gate         }
534*0Sstevel@tonic-gate         # Passing 2 extra args each time is 3.6% slower!
535*0Sstevel@tonic-gate         # Even with having to add $fallback ||= 0 later
536*0Sstevel@tonic-gate         enter_fb0($e2u,$ech,$uch);
537*0Sstevel@tonic-gate         enter_fb0($u2e,$uch,$ech);
538*0Sstevel@tonic-gate        }
539*0Sstevel@tonic-gate       else
540*0Sstevel@tonic-gate        {
541*0Sstevel@tonic-gate         # No character at this position
542*0Sstevel@tonic-gate         # enter($e2u,$ech,undef,$e2u);
543*0Sstevel@tonic-gate        }
544*0Sstevel@tonic-gate       $ch++;
545*0Sstevel@tonic-gate      }
546*0Sstevel@tonic-gate    } while --$i;
547*0Sstevel@tonic-gate  } while --$pages;
548*0Sstevel@tonic-gate die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
549*0Sstevel@tonic-gate   if $min_el > $max_el;
550*0Sstevel@tonic-gate die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
551*0Sstevel@tonic-gate $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
552*0Sstevel@tonic-gate}
553*0Sstevel@tonic-gate
554*0Sstevel@tonic-gate# my ($a,$s,$d,$t,$fb) = @_;
555*0Sstevel@tonic-gatesub enter {
556*0Sstevel@tonic-gate  my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
557*0Sstevel@tonic-gate  # state we shift to after this (multibyte) input character defaults to same
558*0Sstevel@tonic-gate  # as current state.
559*0Sstevel@tonic-gate  $next ||= $current;
560*0Sstevel@tonic-gate  # Making sure it is defined seems to be faster than {no warnings;} in
561*0Sstevel@tonic-gate  # &process, or passing it in as 0 explicity.
562*0Sstevel@tonic-gate  # XXX $fallback ||= 0;
563*0Sstevel@tonic-gate
564*0Sstevel@tonic-gate  # Start at the beginning and work forwards through the string to zero.
565*0Sstevel@tonic-gate  # effectively we are removing 1 character from the front each time
566*0Sstevel@tonic-gate  # but we don't actually edit the string. [this alone seems to be 14% speedup]
567*0Sstevel@tonic-gate  # Hence -$pos is the length of the remaining string.
568*0Sstevel@tonic-gate  my $pos = -length $inbytes;
569*0Sstevel@tonic-gate  while (1) {
570*0Sstevel@tonic-gate    my $byte = substr $inbytes, $pos, 1;
571*0Sstevel@tonic-gate    #  RAW_NEXT => 0,
572*0Sstevel@tonic-gate    #  RAW_IN_LEN => 1,
573*0Sstevel@tonic-gate    #  RAW_OUT_BYTES => 2,
574*0Sstevel@tonic-gate    #  RAW_FALLBACK => 3,
575*0Sstevel@tonic-gate    # to unicode an array would seem to be better, because the pages are dense.
576*0Sstevel@tonic-gate    # from unicode can be very sparse, favouring a hash.
577*0Sstevel@tonic-gate    # hash using the bytes (all length 1) as keys rather than ord value,
578*0Sstevel@tonic-gate    # as it's easier to sort these in &process.
579*0Sstevel@tonic-gate
580*0Sstevel@tonic-gate    # It's faster to always add $fallback even if it's undef, rather than
581*0Sstevel@tonic-gate    # choosing between 3 and 4 element array. (hence why we set it defined
582*0Sstevel@tonic-gate    # above)
583*0Sstevel@tonic-gate    my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
584*0Sstevel@tonic-gate    # When $pos was -1 we were at the last input character.
585*0Sstevel@tonic-gate    unless (++$pos) {
586*0Sstevel@tonic-gate      $do_now->[RAW_OUT_BYTES] = $outbytes;
587*0Sstevel@tonic-gate      $do_now->[RAW_NEXT] = $next;
588*0Sstevel@tonic-gate      return;
589*0Sstevel@tonic-gate    }
590*0Sstevel@tonic-gate    # Tail recursion. The intermdiate state may not have a name yet.
591*0Sstevel@tonic-gate    $current = $do_now->[RAW_NEXT];
592*0Sstevel@tonic-gate  }
593*0Sstevel@tonic-gate}
594*0Sstevel@tonic-gate
595*0Sstevel@tonic-gate# This is purely for optimistation. It's just &enter hard coded for $fallback
596*0Sstevel@tonic-gate# of 0, using only a 3 entry array ref to save memory for every entry.
597*0Sstevel@tonic-gatesub enter_fb0 {
598*0Sstevel@tonic-gate  my ($current,$inbytes,$outbytes,$next) = @_;
599*0Sstevel@tonic-gate  $next ||= $current;
600*0Sstevel@tonic-gate
601*0Sstevel@tonic-gate  my $pos = -length $inbytes;
602*0Sstevel@tonic-gate  while (1) {
603*0Sstevel@tonic-gate    my $byte = substr $inbytes, $pos, 1;
604*0Sstevel@tonic-gate    my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
605*0Sstevel@tonic-gate    unless (++$pos) {
606*0Sstevel@tonic-gate      $do_now->[RAW_OUT_BYTES] = $outbytes;
607*0Sstevel@tonic-gate      $do_now->[RAW_NEXT] = $next;
608*0Sstevel@tonic-gate      return;
609*0Sstevel@tonic-gate    }
610*0Sstevel@tonic-gate    $current = $do_now->[RAW_NEXT];
611*0Sstevel@tonic-gate  }
612*0Sstevel@tonic-gate}
613*0Sstevel@tonic-gate
614*0Sstevel@tonic-gatesub process
615*0Sstevel@tonic-gate{
616*0Sstevel@tonic-gate  my ($name,$a) = @_;
617*0Sstevel@tonic-gate  $name =~ s/\W+/_/g;
618*0Sstevel@tonic-gate  $a->{Cname} = $name;
619*0Sstevel@tonic-gate  my $raw = $a->{Raw};
620*0Sstevel@tonic-gate  my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
621*0Sstevel@tonic-gate  my @ent;
622*0Sstevel@tonic-gate  $agg_max_in = 0;
623*0Sstevel@tonic-gate  foreach my $key (sort keys %$raw) {
624*0Sstevel@tonic-gate    #  RAW_NEXT => 0,
625*0Sstevel@tonic-gate    #  RAW_IN_LEN => 1,
626*0Sstevel@tonic-gate    #  RAW_OUT_BYTES => 2,
627*0Sstevel@tonic-gate    #  RAW_FALLBACK => 3,
628*0Sstevel@tonic-gate    my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
629*0Sstevel@tonic-gate    # Now we are converting from raw to aggregate, switch from 1 byte strings
630*0Sstevel@tonic-gate    # to numbers
631*0Sstevel@tonic-gate    my $b = ord $key;
632*0Sstevel@tonic-gate    $fallback ||= 0;
633*0Sstevel@tonic-gate    if ($l &&
634*0Sstevel@tonic-gate        # If this == fails, we're going to reset $agg_max_in below anyway.
635*0Sstevel@tonic-gate        $b == ++$agg_max_in &&
636*0Sstevel@tonic-gate        # References in numeric context give the pointer as an int.
637*0Sstevel@tonic-gate        $agg_next == $next &&
638*0Sstevel@tonic-gate        $agg_in_len == $in_len &&
639*0Sstevel@tonic-gate        $agg_out_len == length $out_bytes &&
640*0Sstevel@tonic-gate        $agg_fallback == $fallback
641*0Sstevel@tonic-gate        # && length($l->[AGG_OUT_BYTES]) < 16
642*0Sstevel@tonic-gate       ) {
643*0Sstevel@tonic-gate      #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
644*0Sstevel@tonic-gate      # we can aggregate this byte onto the end.
645*0Sstevel@tonic-gate      $l->[AGG_MAX_IN] = $b;
646*0Sstevel@tonic-gate      $l->[AGG_OUT_BYTES] .= $out_bytes;
647*0Sstevel@tonic-gate    } else {
648*0Sstevel@tonic-gate      # AGG_MIN_IN => 0,
649*0Sstevel@tonic-gate      # AGG_MAX_IN => 1,
650*0Sstevel@tonic-gate      # AGG_OUT_BYTES => 2,
651*0Sstevel@tonic-gate      # AGG_NEXT => 3,
652*0Sstevel@tonic-gate      # AGG_IN_LEN => 4,
653*0Sstevel@tonic-gate      # AGG_OUT_LEN => 5,
654*0Sstevel@tonic-gate      # AGG_FALLBACK => 6,
655*0Sstevel@tonic-gate      # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
656*0Sstevel@tonic-gate      # (only gains .6% on euc-jp  -- is it worth it?)
657*0Sstevel@tonic-gate      push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
658*0Sstevel@tonic-gate                       $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
659*0Sstevel@tonic-gate                       $agg_fallback = $fallback];
660*0Sstevel@tonic-gate    }
661*0Sstevel@tonic-gate    if (exists $next->{Cname}) {
662*0Sstevel@tonic-gate      $next->{'Forward'} = 1 if $next != $a;
663*0Sstevel@tonic-gate    } else {
664*0Sstevel@tonic-gate      process(sprintf("%s_%02x",$name,$b),$next);
665*0Sstevel@tonic-gate    }
666*0Sstevel@tonic-gate  }
667*0Sstevel@tonic-gate  # encengine.c rules say that last entry must be for 255
668*0Sstevel@tonic-gate  if ($agg_max_in < 255) {
669*0Sstevel@tonic-gate    push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
670*0Sstevel@tonic-gate  }
671*0Sstevel@tonic-gate  $a->{'Entries'} = \@ent;
672*0Sstevel@tonic-gate}
673*0Sstevel@tonic-gate
674*0Sstevel@tonic-gate
675*0Sstevel@tonic-gatesub addstrings
676*0Sstevel@tonic-gate{
677*0Sstevel@tonic-gate my ($fh,$a) = @_;
678*0Sstevel@tonic-gate my $name = $a->{'Cname'};
679*0Sstevel@tonic-gate # String tables
680*0Sstevel@tonic-gate foreach my $b (@{$a->{'Entries'}})
681*0Sstevel@tonic-gate  {
682*0Sstevel@tonic-gate   next unless $b->[AGG_OUT_LEN];
683*0Sstevel@tonic-gate   $strings{$b->[AGG_OUT_BYTES]} = undef;
684*0Sstevel@tonic-gate  }
685*0Sstevel@tonic-gate if ($a->{'Forward'})
686*0Sstevel@tonic-gate  {
687*0Sstevel@tonic-gate   my $var = $^O eq 'MacOS' ? 'extern' : 'static';
688*0Sstevel@tonic-gate   print $fh "$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
689*0Sstevel@tonic-gate  }
690*0Sstevel@tonic-gate $a->{'DoneStrings'} = 1;
691*0Sstevel@tonic-gate foreach my $b (@{$a->{'Entries'}})
692*0Sstevel@tonic-gate  {
693*0Sstevel@tonic-gate   my ($s,$e,$out,$t,$end,$l) = @$b;
694*0Sstevel@tonic-gate   addstrings($fh,$t) unless $t->{'DoneStrings'};
695*0Sstevel@tonic-gate  }
696*0Sstevel@tonic-gate}
697*0Sstevel@tonic-gate
698*0Sstevel@tonic-gatesub outbigstring
699*0Sstevel@tonic-gate{
700*0Sstevel@tonic-gate  my ($fh,$name) = @_;
701*0Sstevel@tonic-gate
702*0Sstevel@tonic-gate  $string_acc = '';
703*0Sstevel@tonic-gate
704*0Sstevel@tonic-gate  # Make the big string in the string accumulator. Longest first, on the hope
705*0Sstevel@tonic-gate  # that this makes it more likely that we find the short strings later on.
706*0Sstevel@tonic-gate  # Not sure if it helps sorting strings of the same length lexcically.
707*0Sstevel@tonic-gate  foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
708*0Sstevel@tonic-gate    my $index = index $string_acc, $s;
709*0Sstevel@tonic-gate    if ($index >= 0) {
710*0Sstevel@tonic-gate      $saved += length($s);
711*0Sstevel@tonic-gate      $strings_in_acc{$s} = $index;
712*0Sstevel@tonic-gate    } else {
713*0Sstevel@tonic-gate    OPTIMISER: {
714*0Sstevel@tonic-gate	if ($opt{'O'}) {
715*0Sstevel@tonic-gate	  my $sublength = length $s;
716*0Sstevel@tonic-gate	  while (--$sublength > 0) {
717*0Sstevel@tonic-gate	    # progressively lop characters off the end, to see if the start of
718*0Sstevel@tonic-gate	    # the new string overlaps the end of the accumulator.
719*0Sstevel@tonic-gate	    if (substr ($string_acc, -$sublength)
720*0Sstevel@tonic-gate		eq substr ($s, 0, $sublength)) {
721*0Sstevel@tonic-gate	      $subsave += $sublength;
722*0Sstevel@tonic-gate	      $strings_in_acc{$s} = length ($string_acc) - $sublength;
723*0Sstevel@tonic-gate	      # append the last bit on the end.
724*0Sstevel@tonic-gate	      $string_acc .= substr ($s, $sublength);
725*0Sstevel@tonic-gate	      last OPTIMISER;
726*0Sstevel@tonic-gate	    }
727*0Sstevel@tonic-gate	    # or if the end of the new string overlaps the start of the
728*0Sstevel@tonic-gate	    # accumulator
729*0Sstevel@tonic-gate	    next unless substr ($string_acc, 0, $sublength)
730*0Sstevel@tonic-gate	      eq substr ($s, -$sublength);
731*0Sstevel@tonic-gate	    # well, the last $sublength characters of the accumulator match.
732*0Sstevel@tonic-gate	    # so as we're prepending to the accumulator, need to shift all our
733*0Sstevel@tonic-gate	    # existing offsets forwards
734*0Sstevel@tonic-gate	    $_ += $sublength foreach values %strings_in_acc;
735*0Sstevel@tonic-gate	    $subsave += $sublength;
736*0Sstevel@tonic-gate	    $strings_in_acc{$s} = 0;
737*0Sstevel@tonic-gate	    # append the first bit on the start.
738*0Sstevel@tonic-gate	    $string_acc = substr ($s, 0, -$sublength) . $string_acc;
739*0Sstevel@tonic-gate	    last OPTIMISER;
740*0Sstevel@tonic-gate	  }
741*0Sstevel@tonic-gate	}
742*0Sstevel@tonic-gate	# Optimiser (if it ran) found nothing, so just going have to tack the
743*0Sstevel@tonic-gate	# whole thing on the end.
744*0Sstevel@tonic-gate	$strings_in_acc{$s} = length $string_acc;
745*0Sstevel@tonic-gate	$string_acc .= $s;
746*0Sstevel@tonic-gate      };
747*0Sstevel@tonic-gate    }
748*0Sstevel@tonic-gate  }
749*0Sstevel@tonic-gate
750*0Sstevel@tonic-gate  $strings = length $string_acc;
751*0Sstevel@tonic-gate  my $definition = "\nstatic const U8 $name\[$strings] = { " .
752*0Sstevel@tonic-gate    join(',',unpack "C*",$string_acc);
753*0Sstevel@tonic-gate  # We have a single long line. Split it at convenient commas.
754*0Sstevel@tonic-gate  print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
755*0Sstevel@tonic-gate  print $fh substr ($definition, pos $definition), " };\n";
756*0Sstevel@tonic-gate}
757*0Sstevel@tonic-gate
758*0Sstevel@tonic-gatesub findstring {
759*0Sstevel@tonic-gate  my ($name,$s) = @_;
760*0Sstevel@tonic-gate  my $offset = $strings_in_acc{$s};
761*0Sstevel@tonic-gate  die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
762*0Sstevel@tonic-gate    unless defined $offset;
763*0Sstevel@tonic-gate  "$name + $offset";
764*0Sstevel@tonic-gate}
765*0Sstevel@tonic-gate
766*0Sstevel@tonic-gatesub outtable
767*0Sstevel@tonic-gate{
768*0Sstevel@tonic-gate my ($fh,$a,$bigname) = @_;
769*0Sstevel@tonic-gate my $name = $a->{'Cname'};
770*0Sstevel@tonic-gate $a->{'Done'} = 1;
771*0Sstevel@tonic-gate foreach my $b (@{$a->{'Entries'}})
772*0Sstevel@tonic-gate  {
773*0Sstevel@tonic-gate   my ($s,$e,$out,$t,$end,$l) = @$b;
774*0Sstevel@tonic-gate   outtable($fh,$t,$bigname) unless $t->{'Done'};
775*0Sstevel@tonic-gate  }
776*0Sstevel@tonic-gate print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
777*0Sstevel@tonic-gate foreach my $b (@{$a->{'Entries'}})
778*0Sstevel@tonic-gate  {
779*0Sstevel@tonic-gate   my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
780*0Sstevel@tonic-gate   # $end |= 0x80 if $fb; # what the heck was on your mind, Nick?  -- Dan
781*0Sstevel@tonic-gate   print  $fh "{";
782*0Sstevel@tonic-gate   if ($l)
783*0Sstevel@tonic-gate    {
784*0Sstevel@tonic-gate     printf $fh findstring($bigname,$out);
785*0Sstevel@tonic-gate    }
786*0Sstevel@tonic-gate   else
787*0Sstevel@tonic-gate    {
788*0Sstevel@tonic-gate     print  $fh "0";
789*0Sstevel@tonic-gate    }
790*0Sstevel@tonic-gate   print  $fh ",",$t->{Cname};
791*0Sstevel@tonic-gate   printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
792*0Sstevel@tonic-gate  }
793*0Sstevel@tonic-gate print $fh "};\n";
794*0Sstevel@tonic-gate}
795*0Sstevel@tonic-gate
796*0Sstevel@tonic-gatesub output_enc
797*0Sstevel@tonic-gate{
798*0Sstevel@tonic-gate my ($fh,$name,$a) = @_;
799*0Sstevel@tonic-gate die "Changed - fix me for new structure";
800*0Sstevel@tonic-gate foreach my $b (sort keys %$a)
801*0Sstevel@tonic-gate  {
802*0Sstevel@tonic-gate   my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
803*0Sstevel@tonic-gate  }
804*0Sstevel@tonic-gate}
805*0Sstevel@tonic-gate
806*0Sstevel@tonic-gatesub decode_U
807*0Sstevel@tonic-gate{
808*0Sstevel@tonic-gate my $s = shift;
809*0Sstevel@tonic-gate}
810*0Sstevel@tonic-gate
811*0Sstevel@tonic-gatemy @uname;
812*0Sstevel@tonic-gatesub char_names
813*0Sstevel@tonic-gate{
814*0Sstevel@tonic-gate my $s = do "unicore/Name.pl";
815*0Sstevel@tonic-gate die "char_names: unicore/Name.pl: $!\n" unless defined $s;
816*0Sstevel@tonic-gate pos($s) = 0;
817*0Sstevel@tonic-gate while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
818*0Sstevel@tonic-gate  {
819*0Sstevel@tonic-gate   my $name = $3;
820*0Sstevel@tonic-gate   my $s = hex($1);
821*0Sstevel@tonic-gate   last if $s >= 0x10000;
822*0Sstevel@tonic-gate   my $e = length($2) ? hex($2) : $s;
823*0Sstevel@tonic-gate   for (my $i = $s; $i <= $e; $i++)
824*0Sstevel@tonic-gate    {
825*0Sstevel@tonic-gate     $uname[$i] = $name;
826*0Sstevel@tonic-gate#    print sprintf("U%04X $name\n",$i);
827*0Sstevel@tonic-gate    }
828*0Sstevel@tonic-gate  }
829*0Sstevel@tonic-gate}
830*0Sstevel@tonic-gate
831*0Sstevel@tonic-gatesub output_ucm_page
832*0Sstevel@tonic-gate{
833*0Sstevel@tonic-gate  my ($cmap,$a,$t,$pre) = @_;
834*0Sstevel@tonic-gate  # warn sprintf("Page %x\n",$pre);
835*0Sstevel@tonic-gate  my $raw = $t->{Raw};
836*0Sstevel@tonic-gate  foreach my $key (sort keys %$raw) {
837*0Sstevel@tonic-gate    #  RAW_NEXT => 0,
838*0Sstevel@tonic-gate    #  RAW_IN_LEN => 1,
839*0Sstevel@tonic-gate    #  RAW_OUT_BYTES => 2,
840*0Sstevel@tonic-gate    #  RAW_FALLBACK => 3,
841*0Sstevel@tonic-gate    my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
842*0Sstevel@tonic-gate    my $u = ord $key;
843*0Sstevel@tonic-gate    $fallback ||= 0;
844*0Sstevel@tonic-gate
845*0Sstevel@tonic-gate    if ($next != $a && $next != $t) {
846*0Sstevel@tonic-gate      output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
847*0Sstevel@tonic-gate    } elsif (length $out_bytes) {
848*0Sstevel@tonic-gate      if ($pre) {
849*0Sstevel@tonic-gate        $u = $pre|($u &0x3f);
850*0Sstevel@tonic-gate      }
851*0Sstevel@tonic-gate      my $s = sprintf "<U%04X> ",$u;
852*0Sstevel@tonic-gate      #foreach my $c (split(//,$out_bytes)) {
853*0Sstevel@tonic-gate      #  $s .= sprintf "\\x%02X",ord($c);
854*0Sstevel@tonic-gate      #}
855*0Sstevel@tonic-gate      # 9.5% faster changing that loop to this:
856*0Sstevel@tonic-gate      $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
857*0Sstevel@tonic-gate      $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
858*0Sstevel@tonic-gate      push(@$cmap,$s);
859*0Sstevel@tonic-gate    } else {
860*0Sstevel@tonic-gate      warn join(',',$u, @{$raw->{$key}},$a,$t);
861*0Sstevel@tonic-gate    }
862*0Sstevel@tonic-gate  }
863*0Sstevel@tonic-gate}
864*0Sstevel@tonic-gate
865*0Sstevel@tonic-gatesub output_ucm
866*0Sstevel@tonic-gate{
867*0Sstevel@tonic-gate my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
868*0Sstevel@tonic-gate print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
869*0Sstevel@tonic-gate print $fh "<code_set_name> \"$name\"\n";
870*0Sstevel@tonic-gate char_names();
871*0Sstevel@tonic-gate if (defined $min_el)
872*0Sstevel@tonic-gate  {
873*0Sstevel@tonic-gate   print $fh "<mb_cur_min> $min_el\n";
874*0Sstevel@tonic-gate  }
875*0Sstevel@tonic-gate if (defined $max_el)
876*0Sstevel@tonic-gate  {
877*0Sstevel@tonic-gate   print $fh "<mb_cur_max> $max_el\n";
878*0Sstevel@tonic-gate  }
879*0Sstevel@tonic-gate if (defined $rep)
880*0Sstevel@tonic-gate  {
881*0Sstevel@tonic-gate   print $fh "<subchar> ";
882*0Sstevel@tonic-gate   foreach my $c (split(//,$rep))
883*0Sstevel@tonic-gate    {
884*0Sstevel@tonic-gate     printf $fh "\\x%02X",ord($c);
885*0Sstevel@tonic-gate    }
886*0Sstevel@tonic-gate   print $fh "\n";
887*0Sstevel@tonic-gate  }
888*0Sstevel@tonic-gate my @cmap;
889*0Sstevel@tonic-gate output_ucm_page(\@cmap,$h,$h,0);
890*0Sstevel@tonic-gate print $fh "#\nCHARMAP\n";
891*0Sstevel@tonic-gate foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
892*0Sstevel@tonic-gate  {
893*0Sstevel@tonic-gate   print $fh $line;
894*0Sstevel@tonic-gate  }
895*0Sstevel@tonic-gate print $fh "END CHARMAP\n";
896*0Sstevel@tonic-gate}
897*0Sstevel@tonic-gate
898*0Sstevel@tonic-gateuse vars qw(
899*0Sstevel@tonic-gate    $_Enc2xs
900*0Sstevel@tonic-gate    $_Version
901*0Sstevel@tonic-gate    $_Inc
902*0Sstevel@tonic-gate    $_E2X
903*0Sstevel@tonic-gate    $_Name
904*0Sstevel@tonic-gate    $_TableFiles
905*0Sstevel@tonic-gate    $_Now
906*0Sstevel@tonic-gate);
907*0Sstevel@tonic-gate
908*0Sstevel@tonic-gatesub find_e2x{
909*0Sstevel@tonic-gate    eval { require File::Find; };
910*0Sstevel@tonic-gate    my (@inc, %e2x_dir);
911*0Sstevel@tonic-gate    for my $inc (@INC){
912*0Sstevel@tonic-gate	push @inc, $inc unless $inc eq '.'; #skip current dir
913*0Sstevel@tonic-gate    }
914*0Sstevel@tonic-gate    File::Find::find(
915*0Sstevel@tonic-gate	     sub {
916*0Sstevel@tonic-gate		 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
917*0Sstevel@tonic-gate		     $atime,$mtime,$ctime,$blksize,$blocks)
918*0Sstevel@tonic-gate		     = lstat($_) or return;
919*0Sstevel@tonic-gate		 -f _ or return;
920*0Sstevel@tonic-gate		 if (/^.*\.e2x$/o){
921*0Sstevel@tonic-gate		     no warnings 'once';
922*0Sstevel@tonic-gate		     $e2x_dir{$File::Find::dir} ||= $mtime;
923*0Sstevel@tonic-gate		 }
924*0Sstevel@tonic-gate		 return;
925*0Sstevel@tonic-gate	     }, @inc);
926*0Sstevel@tonic-gate    warn join("\n", keys %e2x_dir), "\n";
927*0Sstevel@tonic-gate    for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
928*0Sstevel@tonic-gate	$_E2X = $d;
929*0Sstevel@tonic-gate	# warn "$_E2X => ", scalar localtime($e2x_dir{$d});
930*0Sstevel@tonic-gate	return $_E2X;
931*0Sstevel@tonic-gate    }
932*0Sstevel@tonic-gate}
933*0Sstevel@tonic-gate
934*0Sstevel@tonic-gatesub make_makefile_pl
935*0Sstevel@tonic-gate{
936*0Sstevel@tonic-gate    eval { require Encode; };
937*0Sstevel@tonic-gate    $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
938*0Sstevel@tonic-gate    # our used for variable expanstion
939*0Sstevel@tonic-gate    $_Enc2xs = $0;
940*0Sstevel@tonic-gate    $_Version = $VERSION;
941*0Sstevel@tonic-gate    $_E2X = find_e2x();
942*0Sstevel@tonic-gate    $_Name = shift;
943*0Sstevel@tonic-gate    $_TableFiles = join(",", map {qq('$_')} @_);
944*0Sstevel@tonic-gate    $_Now = scalar localtime();
945*0Sstevel@tonic-gate
946*0Sstevel@tonic-gate    eval { require File::Spec; };
947*0Sstevel@tonic-gate    _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
948*0Sstevel@tonic-gate    _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"),        "$_Name.pm");
949*0Sstevel@tonic-gate    _print_expand(File::Spec->catfile($_E2X,"_T.e2x"),         "t/$_Name.t");
950*0Sstevel@tonic-gate    _print_expand(File::Spec->catfile($_E2X,"README.e2x"),     "README");
951*0Sstevel@tonic-gate    _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"),    "Changes");
952*0Sstevel@tonic-gate    exit;
953*0Sstevel@tonic-gate}
954*0Sstevel@tonic-gate
955*0Sstevel@tonic-gateuse vars qw(
956*0Sstevel@tonic-gate	    $_ModLines
957*0Sstevel@tonic-gate	    $_LocalVer
958*0Sstevel@tonic-gate	    );
959*0Sstevel@tonic-gate
960*0Sstevel@tonic-gatesub make_configlocal_pm
961*0Sstevel@tonic-gate{
962*0Sstevel@tonic-gate    eval { require Encode; };
963*0Sstevel@tonic-gate    $@ and die "Unable to require Encode: $@\n";
964*0Sstevel@tonic-gate    eval { require File::Spec; };
965*0Sstevel@tonic-gate    # our used for variable expanstion
966*0Sstevel@tonic-gate    my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
967*0Sstevel@tonic-gate    my %LocalMod = ();
968*0Sstevel@tonic-gate    for my $d (@INC){
969*0Sstevel@tonic-gate	my $inc = File::Spec->catfile($d, "Encode");
970*0Sstevel@tonic-gate	-d $inc or next;
971*0Sstevel@tonic-gate	opendir my $dh, $inc or die "$inc:$!";
972*0Sstevel@tonic-gate	warn "Checking $inc...\n";
973*0Sstevel@tonic-gate	for my $f (grep /\.pm$/o, readdir($dh)){
974*0Sstevel@tonic-gate	    -f File::Spec->catfile($inc, "$f") or next;
975*0Sstevel@tonic-gate	    $INC{"Encode/$f"} and next;
976*0Sstevel@tonic-gate	    warn "require Encode/$f;\n";
977*0Sstevel@tonic-gate	    eval { require "Encode/$f"; };
978*0Sstevel@tonic-gate	    $@ and die "Can't require Encode/$f: $@\n";
979*0Sstevel@tonic-gate	    for my $enc (Encode->encodings()){
980*0Sstevel@tonic-gate		no warnings 'once';
981*0Sstevel@tonic-gate		$in_core{$enc} and next;
982*0Sstevel@tonic-gate		$Encode::Config::ExtModule{$enc} and next;
983*0Sstevel@tonic-gate		my $mod = "Encode/$f";
984*0Sstevel@tonic-gate		$mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
985*0Sstevel@tonic-gate		$LocalMod{$enc} ||= $mod;
986*0Sstevel@tonic-gate	    }
987*0Sstevel@tonic-gate	}
988*0Sstevel@tonic-gate    }
989*0Sstevel@tonic-gate    $_ModLines = "";
990*0Sstevel@tonic-gate    for my $enc (sort keys %LocalMod){
991*0Sstevel@tonic-gate	$_ModLines .=
992*0Sstevel@tonic-gate	    qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
993*0Sstevel@tonic-gate    }
994*0Sstevel@tonic-gate    warn $_ModLines;
995*0Sstevel@tonic-gate    $_LocalVer = _mkversion();
996*0Sstevel@tonic-gate    $_E2X = find_e2x();
997*0Sstevel@tonic-gate    $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
998*0Sstevel@tonic-gate    _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
999*0Sstevel@tonic-gate		  File::Spec->catfile($_Inc,"ConfigLocal.pm"),
1000*0Sstevel@tonic-gate		  1);
1001*0Sstevel@tonic-gate    exit;
1002*0Sstevel@tonic-gate}
1003*0Sstevel@tonic-gate
1004*0Sstevel@tonic-gatesub _mkversion{
1005*0Sstevel@tonic-gate    my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1006*0Sstevel@tonic-gate    $yyyy += 1900, $mo +=1;
1007*0Sstevel@tonic-gate    return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1008*0Sstevel@tonic-gate}
1009*0Sstevel@tonic-gate
1010*0Sstevel@tonic-gatesub _print_expand{
1011*0Sstevel@tonic-gate    eval { require File::Basename; };
1012*0Sstevel@tonic-gate    $@ and die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
1013*0Sstevel@tonic-gate    File::Basename->import();
1014*0Sstevel@tonic-gate    my ($src, $dst, $clobber) = @_;
1015*0Sstevel@tonic-gate    if (!$clobber and -e $dst){
1016*0Sstevel@tonic-gate	warn "$dst exists. skipping\n";
1017*0Sstevel@tonic-gate	return;
1018*0Sstevel@tonic-gate    }
1019*0Sstevel@tonic-gate    warn "Generating $dst...\n";
1020*0Sstevel@tonic-gate    open my $in, $src or die "$src : $!";
1021*0Sstevel@tonic-gate    if ((my $d = dirname($dst)) ne '.'){
1022*0Sstevel@tonic-gate	-d $d or mkdir $d, 0755 or die  "mkdir $d : $!";
1023*0Sstevel@tonic-gate    }
1024*0Sstevel@tonic-gate    open my $out, ">$dst" or die "$!";
1025*0Sstevel@tonic-gate    my $asis = 0;
1026*0Sstevel@tonic-gate    while (<$in>){
1027*0Sstevel@tonic-gate	if (/^#### END_OF_HEADER/){
1028*0Sstevel@tonic-gate	    $asis = 1; next;
1029*0Sstevel@tonic-gate	}
1030*0Sstevel@tonic-gate	s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1031*0Sstevel@tonic-gate	print $out $_;
1032*0Sstevel@tonic-gate    }
1033*0Sstevel@tonic-gate}
1034*0Sstevel@tonic-gate__END__
1035*0Sstevel@tonic-gate
1036*0Sstevel@tonic-gate=head1 NAME
1037*0Sstevel@tonic-gate
1038*0Sstevel@tonic-gateenc2xs -- Perl Encode Module Generator
1039*0Sstevel@tonic-gate
1040*0Sstevel@tonic-gate=head1 SYNOPSIS
1041*0Sstevel@tonic-gate
1042*0Sstevel@tonic-gate  enc2xs -[options]
1043*0Sstevel@tonic-gate  enc2xs -M ModName mapfiles...
1044*0Sstevel@tonic-gate  enc2xs -C
1045*0Sstevel@tonic-gate
1046*0Sstevel@tonic-gate=head1 DESCRIPTION
1047*0Sstevel@tonic-gate
1048*0Sstevel@tonic-gateF<enc2xs> builds a Perl extension for use by Encode from either
1049*0Sstevel@tonic-gateUnicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1050*0Sstevel@tonic-gateBesides being used internally during the build process of the Encode
1051*0Sstevel@tonic-gatemodule, you can use F<enc2xs> to add your own encoding to perl.
1052*0Sstevel@tonic-gateNo knowledge of XS is necessary.
1053*0Sstevel@tonic-gate
1054*0Sstevel@tonic-gate=head1 Quick Guide
1055*0Sstevel@tonic-gate
1056*0Sstevel@tonic-gateIf you want to know as little about Perl as possible but need to
1057*0Sstevel@tonic-gateadd a new encoding, just read this chapter and forget the rest.
1058*0Sstevel@tonic-gate
1059*0Sstevel@tonic-gate=over 4
1060*0Sstevel@tonic-gate
1061*0Sstevel@tonic-gate=item 0.
1062*0Sstevel@tonic-gate
1063*0Sstevel@tonic-gateHave a .ucm file ready.  You can get it from somewhere or you can write
1064*0Sstevel@tonic-gateyour own from scratch or you can grab one from the Encode distribution
1065*0Sstevel@tonic-gateand customize it.  For the UCM format, see the next Chapter.  In the
1066*0Sstevel@tonic-gateexample below, I'll call my theoretical encoding myascii, defined
1067*0Sstevel@tonic-gatein I<my.ucm>.  C<$> is a shell prompt.
1068*0Sstevel@tonic-gate
1069*0Sstevel@tonic-gate  $ ls -F
1070*0Sstevel@tonic-gate  my.ucm
1071*0Sstevel@tonic-gate
1072*0Sstevel@tonic-gate=item 1.
1073*0Sstevel@tonic-gate
1074*0Sstevel@tonic-gateIssue a command as follows;
1075*0Sstevel@tonic-gate
1076*0Sstevel@tonic-gate  $ enc2xs -M My my.ucm
1077*0Sstevel@tonic-gate  generating Makefile.PL
1078*0Sstevel@tonic-gate  generating My.pm
1079*0Sstevel@tonic-gate  generating README
1080*0Sstevel@tonic-gate  generating Changes
1081*0Sstevel@tonic-gate
1082*0Sstevel@tonic-gateNow take a look at your current directory.  It should look like this.
1083*0Sstevel@tonic-gate
1084*0Sstevel@tonic-gate  $ ls -F
1085*0Sstevel@tonic-gate  Makefile.PL   My.pm         my.ucm        t/
1086*0Sstevel@tonic-gate
1087*0Sstevel@tonic-gateThe following files were created.
1088*0Sstevel@tonic-gate
1089*0Sstevel@tonic-gate  Makefile.PL - MakeMaker script
1090*0Sstevel@tonic-gate  My.pm       - Encode submodule
1091*0Sstevel@tonic-gate  t/My.t      - test file
1092*0Sstevel@tonic-gate
1093*0Sstevel@tonic-gate=over 4
1094*0Sstevel@tonic-gate
1095*0Sstevel@tonic-gate=item 1.1.
1096*0Sstevel@tonic-gate
1097*0Sstevel@tonic-gateIf you want *.ucm installed together with the modules, do as follows;
1098*0Sstevel@tonic-gate
1099*0Sstevel@tonic-gate  $ mkdir Encode
1100*0Sstevel@tonic-gate  $ mv *.ucm Encode
1101*0Sstevel@tonic-gate  $ enc2xs -M My Encode/*ucm
1102*0Sstevel@tonic-gate
1103*0Sstevel@tonic-gate=back
1104*0Sstevel@tonic-gate
1105*0Sstevel@tonic-gate=item 2.
1106*0Sstevel@tonic-gate
1107*0Sstevel@tonic-gateEdit the files generated.  You don't have to if you have no time AND no
1108*0Sstevel@tonic-gateintention to give it to someone else.  But it is a good idea to edit
1109*0Sstevel@tonic-gatethe pod and to add more tests.
1110*0Sstevel@tonic-gate
1111*0Sstevel@tonic-gate=item 3.
1112*0Sstevel@tonic-gate
1113*0Sstevel@tonic-gateNow issue a command all Perl Mongers love:
1114*0Sstevel@tonic-gate
1115*0Sstevel@tonic-gate  $ perl Makefile.PL
1116*0Sstevel@tonic-gate  Writing Makefile for Encode::My
1117*0Sstevel@tonic-gate
1118*0Sstevel@tonic-gate=item 4.
1119*0Sstevel@tonic-gate
1120*0Sstevel@tonic-gateNow all you have to do is make.
1121*0Sstevel@tonic-gate
1122*0Sstevel@tonic-gate  $ make
1123*0Sstevel@tonic-gate  cp My.pm blib/lib/Encode/My.pm
1124*0Sstevel@tonic-gate  /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1125*0Sstevel@tonic-gate    -o encode_t.c -f encode_t.fnm
1126*0Sstevel@tonic-gate  Reading myascii (myascii)
1127*0Sstevel@tonic-gate  Writing compiled form
1128*0Sstevel@tonic-gate  128 bytes in string tables
1129*0Sstevel@tonic-gate  384 bytes (25%) saved spotting duplicates
1130*0Sstevel@tonic-gate  1 bytes (99.2%) saved using substrings
1131*0Sstevel@tonic-gate  ....
1132*0Sstevel@tonic-gate  chmod 644 blib/arch/auto/Encode/My/My.bs
1133*0Sstevel@tonic-gate  $
1134*0Sstevel@tonic-gate
1135*0Sstevel@tonic-gateThe time it takes varies depending on how fast your machine is and
1136*0Sstevel@tonic-gatehow large your encoding is.  Unless you are working on something big
1137*0Sstevel@tonic-gatelike euc-tw, it won't take too long.
1138*0Sstevel@tonic-gate
1139*0Sstevel@tonic-gate=item 5.
1140*0Sstevel@tonic-gate
1141*0Sstevel@tonic-gateYou can "make install" already but you should test first.
1142*0Sstevel@tonic-gate
1143*0Sstevel@tonic-gate  $ make test
1144*0Sstevel@tonic-gate  PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1145*0Sstevel@tonic-gate    -e 'use Test::Harness  qw(&runtests $verbose); \
1146*0Sstevel@tonic-gate    $verbose=0; runtests @ARGV;' t/*.t
1147*0Sstevel@tonic-gate  t/My....ok
1148*0Sstevel@tonic-gate  All tests successful.
1149*0Sstevel@tonic-gate  Files=1, Tests=2,  0 wallclock secs
1150*0Sstevel@tonic-gate   ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1151*0Sstevel@tonic-gate
1152*0Sstevel@tonic-gate=item 6.
1153*0Sstevel@tonic-gate
1154*0Sstevel@tonic-gateIf you are content with the test result, just "make install"
1155*0Sstevel@tonic-gate
1156*0Sstevel@tonic-gate=item 7.
1157*0Sstevel@tonic-gate
1158*0Sstevel@tonic-gateIf you want to add your encoding to Encode's demand-loading list
1159*0Sstevel@tonic-gate(so you don't have to "use Encode::YourEncoding"), run
1160*0Sstevel@tonic-gate
1161*0Sstevel@tonic-gate  enc2xs -C
1162*0Sstevel@tonic-gate
1163*0Sstevel@tonic-gateto update Encode::ConfigLocal, a module that controls local settings.
1164*0Sstevel@tonic-gateAfter that, "use Encode;" is enough to load your encodings on demand.
1165*0Sstevel@tonic-gate
1166*0Sstevel@tonic-gate=back
1167*0Sstevel@tonic-gate
1168*0Sstevel@tonic-gate=head1 The Unicode Character Map
1169*0Sstevel@tonic-gate
1170*0Sstevel@tonic-gateEncode uses the Unicode Character Map (UCM) format for source character
1171*0Sstevel@tonic-gatemappings.  This format is used by IBM's ICU package and was adopted
1172*0Sstevel@tonic-gateby Nick Ing-Simmons for use with the Encode module.  Since UCM is
1173*0Sstevel@tonic-gatemore flexible than Tcl's Encoding Map and far more user-friendly,
1174*0Sstevel@tonic-gatethis is the recommended formet for Encode now.
1175*0Sstevel@tonic-gate
1176*0Sstevel@tonic-gateA UCM file looks like this.
1177*0Sstevel@tonic-gate
1178*0Sstevel@tonic-gate  #
1179*0Sstevel@tonic-gate  # Comments
1180*0Sstevel@tonic-gate  #
1181*0Sstevel@tonic-gate  <code_set_name> "US-ascii" # Required
1182*0Sstevel@tonic-gate  <code_set_alias> "ascii"   # Optional
1183*0Sstevel@tonic-gate  <mb_cur_min> 1             # Required; usually 1
1184*0Sstevel@tonic-gate  <mb_cur_max> 1             # Max. # of bytes/char
1185*0Sstevel@tonic-gate  <subchar> \x3F             # Substitution char
1186*0Sstevel@tonic-gate  #
1187*0Sstevel@tonic-gate  CHARMAP
1188*0Sstevel@tonic-gate  <U0000> \x00 |0 # <control>
1189*0Sstevel@tonic-gate  <U0001> \x01 |0 # <control>
1190*0Sstevel@tonic-gate  <U0002> \x02 |0 # <control>
1191*0Sstevel@tonic-gate  ....
1192*0Sstevel@tonic-gate  <U007C> \x7C |0 # VERTICAL LINE
1193*0Sstevel@tonic-gate  <U007D> \x7D |0 # RIGHT CURLY BRACKET
1194*0Sstevel@tonic-gate  <U007E> \x7E |0 # TILDE
1195*0Sstevel@tonic-gate  <U007F> \x7F |0 # <control>
1196*0Sstevel@tonic-gate  END CHARMAP
1197*0Sstevel@tonic-gate
1198*0Sstevel@tonic-gate=over 4
1199*0Sstevel@tonic-gate
1200*0Sstevel@tonic-gate=item *
1201*0Sstevel@tonic-gate
1202*0Sstevel@tonic-gateAnything that follows C<#> is treated as a comment.
1203*0Sstevel@tonic-gate
1204*0Sstevel@tonic-gate=item *
1205*0Sstevel@tonic-gate
1206*0Sstevel@tonic-gateThe header section continues until a line containing the word
1207*0Sstevel@tonic-gateCHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1208*0Sstevel@tonic-gatepair per line.  Strings used as values must be quoted. Barewords are
1209*0Sstevel@tonic-gatetreated as numbers.  I<\xXX> represents a byte.
1210*0Sstevel@tonic-gate
1211*0Sstevel@tonic-gateMost of the keywords are self-explanatory. I<subchar> means
1212*0Sstevel@tonic-gatesubstitution character, not subcharacter.  When you decode a Unicode
1213*0Sstevel@tonic-gatesequence to this encoding but no matching character is found, the byte
1214*0Sstevel@tonic-gatesequence defined here will be used.  For most cases, the value here is
1215*0Sstevel@tonic-gate\x3F; in ASCII, this is a question mark.
1216*0Sstevel@tonic-gate
1217*0Sstevel@tonic-gate=item *
1218*0Sstevel@tonic-gate
1219*0Sstevel@tonic-gateCHARMAP starts the character map section.  Each line has a form as
1220*0Sstevel@tonic-gatefollows:
1221*0Sstevel@tonic-gate
1222*0Sstevel@tonic-gate  <UXXXX> \xXX.. |0 # comment
1223*0Sstevel@tonic-gate    ^     ^      ^
1224*0Sstevel@tonic-gate    |     |      +- Fallback flag
1225*0Sstevel@tonic-gate    |     +-------- Encoded byte sequence
1226*0Sstevel@tonic-gate    +-------------- Unicode Character ID in hex
1227*0Sstevel@tonic-gate
1228*0Sstevel@tonic-gateThe format is roughly the same as a header section except for the
1229*0Sstevel@tonic-gatefallback flag: | followed by 0..3.   The meaning of the possible
1230*0Sstevel@tonic-gatevalues is as follows:
1231*0Sstevel@tonic-gate
1232*0Sstevel@tonic-gate=over 4
1233*0Sstevel@tonic-gate
1234*0Sstevel@tonic-gate=item |0
1235*0Sstevel@tonic-gate
1236*0Sstevel@tonic-gateRound trip safe.  A character decoded to Unicode encodes back to the
1237*0Sstevel@tonic-gatesame byte sequence.  Most characters have this flag.
1238*0Sstevel@tonic-gate
1239*0Sstevel@tonic-gate=item |1
1240*0Sstevel@tonic-gate
1241*0Sstevel@tonic-gateFallback for unicode -> encoding.  When seen, enc2xs adds this
1242*0Sstevel@tonic-gatecharacter for the encode map only.
1243*0Sstevel@tonic-gate
1244*0Sstevel@tonic-gate=item |2
1245*0Sstevel@tonic-gate
1246*0Sstevel@tonic-gateSkip sub-char mapping should there be no code point.
1247*0Sstevel@tonic-gate
1248*0Sstevel@tonic-gate=item |3
1249*0Sstevel@tonic-gate
1250*0Sstevel@tonic-gateFallback for encoding -> unicode.  When seen, enc2xs adds this
1251*0Sstevel@tonic-gatecharacter for the decode map only.
1252*0Sstevel@tonic-gate
1253*0Sstevel@tonic-gate=back
1254*0Sstevel@tonic-gate
1255*0Sstevel@tonic-gate=item *
1256*0Sstevel@tonic-gate
1257*0Sstevel@tonic-gateAnd finally, END OF CHARMAP ends the section.
1258*0Sstevel@tonic-gate
1259*0Sstevel@tonic-gate=back
1260*0Sstevel@tonic-gate
1261*0Sstevel@tonic-gateWhen you are manually creating a UCM file, you should copy ascii.ucm
1262*0Sstevel@tonic-gateor an existing encoding which is close to yours, rather than write
1263*0Sstevel@tonic-gateyour own from scratch.
1264*0Sstevel@tonic-gate
1265*0Sstevel@tonic-gateWhen you do so, make sure you leave at least B<U0000> to B<U0020> as
1266*0Sstevel@tonic-gateis, unless your environment is EBCDIC.
1267*0Sstevel@tonic-gate
1268*0Sstevel@tonic-gateB<CAVEAT>: not all features in UCM are implemented.  For example,
1269*0Sstevel@tonic-gateicu:state is not used.  Because of that, you need to write a perl
1270*0Sstevel@tonic-gatemodule if you want to support algorithmical encodings, notably
1271*0Sstevel@tonic-gatethe ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
1272*0Sstevel@tonic-gateL<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1273*0Sstevel@tonic-gate
1274*0Sstevel@tonic-gate=head2 Coping with duplicate mappings
1275*0Sstevel@tonic-gate
1276*0Sstevel@tonic-gateWhen you create a map, you SHOULD make your mappings round-trip safe.
1277*0Sstevel@tonic-gateThat is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1278*0Sstevel@tonic-gate$data> stands for all characters that are marked as C<|0>.  Here is
1279*0Sstevel@tonic-gatehow to make sure:
1280*0Sstevel@tonic-gate
1281*0Sstevel@tonic-gate=over 4
1282*0Sstevel@tonic-gate
1283*0Sstevel@tonic-gate=item *
1284*0Sstevel@tonic-gate
1285*0Sstevel@tonic-gateSort your map in Unicode order.
1286*0Sstevel@tonic-gate
1287*0Sstevel@tonic-gate=item *
1288*0Sstevel@tonic-gate
1289*0Sstevel@tonic-gateWhen you have a duplicate entry, mark either one with '|1' or '|3'.
1290*0Sstevel@tonic-gate
1291*0Sstevel@tonic-gate=item *
1292*0Sstevel@tonic-gate
1293*0Sstevel@tonic-gateAnd make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1294*0Sstevel@tonic-gate
1295*0Sstevel@tonic-gate=back
1296*0Sstevel@tonic-gate
1297*0Sstevel@tonic-gateHere is an example from big5-eten.
1298*0Sstevel@tonic-gate
1299*0Sstevel@tonic-gate  <U2550> \xF9\xF9 |0
1300*0Sstevel@tonic-gate  <U2550> \xA2\xA4 |3
1301*0Sstevel@tonic-gate
1302*0Sstevel@tonic-gateInternally Encoding -> Unicode and Unicode -> Encoding Map looks like
1303*0Sstevel@tonic-gatethis;
1304*0Sstevel@tonic-gate
1305*0Sstevel@tonic-gate  E to U               U to E
1306*0Sstevel@tonic-gate  --------------------------------------
1307*0Sstevel@tonic-gate  \xF9\xF9 => U2550    U2550 => \xF9\xF9
1308*0Sstevel@tonic-gate  \xA2\xA4 => U2550
1309*0Sstevel@tonic-gate
1310*0Sstevel@tonic-gateSo it is round-trip safe for \xF9\xF9.  But if the line above is upside
1311*0Sstevel@tonic-gatedown, here is what happens.
1312*0Sstevel@tonic-gate
1313*0Sstevel@tonic-gate  E to U               U to E
1314*0Sstevel@tonic-gate  --------------------------------------
1315*0Sstevel@tonic-gate  \xA2\xA4 => U2550    U2550 => \xF9\xF9
1316*0Sstevel@tonic-gate  (\xF9\xF9 => U2550 is now overwritten!)
1317*0Sstevel@tonic-gate
1318*0Sstevel@tonic-gateThe Encode package comes with F<ucmlint>, a crude but sufficient
1319*0Sstevel@tonic-gateutility to check the integrity of a UCM file.  Check under the
1320*0Sstevel@tonic-gateEncode/bin directory for this.
1321*0Sstevel@tonic-gate
1322*0Sstevel@tonic-gate
1323*0Sstevel@tonic-gate=head1 Bookmarks
1324*0Sstevel@tonic-gate
1325*0Sstevel@tonic-gate=over 4
1326*0Sstevel@tonic-gate
1327*0Sstevel@tonic-gate=item *
1328*0Sstevel@tonic-gate
1329*0Sstevel@tonic-gateICU Home Page
1330*0Sstevel@tonic-gateL<http://oss.software.ibm.com/icu/>
1331*0Sstevel@tonic-gate
1332*0Sstevel@tonic-gate=item *
1333*0Sstevel@tonic-gate
1334*0Sstevel@tonic-gateICU Character Mapping Tables
1335*0Sstevel@tonic-gateL<http://oss.software.ibm.com/icu/charset/>
1336*0Sstevel@tonic-gate
1337*0Sstevel@tonic-gate=item *
1338*0Sstevel@tonic-gate
1339*0Sstevel@tonic-gateICU:Conversion Data
1340*0Sstevel@tonic-gateL<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1341*0Sstevel@tonic-gate
1342*0Sstevel@tonic-gate=back
1343*0Sstevel@tonic-gate
1344*0Sstevel@tonic-gate=head1 SEE ALSO
1345*0Sstevel@tonic-gate
1346*0Sstevel@tonic-gateL<Encode>,
1347*0Sstevel@tonic-gateL<perlmod>,
1348*0Sstevel@tonic-gateL<perlpod>
1349*0Sstevel@tonic-gate
1350*0Sstevel@tonic-gate=cut
1351*0Sstevel@tonic-gate
1352*0Sstevel@tonic-gate# -Q to disable the duplicate codepoint test
1353*0Sstevel@tonic-gate# -S make mapping errors fatal
1354*0Sstevel@tonic-gate# -q to remove comments written to output files
1355*0Sstevel@tonic-gate# -O to enable the (brute force) substring optimiser
1356*0Sstevel@tonic-gate# -o <output> to specify the output file name (else it's the first arg)
1357*0Sstevel@tonic-gate# -f <inlist> to give a file with a list of input files (else use the args)
1358*0Sstevel@tonic-gate# -n <name> to name the encoding (else use the basename of the input file.
1359*0Sstevel@tonic-gate
1360*0Sstevel@tonic-gateWith %seen holding array refs:
1361*0Sstevel@tonic-gate
1362*0Sstevel@tonic-gate      865.66 real        28.80 user         8.79 sys
1363*0Sstevel@tonic-gate      7904  maximum resident set size
1364*0Sstevel@tonic-gate      1356  average shared memory size
1365*0Sstevel@tonic-gate     18566  average unshared data size
1366*0Sstevel@tonic-gate       229  average unshared stack size
1367*0Sstevel@tonic-gate     46080  page reclaims
1368*0Sstevel@tonic-gate     33373  page faults
1369*0Sstevel@tonic-gate
1370*0Sstevel@tonic-gateWith %seen holding simple scalars:
1371*0Sstevel@tonic-gate
1372*0Sstevel@tonic-gate      342.16 real        27.11 user         3.54 sys
1373*0Sstevel@tonic-gate      8388  maximum resident set size
1374*0Sstevel@tonic-gate      1394  average shared memory size
1375*0Sstevel@tonic-gate     14969  average unshared data size
1376*0Sstevel@tonic-gate       236  average unshared stack size
1377*0Sstevel@tonic-gate     28159  page reclaims
1378*0Sstevel@tonic-gate      9839  page faults
1379*0Sstevel@tonic-gate
1380*0Sstevel@tonic-gateYes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1381*0Sstevel@tonic-gatehow %seen is storing things its seen. So it is pathalogically bad on a 16M
1382*0Sstevel@tonic-gateRAM machine, but it's going to help even on modern machines.
1383*0Sstevel@tonic-gateSwapping is bad, m'kay :-)
1384