xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/bin/enc2xs (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1#!./perl
2BEGIN {
3    # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's
4    # with $ENV{PERL_CORE} set
5    # In case we need it in future...
6    require Config; import Config;
7    pop @INC if $INC[-1] eq '.';
8}
9use strict;
10use warnings;
11use Getopt::Std;
12use Config;
13my @orig_ARGV = @ARGV;
14our $VERSION  = do { my @r = (q$Revision: 2.12 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
15
16# These may get re-ordered.
17# RAW is a do_now as inserted by &enter
18# AGG is an aggregated do_now, as built up by &process
19
20use constant {
21  RAW_NEXT => 0,
22  RAW_IN_LEN => 1,
23  RAW_OUT_BYTES => 2,
24  RAW_FALLBACK => 3,
25
26  AGG_MIN_IN => 0,
27  AGG_MAX_IN => 1,
28  AGG_OUT_BYTES => 2,
29  AGG_NEXT => 3,
30  AGG_IN_LEN => 4,
31  AGG_OUT_LEN => 5,
32  AGG_FALLBACK => 6,
33};
34
35# (See the algorithm in encengine.c - we're building structures for it)
36
37# There are two sorts of structures.
38# "do_now" (an array, two variants of what needs storing) is whatever we need
39# to do now we've read an input byte.
40# It's housed in a "do_next" (which is how we got to it), and in turn points
41# to a "do_next" which contains all the "do_now"s for the next input byte.
42
43# There will be a "do_next" which is the start state.
44# For a single byte encoding it's the only "do_next" - each "do_now" points
45# back to it, and each "do_now" will cause bytes. There is no state.
46
47# For a multi-byte encoding where all characters in the input are the same
48# length, then there will be a tree of "do_now"->"do_next"->"do_now"
49# branching out from the start state, one step for each input byte.
50# The leaf "do_now"s will all be at the same distance from the start state,
51# only the leaf "do_now"s cause output bytes, and they in turn point back to
52# the start state.
53
54# For an encoding where there are variable length input byte sequences, you
55# will encounter a leaf "do_now" sooner for the shorter input sequences, but
56# as before the leaves will point back to the start state.
57
58# The system will cope with escape encodings (imagine them as a mostly
59# self-contained tree for each escape state, and cross links between trees
60# at the state-switching characters) but so far no input format defines these.
61
62# The system will also cope with having output "leaves" in the middle of
63# the bifurcating branches, not just at the extremities, but again no
64# input format does this yet.
65
66# There are two variants of the "do_now" structure. The first, smaller variant
67# is generated by &enter as the input file is read. There is one structure
68# for each input byte. Say we are mapping a single byte encoding to a
69# single byte encoding, with  "ABCD" going "abcd". There will be
70# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
71
72# &process then walks the tree, building aggregate "do_now" structures for
73# adjacent bytes where possible. The aggregate is for a contiguous range of
74# bytes which each produce the same length of output, each move to the
75# same next state, and each have the same fallback flag.
76# So our 4 RAW "do_now"s above become replaced by a single structure
77# containing:
78# ["A", "D", "abcd", 1, ...]
79# ie, for an input byte $_ in "A".."D", output 1 byte, found as
80# substr ("abcd", (ord $_ - ord "A") * 1, 1)
81# which maps very nicely into pointer arithmetic in C for encengine.c
82
83sub encode_U
84{
85 # UTF-8 encode long hand - only covers part of perl's range
86 ## my $uv = shift;
87 # chr() works in native space so convert value from table
88 # into that space before using chr().
89 my $ch = chr(utf8::unicode_to_native($_[0]));
90 # Now get core perl to encode that the way it likes.
91 utf8::encode($ch);
92 return $ch;
93}
94
95sub encode_S
96{
97 # encode single byte
98 ## my ($ch,$page) = @_; return chr($ch);
99 return chr $_[0];
100}
101
102sub encode_D
103{
104 # encode double byte MS byte first
105 ## my ($ch,$page) = @_; return chr($page).chr($ch);
106 return chr ($_[1]) . chr $_[0];
107}
108
109sub encode_M
110{
111 # encode Multi-byte - single for 0..255 otherwise double
112 ## my ($ch,$page) = @_;
113 ## return &encode_D if $page;
114 ## return &encode_S;
115 return chr ($_[1]) . chr $_[0] if $_[1];
116 return chr $_[0];
117}
118
119my %encode_types = (U => \&encode_U,
120                    S => \&encode_S,
121                    D => \&encode_D,
122                    M => \&encode_M,
123                   );
124
125# Win32 does not expand globs on command line
126eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
127
128my %opt;
129# I think these are:
130# -Q to disable the duplicate codepoint test
131# -S make mapping errors fatal
132# -q to remove comments written to output files
133# -O to enable the (brute force) substring optimiser
134# -o <output> to specify the output file name (else it's the first arg)
135# -f <inlist> to give a file with a list of input files (else use the args)
136# -n <name> to name the encoding (else use the basename of the input file.
137getopts('CM:SQqOo:f:n:',\%opt);
138
139$opt{M} and make_makefile_pl($opt{M}, @ARGV);
140$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
141
142# This really should go first, else the die here causes empty (non-erroneous)
143# output files to be written.
144my @encfiles;
145if (exists $opt{'f'}) {
146    # -F is followed by name of file containing list of filenames
147    my $flist = $opt{'f'};
148    open(FLIST,$flist) || die "Cannot open $flist:$!";
149    chomp(@encfiles = <FLIST>);
150    close(FLIST);
151} else {
152    @encfiles = @ARGV;
153}
154
155my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
156chmod(0666,$cname) if -f $cname && !-w $cname;
157open(C,">$cname") || die "Cannot open $cname:$!";
158
159my $dname = $cname;
160my $hname = $cname;
161
162my ($doC,$doEnc,$doUcm,$doPet);
163
164if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
165 {
166  $doC = 1;
167  $dname =~ s/(\.[^\.]*)?$/.exh/;
168  chmod(0666,$dname) if -f $cname && !-w $dname;
169  open(D,">$dname") || die "Cannot open $dname:$!";
170  $hname =~ s/(\.[^\.]*)?$/.h/;
171  chmod(0666,$hname) if -f $cname && !-w $hname;
172  open(H,">$hname") || die "Cannot open $hname:$!";
173
174  foreach my $fh (\*C,\*D,\*H)
175  {
176   print $fh <<"END" unless $opt{'q'};
177/*
178 !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
179 This file was autogenerated by:
180 $^X $0 @orig_ARGV
181 enc2xs VERSION $VERSION
182*/
183END
184  }
185
186  if ($cname =~ /(\w+)\.xs$/)
187   {
188    print C "#include <EXTERN.h>\n";
189    print C "#include <perl.h>\n";
190    print C "#include <XSUB.h>\n";
191   }
192  print C "#include \"encode.h\"\n\n";
193
194 }
195elsif ($cname =~ /\.enc$/)
196 {
197  $doEnc = 1;
198 }
199elsif ($cname =~ /\.ucm$/)
200 {
201  $doUcm = 1;
202 }
203elsif ($cname =~ /\.pet$/)
204 {
205  $doPet = 1;
206 }
207
208my %encoding;
209my %strings;
210my $string_acc;
211my %strings_in_acc;
212
213my $saved = 0;
214my $subsave = 0;
215my $strings = 0;
216
217sub cmp_name
218{
219 if ($a =~ /^.*-(\d+)/)
220  {
221   my $an = $1;
222   if ($b =~ /^.*-(\d+)/)
223    {
224     my $r = $an <=> $1;
225     return $r if $r;
226    }
227  }
228 return $a cmp $b;
229}
230
231
232foreach my $enc (sort cmp_name @encfiles)
233 {
234  my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
235  $name = $opt{'n'} if exists $opt{'n'};
236  if (open(E,$enc))
237   {
238    if ($sfx eq 'enc')
239     {
240      compile_enc(\*E,lc($name));
241     }
242    else
243     {
244      compile_ucm(\*E,lc($name));
245     }
246   }
247  else
248   {
249    warn "Cannot open $enc for $name:$!";
250   }
251 }
252
253if ($doC)
254 {
255  print STDERR "Writing compiled form\n";
256  foreach my $name (sort cmp_name keys %encoding)
257   {
258    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
259    process($name.'_utf8',$e2u);
260    addstrings(\*C,$e2u);
261
262    process('utf8_'.$name,$u2e);
263    addstrings(\*C,$u2e);
264   }
265  outbigstring(\*C,"enctable");
266  foreach my $name (sort cmp_name keys %encoding)
267   {
268    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
269    outtable(\*C,$e2u, "enctable");
270    outtable(\*C,$u2e, "enctable");
271
272    # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
273   }
274  my $cpp = ($Config{d_cplusplus} || '') eq 'define';
275  my $exta = $cpp ? 'extern "C" ' : "static";
276  my $extb = $cpp ? 'extern "C" ' : "";
277  foreach my $enc (sort cmp_name keys %encoding)
278   {
279    # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
280    my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
281    #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
282    my $replen = 0;
283    $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
284    my $sym = "${enc}_encoding";
285    $sym =~ s/\W+/_/g;
286    my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
287        $min_el,$max_el);
288    print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n";
289    print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n";
290    print C "${extb} const encode_t $sym = \n";
291    # This is to make null encoding work -- dankogai
292    for (my $i = (scalar @info) - 1;  $i >= 0; --$i){
293    $info[$i] ||= 1;
294    }
295    # end of null tweak -- dankogai
296    print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
297   }
298
299  foreach my $enc (sort cmp_name keys %encoding)
300   {
301    my $sym = "${enc}_encoding";
302    $sym =~ s/\W+/_/g;
303    print H "extern encode_t $sym;\n";
304    print D " Encode_XSEncoding(aTHX_ &$sym);\n";
305   }
306
307  if ($cname =~ /(\w+)\.xs$/)
308   {
309    my $mod = $1;
310    print C <<'END';
311
312static void
313Encode_XSEncoding(pTHX_ encode_t *enc)
314{
315 dSP;
316 HV *stash = gv_stashpv("Encode::XS", TRUE);
317 SV *iv    = newSViv(PTR2IV(enc));
318 SV *sv    = sv_bless(newRV_noinc(iv),stash);
319 int i = 0;
320 /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's
321 constness, in the hope that perl won't mess with it. */
322 assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0);
323 SvFLAGS(iv) |= SVp_POK;
324 SvPVX(iv) = (char*) enc->name[0];
325 PUSHMARK(sp);
326 XPUSHs(sv);
327 while (enc->name[i])
328  {
329   const char *name = enc->name[i++];
330   XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
331  }
332 PUTBACK;
333 call_pv("Encode::define_encoding",G_DISCARD);
334 SvREFCNT_dec(sv);
335}
336
337END
338
339    print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
340    print C "BOOT:\n{\n";
341    print C "#include \"$dname\"\n";
342    print C "}\n";
343   }
344  # Close in void context is bad, m'kay
345  close(D) or warn "Error closing '$dname': $!";
346  close(H) or warn "Error closing '$hname': $!";
347
348  my $perc_saved    = $saved/($strings + $saved) * 100;
349  my $perc_subsaved = $subsave/($strings + $subsave) * 100;
350  printf STDERR "%d bytes in string tables\n",$strings;
351  printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
352    $saved, $perc_saved              if $saved;
353  printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
354    $subsave, $perc_subsaved         if $subsave;
355 }
356elsif ($doEnc)
357 {
358  foreach my $name (sort cmp_name keys %encoding)
359   {
360    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
361    output_enc(\*C,$name,$e2u);
362   }
363 }
364elsif ($doUcm)
365 {
366  foreach my $name (sort cmp_name keys %encoding)
367   {
368    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
369    output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
370   }
371 }
372
373# writing half meg files and then not checking to see if you just filled the
374# disk is bad, m'kay
375close(C) or die "Error closing '$cname': $!";
376
377# End of the main program.
378
379sub compile_ucm
380{
381 my ($fh,$name) = @_;
382 my $e2u = {};
383 my $u2e = {};
384 my $cs;
385 my %attr;
386 while (<$fh>)
387  {
388   s/#.*$//;
389   last if /^\s*CHARMAP\s*$/i;
390   if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
391    {
392     $attr{$1} = $2;
393    }
394  }
395 if (!defined($cs =  $attr{'code_set_name'}))
396  {
397   warn "No <code_set_name> in $name\n";
398  }
399 else
400  {
401   $name = $cs unless exists $opt{'n'};
402  }
403 my $erep;
404 my $urep;
405 my $max_el;
406 my $min_el;
407 if (exists $attr{'subchar'})
408  {
409   #my @byte;
410   #$attr{'subchar'} =~ /^\s*/cg;
411   #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
412   #$erep = join('',map(chr(hex($_)),@byte));
413   $erep = $attr{'subchar'};
414   $erep =~ s/^\s+//; $erep =~ s/\s+$//;
415  }
416 print "Reading $name ($cs)\n";
417 my $nfb = 0;
418 my $hfb = 0;
419 while (<$fh>)
420  {
421   s/#.*$//;
422   last if /^\s*END\s+CHARMAP\s*$/i;
423   next if /^\s*$/;
424   my (@uni, @byte) = ();
425   my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
426       or die "Bad line: $_";
427   while ($uni =~  m/\G<([U0-9a-fA-F\+]+)>/g){
428       push @uni, map { substr($_, 1) } split(/\+/, $1);
429   }
430   while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
431       push @byte, $1;
432   }
433   if (@uni)
434    {
435     my $uch =  join('', map { encode_U(hex($_)) } @uni );
436     my $ech = join('',map(chr(hex($_)),@byte));
437     my $el  = length($ech);
438     $max_el = $el if (!defined($max_el) || $el > $max_el);
439     $min_el = $el if (!defined($min_el) || $el < $min_el);
440     if (length($fb))
441      {
442       $fb = substr($fb,1);
443       $hfb++;
444      }
445     else
446      {
447       $nfb++;
448       $fb = '0';
449      }
450     # $fb is fallback flag
451     # 0 - round trip safe
452     # 1 - fallback for unicode -> enc
453     # 2 - skip sub-char mapping
454     # 3 - fallback enc -> unicode
455     enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
456     enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
457    }
458   else
459    {
460     warn $_;
461    }
462  }
463 if ($nfb && $hfb)
464  {
465   die "$nfb entries without fallback, $hfb entries with\n";
466  }
467 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
468}
469
470
471
472sub compile_enc
473{
474 my ($fh,$name) = @_;
475 my $e2u = {};
476 my $u2e = {};
477
478 my $type;
479 while ($type = <$fh>)
480  {
481   last if $type !~ /^\s*#/;
482  }
483 chomp($type);
484 return if $type eq 'E';
485 # Do the hash lookup once, rather than once per function call. 4% speedup.
486 my $type_func = $encode_types{$type};
487 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
488 warn "$type encoded $name\n";
489 my $rep = '';
490 # Save a defined test by setting these to defined values.
491 my $min_el = ~0; # A very big integer
492 my $max_el = 0;  # Anything must be longer than 0
493 {
494  my $v = hex($def);
495  $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
496 }
497 my $errors;
498 my $seen;
499 # use -Q to silence the seen test. Makefile.PL uses this by default.
500 $seen = {} unless $opt{Q};
501 do
502  {
503   my $line = <$fh>;
504   chomp($line);
505   my $page = hex($line);
506   my $ch = 0;
507   my $i = 16;
508   do
509    {
510     # So why is it 1% faster to leave the my here?
511     my $line = <$fh>;
512     $line =~ s/\r\n$/\n/;
513     die "$.:${line}Line should be exactly 65 characters long including
514     newline (".length($line).")" unless length ($line) == 65;
515     # Split line into groups of 4 hex digits, convert groups to ints
516     # This takes 65.35
517     # map {hex $_} $line =~ /(....)/g
518     # This takes 63.75 (2.5% less time)
519     # unpack "n*", pack "H*", $line
520     # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
521     # Doing it as while ($line =~ /(....)/g) took 74.63
522     foreach my $val (unpack "n*", pack "H*", $line)
523      {
524       next if $val == 0xFFFD;
525       my $ech = &$type_func($ch,$page);
526       if ($val || (!$ch && !$page))
527        {
528         my $el  = length($ech);
529         $max_el = $el if $el > $max_el;
530         $min_el = $el if $el < $min_el;
531         my $uch = encode_U($val);
532         if ($seen) {
533           # We're doing the test.
534           # We don't need to read this quickly, so storing it as a scalar,
535           # rather than 3 (anon array, plus the 2 scalars it holds) saves
536           # RAM and may make us faster on low RAM systems. [see __END__]
537           if (exists $seen->{$uch})
538             {
539               warn sprintf("U%04X is %02X%02X and %04X\n",
540                            $val,$page,$ch,$seen->{$uch});
541               $errors++;
542             }
543           else
544             {
545               $seen->{$uch} = $page << 8 | $ch;
546             }
547         }
548         # Passing 2 extra args each time is 3.6% slower!
549         # Even with having to add $fallback ||= 0 later
550         enter_fb0($e2u,$ech,$uch);
551         enter_fb0($u2e,$uch,$ech);
552        }
553       else
554        {
555         # No character at this position
556         # enter($e2u,$ech,undef,$e2u);
557        }
558       $ch++;
559      }
560    } while --$i;
561  } while --$pages;
562 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
563   if $min_el > $max_el;
564 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
565 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
566}
567
568# my ($a,$s,$d,$t,$fb) = @_;
569sub enter {
570  my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
571  # state we shift to after this (multibyte) input character defaults to same
572  # as current state.
573  $next ||= $current;
574  # Making sure it is defined seems to be faster than {no warnings;} in
575  # &process, or passing it in as 0 explicitly.
576  # XXX $fallback ||= 0;
577
578  # Start at the beginning and work forwards through the string to zero.
579  # effectively we are removing 1 character from the front each time
580  # but we don't actually edit the string. [this alone seems to be 14% speedup]
581  # Hence -$pos is the length of the remaining string.
582  my $pos = -length $inbytes;
583  while (1) {
584    my $byte = substr $inbytes, $pos, 1;
585    #  RAW_NEXT => 0,
586    #  RAW_IN_LEN => 1,
587    #  RAW_OUT_BYTES => 2,
588    #  RAW_FALLBACK => 3,
589    # to unicode an array would seem to be better, because the pages are dense.
590    # from unicode can be very sparse, favouring a hash.
591    # hash using the bytes (all length 1) as keys rather than ord value,
592    # as it's easier to sort these in &process.
593
594    # It's faster to always add $fallback even if it's undef, rather than
595    # choosing between 3 and 4 element array. (hence why we set it defined
596    # above)
597    my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
598    # When $pos was -1 we were at the last input character.
599    unless (++$pos) {
600      $do_now->[RAW_OUT_BYTES] = $outbytes;
601      $do_now->[RAW_NEXT] = $next;
602      return;
603    }
604    # Tail recursion. The intermediate state may not have a name yet.
605    $current = $do_now->[RAW_NEXT];
606  }
607}
608
609# This is purely for optimisation. It's just &enter hard coded for $fallback
610# of 0, using only a 3 entry array ref to save memory for every entry.
611sub enter_fb0 {
612  my ($current,$inbytes,$outbytes,$next) = @_;
613  $next ||= $current;
614
615  my $pos = -length $inbytes;
616  while (1) {
617    my $byte = substr $inbytes, $pos, 1;
618    my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
619    unless (++$pos) {
620      $do_now->[RAW_OUT_BYTES] = $outbytes;
621      $do_now->[RAW_NEXT] = $next;
622      return;
623    }
624    $current = $do_now->[RAW_NEXT];
625  }
626}
627
628sub process
629{
630  my ($name,$a) = @_;
631  $name =~ s/\W+/_/g;
632  $a->{Cname} = $name;
633  my $raw = $a->{Raw};
634  my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
635  my @ent;
636  $agg_max_in = 0;
637  foreach my $key (sort keys %$raw) {
638    #  RAW_NEXT => 0,
639    #  RAW_IN_LEN => 1,
640    #  RAW_OUT_BYTES => 2,
641    #  RAW_FALLBACK => 3,
642    my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
643    # Now we are converting from raw to aggregate, switch from 1 byte strings
644    # to numbers
645    my $b = ord $key;
646    $fallback ||= 0;
647    if ($l &&
648        # If this == fails, we're going to reset $agg_max_in below anyway.
649        $b == ++$agg_max_in &&
650        # References in numeric context give the pointer as an int.
651        $agg_next == $next &&
652        $agg_in_len == $in_len &&
653        $agg_out_len == length $out_bytes &&
654        $agg_fallback == $fallback
655        # && length($l->[AGG_OUT_BYTES]) < 16
656       ) {
657      #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
658      # we can aggregate this byte onto the end.
659      $l->[AGG_MAX_IN] = $b;
660      $l->[AGG_OUT_BYTES] .= $out_bytes;
661    } else {
662      # AGG_MIN_IN => 0,
663      # AGG_MAX_IN => 1,
664      # AGG_OUT_BYTES => 2,
665      # AGG_NEXT => 3,
666      # AGG_IN_LEN => 4,
667      # AGG_OUT_LEN => 5,
668      # AGG_FALLBACK => 6,
669      # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
670      # (only gains .6% on euc-jp  -- is it worth it?)
671      push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
672                       $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
673                       $agg_fallback = $fallback];
674    }
675    if (exists $next->{Cname}) {
676      $next->{'Forward'} = 1 if $next != $a;
677    } else {
678      process(sprintf("%s_%02x",$name,$b),$next);
679    }
680  }
681  # encengine.c rules say that last entry must be for 255
682  if ($agg_max_in < 255) {
683    push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
684  }
685  $a->{'Entries'} = \@ent;
686}
687
688
689sub addstrings
690{
691 my ($fh,$a) = @_;
692 my $name = $a->{'Cname'};
693 # String tables
694 foreach my $b (@{$a->{'Entries'}})
695  {
696   next unless $b->[AGG_OUT_LEN];
697   $strings{$b->[AGG_OUT_BYTES]} = undef;
698  }
699 if ($a->{'Forward'})
700  {
701   my $cpp = ($Config{d_cplusplus} || '') eq 'define';
702   my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static';
703   my $const = $cpp ? '' : 'const';
704   print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
705  }
706 $a->{'DoneStrings'} = 1;
707 foreach my $b (@{$a->{'Entries'}})
708  {
709   my ($s,$e,$out,$t,$end,$l) = @$b;
710   addstrings($fh,$t) unless $t->{'DoneStrings'};
711  }
712}
713
714sub outbigstring
715{
716  my ($fh,$name) = @_;
717
718  $string_acc = '';
719
720  # Make the big string in the string accumulator. Longest first, on the hope
721  # that this makes it more likely that we find the short strings later on.
722  # Not sure if it helps sorting strings of the same length lexically.
723  foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
724    my $index = index $string_acc, $s;
725    if ($index >= 0) {
726      $saved += length($s);
727      $strings_in_acc{$s} = $index;
728    } else {
729    OPTIMISER: {
730    if ($opt{'O'}) {
731      my $sublength = length $s;
732      while (--$sublength > 0) {
733        # progressively lop characters off the end, to see if the start of
734        # the new string overlaps the end of the accumulator.
735        if (substr ($string_acc, -$sublength)
736        eq substr ($s, 0, $sublength)) {
737          $subsave += $sublength;
738          $strings_in_acc{$s} = length ($string_acc) - $sublength;
739          # append the last bit on the end.
740          $string_acc .= substr ($s, $sublength);
741          last OPTIMISER;
742        }
743        # or if the end of the new string overlaps the start of the
744        # accumulator
745        next unless substr ($string_acc, 0, $sublength)
746          eq substr ($s, -$sublength);
747        # well, the last $sublength characters of the accumulator match.
748        # so as we're prepending to the accumulator, need to shift all our
749        # existing offsets forwards
750        $_ += $sublength foreach values %strings_in_acc;
751        $subsave += $sublength;
752        $strings_in_acc{$s} = 0;
753        # append the first bit on the start.
754        $string_acc = substr ($s, 0, -$sublength) . $string_acc;
755        last OPTIMISER;
756      }
757    }
758    # Optimiser (if it ran) found nothing, so just going have to tack the
759    # whole thing on the end.
760    $strings_in_acc{$s} = length $string_acc;
761    $string_acc .= $s;
762      };
763    }
764  }
765
766  $strings = length $string_acc;
767  my $cpp = ($Config{d_cplusplus} || '') eq 'define';
768  my $var = $cpp ? '' : 'static';
769  my $definition = "\n$var const U8 $name\[$strings] = { " .
770    join(',',unpack "C*",$string_acc);
771  # We have a single long line. Split it at convenient commas.
772  print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
773  print $fh substr ($definition, pos $definition), " };\n";
774}
775
776sub findstring {
777  my ($name,$s) = @_;
778  my $offset = $strings_in_acc{$s};
779  die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
780    unless defined $offset;
781  "$name + $offset";
782}
783
784sub outtable
785{
786 my ($fh,$a,$bigname) = @_;
787 my $name = $a->{'Cname'};
788 $a->{'Done'} = 1;
789 foreach my $b (@{$a->{'Entries'}})
790  {
791   my ($s,$e,$out,$t,$end,$l) = @$b;
792   outtable($fh,$t,$bigname) unless $t->{'Done'};
793  }
794 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
795 my $var = $cpp ? '' : 'static';
796 my $const = $cpp ? '' : 'const';
797 print $fh "\n$var $const encpage_t $name\[",
798   scalar(@{$a->{'Entries'}}), "] = {\n";
799 foreach my $b (@{$a->{'Entries'}})
800  {
801   my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
802   # $end |= 0x80 if $fb; # what the heck was on your mind, Nick?  -- Dan
803   print  $fh "{";
804   if ($l)
805    {
806     printf $fh findstring($bigname,$out);
807    }
808   else
809    {
810     print  $fh "0";
811    }
812   print  $fh ",",$t->{Cname};
813   printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
814  }
815 print $fh "};\n";
816}
817
818sub output_enc
819{
820 my ($fh,$name,$a) = @_;
821 die "Changed - fix me for new structure";
822 foreach my $b (sort keys %$a)
823  {
824   my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
825  }
826}
827
828sub decode_U
829{
830 my $s = shift;
831}
832
833my @uname;
834sub char_names
835{
836 my $s = do "unicore/Name.pl";
837 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
838 pos($s) = 0;
839 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
840  {
841   my $name = $3;
842   my $s = hex($1);
843   last if $s >= 0x10000;
844   my $e = length($2) ? hex($2) : $s;
845   for (my $i = $s; $i <= $e; $i++)
846    {
847     $uname[$i] = $name;
848#    print sprintf("U%04X $name\n",$i);
849    }
850  }
851}
852
853sub output_ucm_page
854{
855  my ($cmap,$a,$t,$pre) = @_;
856  # warn sprintf("Page %x\n",$pre);
857  my $raw = $t->{Raw};
858  foreach my $key (sort keys %$raw) {
859    #  RAW_NEXT => 0,
860    #  RAW_IN_LEN => 1,
861    #  RAW_OUT_BYTES => 2,
862    #  RAW_FALLBACK => 3,
863    my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
864    my $u = ord $key;
865    $fallback ||= 0;
866
867    if ($next != $a && $next != $t) {
868      output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
869    } elsif (length $out_bytes) {
870      if ($pre) {
871        $u = $pre|($u &0x3f);
872      }
873      my $s = sprintf "<U%04X> ",$u;
874      #foreach my $c (split(//,$out_bytes)) {
875      #  $s .= sprintf "\\x%02X",ord($c);
876      #}
877      # 9.5% faster changing that loop to this:
878      $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
879      $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
880      push(@$cmap,$s);
881    } else {
882      warn join(',',$u, @{$raw->{$key}},$a,$t);
883    }
884  }
885}
886
887sub output_ucm
888{
889 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
890 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
891 print $fh "<code_set_name> \"$name\"\n";
892 char_names();
893 if (defined $min_el)
894  {
895   print $fh "<mb_cur_min> $min_el\n";
896  }
897 if (defined $max_el)
898  {
899   print $fh "<mb_cur_max> $max_el\n";
900  }
901 if (defined $rep)
902  {
903   print $fh "<subchar> ";
904   foreach my $c (split(//,$rep))
905    {
906     printf $fh "\\x%02X",ord($c);
907    }
908   print $fh "\n";
909  }
910 my @cmap;
911 output_ucm_page(\@cmap,$h,$h,0);
912 print $fh "#\nCHARMAP\n";
913 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
914  {
915   print $fh $line;
916  }
917 print $fh "END CHARMAP\n";
918}
919
920use vars qw(
921    $_Enc2xs
922    $_Version
923    $_Inc
924    $_E2X
925    $_Name
926    $_TableFiles
927    $_Now
928);
929
930sub find_e2x{
931    eval { require File::Find; };
932    my (@inc, %e2x_dir);
933    for my $inc (@INC){
934    push @inc, $inc unless $inc eq '.'; #skip current dir
935    }
936    File::Find::find(
937         sub {
938         my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
939             $atime,$mtime,$ctime,$blksize,$blocks)
940             = lstat($_) or return;
941         -f _ or return;
942         if (/^.*\.e2x$/o){
943             no warnings 'once';
944             $e2x_dir{$File::Find::dir} ||= $mtime;
945         }
946         return;
947         }, @inc);
948    warn join("\n", keys %e2x_dir), "\n";
949    for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
950    $_E2X = $d;
951    # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
952    return $_E2X;
953    }
954}
955
956sub make_makefile_pl
957{
958    eval { require Encode; };
959    $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
960    # our used for variable expansion
961    $_Enc2xs = $0;
962    $_Version = $VERSION;
963    $_E2X = find_e2x();
964    $_Name = shift;
965    $_TableFiles = join(",", map {qq('$_')} @_);
966    $_Now = scalar localtime();
967
968    eval { require File::Spec; };
969    _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
970    _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"),        "$_Name.pm");
971    _print_expand(File::Spec->catfile($_E2X,"_T.e2x"),         "t/$_Name.t");
972    _print_expand(File::Spec->catfile($_E2X,"README.e2x"),     "README");
973    _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"),    "Changes");
974    exit;
975}
976
977use vars qw(
978        $_ModLines
979        $_LocalVer
980        );
981
982sub make_configlocal_pm {
983    eval { require Encode; };
984    $@ and die "Unable to require Encode: $@\n";
985    eval { require File::Spec; };
986
987    # our used for variable expantion
988    my %in_core = map { $_ => 1 } (
989        'ascii',      'iso-8859-1', 'utf8',
990        'ascii-ctrl', 'null',       'utf-8-strict'
991    );
992    my %LocalMod = ();
993    # check @enc;
994    use File::Find ();
995    my $wanted = sub{
996	-f $_ or return;
997	$File::Find::name =~ /\A\./        and return;
998	$File::Find::name =~ /\.pm\z/      or  return;
999	$File::Find::name =~ m/\bEncode\b/ or  return;
1000	my $mod = $File::Find::name;
1001	$mod =~ s/.*\bEncode\b/Encode/o;
1002	$mod =~ s/\.pm\z//o;
1003	$mod =~ s,/,::,og;
1004	warn qq{ require $mod;\n};
1005	eval qq{ require $mod; };
1006	$@ and die "Can't require $mod: $@\n";
1007	for my $enc ( Encode->encodings() ) {
1008	    no warnings;
1009	    $in_core{$enc}                   and next;
1010	    $Encode::Config::ExtModule{$enc} and next;
1011	    $LocalMod{$enc} ||= $mod;
1012	}
1013    };
1014    File::Find::find({wanted => $wanted}, @INC);
1015    $_ModLines = "";
1016    for my $enc ( sort keys %LocalMod ) {
1017        $_ModLines .=
1018          qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
1019    }
1020    warn $_ModLines;
1021    $_LocalVer = _mkversion();
1022    $_E2X      = find_e2x();
1023    $_Inc      = $INC{"Encode.pm"};
1024    $_Inc =~ s/\.pm$//o;
1025    _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
1026        File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
1027    exit;
1028}
1029
1030sub _mkversion{
1031    # v-string is now depreciated; use time() instead;
1032    #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1033    #$yyyy += 1900, $mo +=1;
1034    #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1035    return time();
1036}
1037
1038sub _print_expand{
1039    eval { require File::Basename; };
1040    $@ and die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
1041    File::Basename->import();
1042    my ($src, $dst, $clobber) = @_;
1043    if (!$clobber and -e $dst){
1044    warn "$dst exists. skipping\n";
1045    return;
1046    }
1047    warn "Generating $dst...\n";
1048    open my $in, $src or die "$src : $!";
1049    if ((my $d = dirname($dst)) ne '.'){
1050    -d $d or mkdir $d, 0755 or die  "mkdir $d : $!";
1051    }
1052    open my $out, ">$dst" or die "$!";
1053    my $asis = 0;
1054    while (<$in>){
1055    if (/^#### END_OF_HEADER/){
1056        $asis = 1; next;
1057    }
1058    s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1059    print $out $_;
1060    }
1061}
1062__END__
1063
1064=head1 NAME
1065
1066enc2xs -- Perl Encode Module Generator
1067
1068=head1 SYNOPSIS
1069
1070  enc2xs -[options]
1071  enc2xs -M ModName mapfiles...
1072  enc2xs -C
1073
1074=head1 DESCRIPTION
1075
1076F<enc2xs> builds a Perl extension for use by Encode from either
1077Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1078Besides being used internally during the build process of the Encode
1079module, you can use F<enc2xs> to add your own encoding to perl.
1080No knowledge of XS is necessary.
1081
1082=head1 Quick Guide
1083
1084If you want to know as little about Perl as possible but need to
1085add a new encoding, just read this chapter and forget the rest.
1086
1087=over 4
1088
1089=item 0.Z<>
1090
1091Have a .ucm file ready.  You can get it from somewhere or you can write
1092your own from scratch or you can grab one from the Encode distribution
1093and customize it.  For the UCM format, see the next Chapter.  In the
1094example below, I'll call my theoretical encoding myascii, defined
1095in I<my.ucm>.  C<$> is a shell prompt.
1096
1097  $ ls -F
1098  my.ucm
1099
1100=item 1.Z<>
1101
1102Issue a command as follows;
1103
1104  $ enc2xs -M My my.ucm
1105  generating Makefile.PL
1106  generating My.pm
1107  generating README
1108  generating Changes
1109
1110Now take a look at your current directory.  It should look like this.
1111
1112  $ ls -F
1113  Makefile.PL   My.pm         my.ucm        t/
1114
1115The following files were created.
1116
1117  Makefile.PL - MakeMaker script
1118  My.pm       - Encode submodule
1119  t/My.t      - test file
1120
1121=over 4
1122
1123=item 1.1.Z<>
1124
1125If you want *.ucm installed together with the modules, do as follows;
1126
1127  $ mkdir Encode
1128  $ mv *.ucm Encode
1129  $ enc2xs -M My Encode/*ucm
1130
1131=back
1132
1133=item 2.Z<>
1134
1135Edit the files generated.  You don't have to if you have no time AND no
1136intention to give it to someone else.  But it is a good idea to edit
1137the pod and to add more tests.
1138
1139=item 3.Z<>
1140
1141Now issue a command all Perl Mongers love:
1142
1143  $ perl Makefile.PL
1144  Writing Makefile for Encode::My
1145
1146=item 4.Z<>
1147
1148Now all you have to do is make.
1149
1150  $ make
1151  cp My.pm blib/lib/Encode/My.pm
1152  /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1153    -o encode_t.c -f encode_t.fnm
1154  Reading myascii (myascii)
1155  Writing compiled form
1156  128 bytes in string tables
1157  384 bytes (75%) saved spotting duplicates
1158  1 bytes (0.775%) saved using substrings
1159  ....
1160  chmod 644 blib/arch/auto/Encode/My/My.bs
1161  $
1162
1163The time it takes varies depending on how fast your machine is and
1164how large your encoding is.  Unless you are working on something big
1165like euc-tw, it won't take too long.
1166
1167=item 5.Z<>
1168
1169You can "make install" already but you should test first.
1170
1171  $ make test
1172  PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1173    -e 'use Test::Harness  qw(&runtests $verbose); \
1174    $verbose=0; runtests @ARGV;' t/*.t
1175  t/My....ok
1176  All tests successful.
1177  Files=1, Tests=2,  0 wallclock secs
1178   ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1179
1180=item 6.Z<>
1181
1182If you are content with the test result, just "make install"
1183
1184=item 7.Z<>
1185
1186If you want to add your encoding to Encode's demand-loading list
1187(so you don't have to "use Encode::YourEncoding"), run
1188
1189  enc2xs -C
1190
1191to update Encode::ConfigLocal, a module that controls local settings.
1192After that, "use Encode;" is enough to load your encodings on demand.
1193
1194=back
1195
1196=head1 The Unicode Character Map
1197
1198Encode uses the Unicode Character Map (UCM) format for source character
1199mappings.  This format is used by IBM's ICU package and was adopted
1200by Nick Ing-Simmons for use with the Encode module.  Since UCM is
1201more flexible than Tcl's Encoding Map and far more user-friendly,
1202this is the recommended format for Encode now.
1203
1204A UCM file looks like this.
1205
1206  #
1207  # Comments
1208  #
1209  <code_set_name> "US-ascii" # Required
1210  <code_set_alias> "ascii"   # Optional
1211  <mb_cur_min> 1             # Required; usually 1
1212  <mb_cur_max> 1             # Max. # of bytes/char
1213  <subchar> \x3F             # Substitution char
1214  #
1215  CHARMAP
1216  <U0000> \x00 |0 # <control>
1217  <U0001> \x01 |0 # <control>
1218  <U0002> \x02 |0 # <control>
1219  ....
1220  <U007C> \x7C |0 # VERTICAL LINE
1221  <U007D> \x7D |0 # RIGHT CURLY BRACKET
1222  <U007E> \x7E |0 # TILDE
1223  <U007F> \x7F |0 # <control>
1224  END CHARMAP
1225
1226=over 4
1227
1228=item *
1229
1230Anything that follows C<#> is treated as a comment.
1231
1232=item *
1233
1234The header section continues until a line containing the word
1235CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1236pair per line.  Strings used as values must be quoted. Barewords are
1237treated as numbers.  I<\xXX> represents a byte.
1238
1239Most of the keywords are self-explanatory. I<subchar> means
1240substitution character, not subcharacter.  When you decode a Unicode
1241sequence to this encoding but no matching character is found, the byte
1242sequence defined here will be used.  For most cases, the value here is
1243\x3F; in ASCII, this is a question mark.
1244
1245=item *
1246
1247CHARMAP starts the character map section.  Each line has a form as
1248follows:
1249
1250  <UXXXX> \xXX.. |0 # comment
1251    ^     ^      ^
1252    |     |      +- Fallback flag
1253    |     +-------- Encoded byte sequence
1254    +-------------- Unicode Character ID in hex
1255
1256The format is roughly the same as a header section except for the
1257fallback flag: | followed by 0..3.   The meaning of the possible
1258values is as follows:
1259
1260=over 4
1261
1262=item |0
1263
1264Round trip safe.  A character decoded to Unicode encodes back to the
1265same byte sequence.  Most characters have this flag.
1266
1267=item |1
1268
1269Fallback for unicode -> encoding.  When seen, enc2xs adds this
1270character for the encode map only.
1271
1272=item |2
1273
1274Skip sub-char mapping should there be no code point.
1275
1276=item |3
1277
1278Fallback for encoding -> unicode.  When seen, enc2xs adds this
1279character for the decode map only.
1280
1281=back
1282
1283=item *
1284
1285And finally, END OF CHARMAP ends the section.
1286
1287=back
1288
1289When you are manually creating a UCM file, you should copy ascii.ucm
1290or an existing encoding which is close to yours, rather than write
1291your own from scratch.
1292
1293When you do so, make sure you leave at least B<U0000> to B<U0020> as
1294is, unless your environment is EBCDIC.
1295
1296B<CAVEAT>: not all features in UCM are implemented.  For example,
1297icu:state is not used.  Because of that, you need to write a perl
1298module if you want to support algorithmical encodings, notably
1299the ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
1300L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1301
1302=head2 Coping with duplicate mappings
1303
1304When you create a map, you SHOULD make your mappings round-trip safe.
1305That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1306$data> stands for all characters that are marked as C<|0>.  Here is
1307how to make sure:
1308
1309=over 4
1310
1311=item *
1312
1313Sort your map in Unicode order.
1314
1315=item *
1316
1317When you have a duplicate entry, mark either one with '|1' or '|3'.
1318
1319=item *
1320
1321And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1322
1323=back
1324
1325Here is an example from big5-eten.
1326
1327  <U2550> \xF9\xF9 |0
1328  <U2550> \xA2\xA4 |3
1329
1330Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1331this;
1332
1333  E to U               U to E
1334  --------------------------------------
1335  \xF9\xF9 => U2550    U2550 => \xF9\xF9
1336  \xA2\xA4 => U2550
1337
1338So it is round-trip safe for \xF9\xF9.  But if the line above is upside
1339down, here is what happens.
1340
1341  E to U               U to E
1342  --------------------------------------
1343  \xA2\xA4 => U2550    U2550 => \xF9\xF9
1344  (\xF9\xF9 => U2550 is now overwritten!)
1345
1346The Encode package comes with F<ucmlint>, a crude but sufficient
1347utility to check the integrity of a UCM file.  Check under the
1348Encode/bin directory for this.
1349
1350When in doubt, you can use F<ucmsort>, yet another utility under
1351Encode/bin directory.
1352
1353=head1 Bookmarks
1354
1355=over 4
1356
1357=item *
1358
1359ICU Home Page
1360L<http://www.icu-project.org/>
1361
1362=item *
1363
1364ICU Character Mapping Tables
1365L<http://site.icu-project.org/charts/charset>
1366
1367=item *
1368
1369ICU:Conversion Data
1370L<http://www.icu-project.org/userguide/conversion-data.html>
1371
1372=back
1373
1374=head1 SEE ALSO
1375
1376L<Encode>,
1377L<perlmod>,
1378L<perlpod>
1379
1380=cut
1381
1382# -Q to disable the duplicate codepoint test
1383# -S make mapping errors fatal
1384# -q to remove comments written to output files
1385# -O to enable the (brute force) substring optimiser
1386# -o <output> to specify the output file name (else it's the first arg)
1387# -f <inlist> to give a file with a list of input files (else use the args)
1388# -n <name> to name the encoding (else use the basename of the input file.
1389
1390With %seen holding array refs:
1391
1392      865.66 real        28.80 user         8.79 sys
1393      7904  maximum resident set size
1394      1356  average shared memory size
1395     18566  average unshared data size
1396       229  average unshared stack size
1397     46080  page reclaims
1398     33373  page faults
1399
1400With %seen holding simple scalars:
1401
1402      342.16 real        27.11 user         3.54 sys
1403      8388  maximum resident set size
1404      1394  average shared memory size
1405     14969  average unshared data size
1406       236  average unshared stack size
1407     28159  page reclaims
1408      9839  page faults
1409
1410Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1411how %seen is storing things its seen. So it is pathalogically bad on a 16M
1412RAM machine, but it's going to help even on modern machines.
1413Swapping is bad, m'kay :-)
1414