1#!./perl 2BEGIN { 3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's 4 # with $ENV{PERL_CORE} set 5 # In case we need it in future... 6 require Config; import Config; 7 pop @INC if $INC[-1] eq '.'; 8} 9use strict; 10use warnings; 11use Getopt::Std; 12use Config; 13my @orig_ARGV = @ARGV; 14our $VERSION = do { my @r = (q$Revision: 2.12 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; 15 16# These may get re-ordered. 17# RAW is a do_now as inserted by &enter 18# AGG is an aggregated do_now, as built up by &process 19 20use constant { 21 RAW_NEXT => 0, 22 RAW_IN_LEN => 1, 23 RAW_OUT_BYTES => 2, 24 RAW_FALLBACK => 3, 25 26 AGG_MIN_IN => 0, 27 AGG_MAX_IN => 1, 28 AGG_OUT_BYTES => 2, 29 AGG_NEXT => 3, 30 AGG_IN_LEN => 4, 31 AGG_OUT_LEN => 5, 32 AGG_FALLBACK => 6, 33}; 34 35# (See the algorithm in encengine.c - we're building structures for it) 36 37# There are two sorts of structures. 38# "do_now" (an array, two variants of what needs storing) is whatever we need 39# to do now we've read an input byte. 40# It's housed in a "do_next" (which is how we got to it), and in turn points 41# to a "do_next" which contains all the "do_now"s for the next input byte. 42 43# There will be a "do_next" which is the start state. 44# For a single byte encoding it's the only "do_next" - each "do_now" points 45# back to it, and each "do_now" will cause bytes. There is no state. 46 47# For a multi-byte encoding where all characters in the input are the same 48# length, then there will be a tree of "do_now"->"do_next"->"do_now" 49# branching out from the start state, one step for each input byte. 50# The leaf "do_now"s will all be at the same distance from the start state, 51# only the leaf "do_now"s cause output bytes, and they in turn point back to 52# the start state. 53 54# For an encoding where there are variable length input byte sequences, you 55# will encounter a leaf "do_now" sooner for the shorter input sequences, but 56# as before the leaves will point back to the start state. 57 58# The system will cope with escape encodings (imagine them as a mostly 59# self-contained tree for each escape state, and cross links between trees 60# at the state-switching characters) but so far no input format defines these. 61 62# The system will also cope with having output "leaves" in the middle of 63# the bifurcating branches, not just at the extremities, but again no 64# input format does this yet. 65 66# There are two variants of the "do_now" structure. The first, smaller variant 67# is generated by &enter as the input file is read. There is one structure 68# for each input byte. Say we are mapping a single byte encoding to a 69# single byte encoding, with "ABCD" going "abcd". There will be 70# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...} 71 72# &process then walks the tree, building aggregate "do_now" structures for 73# adjacent bytes where possible. The aggregate is for a contiguous range of 74# bytes which each produce the same length of output, each move to the 75# same next state, and each have the same fallback flag. 76# So our 4 RAW "do_now"s above become replaced by a single structure 77# containing: 78# ["A", "D", "abcd", 1, ...] 79# ie, for an input byte $_ in "A".."D", output 1 byte, found as 80# substr ("abcd", (ord $_ - ord "A") * 1, 1) 81# which maps very nicely into pointer arithmetic in C for encengine.c 82 83sub encode_U 84{ 85 # UTF-8 encode long hand - only covers part of perl's range 86 ## my $uv = shift; 87 # chr() works in native space so convert value from table 88 # into that space before using chr(). 89 my $ch = chr(utf8::unicode_to_native($_[0])); 90 # Now get core perl to encode that the way it likes. 91 utf8::encode($ch); 92 return $ch; 93} 94 95sub encode_S 96{ 97 # encode single byte 98 ## my ($ch,$page) = @_; return chr($ch); 99 return chr $_[0]; 100} 101 102sub encode_D 103{ 104 # encode double byte MS byte first 105 ## my ($ch,$page) = @_; return chr($page).chr($ch); 106 return chr ($_[1]) . chr $_[0]; 107} 108 109sub encode_M 110{ 111 # encode Multi-byte - single for 0..255 otherwise double 112 ## my ($ch,$page) = @_; 113 ## return &encode_D if $page; 114 ## return &encode_S; 115 return chr ($_[1]) . chr $_[0] if $_[1]; 116 return chr $_[0]; 117} 118 119my %encode_types = (U => \&encode_U, 120 S => \&encode_S, 121 D => \&encode_D, 122 M => \&encode_M, 123 ); 124 125# Win32 does not expand globs on command line 126eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32'); 127 128my %opt; 129# I think these are: 130# -Q to disable the duplicate codepoint test 131# -S make mapping errors fatal 132# -q to remove comments written to output files 133# -O to enable the (brute force) substring optimiser 134# -o <output> to specify the output file name (else it's the first arg) 135# -f <inlist> to give a file with a list of input files (else use the args) 136# -n <name> to name the encoding (else use the basename of the input file. 137getopts('CM:SQqOo:f:n:',\%opt); 138 139$opt{M} and make_makefile_pl($opt{M}, @ARGV); 140$opt{C} and make_configlocal_pm($opt{C}, @ARGV); 141 142# This really should go first, else the die here causes empty (non-erroneous) 143# output files to be written. 144my @encfiles; 145if (exists $opt{'f'}) { 146 # -F is followed by name of file containing list of filenames 147 my $flist = $opt{'f'}; 148 open(FLIST,$flist) || die "Cannot open $flist:$!"; 149 chomp(@encfiles = <FLIST>); 150 close(FLIST); 151} else { 152 @encfiles = @ARGV; 153} 154 155my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV); 156chmod(0666,$cname) if -f $cname && !-w $cname; 157open(C,">$cname") || die "Cannot open $cname:$!"; 158 159my $dname = $cname; 160my $hname = $cname; 161 162my ($doC,$doEnc,$doUcm,$doPet); 163 164if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined 165 { 166 $doC = 1; 167 $dname =~ s/(\.[^\.]*)?$/.exh/; 168 chmod(0666,$dname) if -f $cname && !-w $dname; 169 open(D,">$dname") || die "Cannot open $dname:$!"; 170 $hname =~ s/(\.[^\.]*)?$/.h/; 171 chmod(0666,$hname) if -f $cname && !-w $hname; 172 open(H,">$hname") || die "Cannot open $hname:$!"; 173 174 foreach my $fh (\*C,\*D,\*H) 175 { 176 print $fh <<"END" unless $opt{'q'}; 177/* 178 !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 179 This file was autogenerated by: 180 $^X $0 @orig_ARGV 181 enc2xs VERSION $VERSION 182*/ 183END 184 } 185 186 if ($cname =~ /(\w+)\.xs$/) 187 { 188 print C "#include <EXTERN.h>\n"; 189 print C "#include <perl.h>\n"; 190 print C "#include <XSUB.h>\n"; 191 } 192 print C "#include \"encode.h\"\n\n"; 193 194 } 195elsif ($cname =~ /\.enc$/) 196 { 197 $doEnc = 1; 198 } 199elsif ($cname =~ /\.ucm$/) 200 { 201 $doUcm = 1; 202 } 203elsif ($cname =~ /\.pet$/) 204 { 205 $doPet = 1; 206 } 207 208my %encoding; 209my %strings; 210my $string_acc; 211my %strings_in_acc; 212 213my $saved = 0; 214my $subsave = 0; 215my $strings = 0; 216 217sub cmp_name 218{ 219 if ($a =~ /^.*-(\d+)/) 220 { 221 my $an = $1; 222 if ($b =~ /^.*-(\d+)/) 223 { 224 my $r = $an <=> $1; 225 return $r if $r; 226 } 227 } 228 return $a cmp $b; 229} 230 231 232foreach my $enc (sort cmp_name @encfiles) 233 { 234 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/; 235 $name = $opt{'n'} if exists $opt{'n'}; 236 if (open(E,$enc)) 237 { 238 if ($sfx eq 'enc') 239 { 240 compile_enc(\*E,lc($name)); 241 } 242 else 243 { 244 compile_ucm(\*E,lc($name)); 245 } 246 } 247 else 248 { 249 warn "Cannot open $enc for $name:$!"; 250 } 251 } 252 253if ($doC) 254 { 255 print STDERR "Writing compiled form\n"; 256 foreach my $name (sort cmp_name keys %encoding) 257 { 258 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 259 process($name.'_utf8',$e2u); 260 addstrings(\*C,$e2u); 261 262 process('utf8_'.$name,$u2e); 263 addstrings(\*C,$u2e); 264 } 265 outbigstring(\*C,"enctable"); 266 foreach my $name (sort cmp_name keys %encoding) 267 { 268 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 269 outtable(\*C,$e2u, "enctable"); 270 outtable(\*C,$u2e, "enctable"); 271 272 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep)); 273 } 274 my $cpp = ($Config{d_cplusplus} || '') eq 'define'; 275 my $exta = $cpp ? 'extern "C" ' : "static"; 276 my $extb = $cpp ? 'extern "C" ' : ""; 277 foreach my $enc (sort cmp_name keys %encoding) 278 { 279 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}}; 280 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}}; 281 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el); 282 my $replen = 0; 283 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g); 284 my $sym = "${enc}_encoding"; 285 $sym =~ s/\W+/_/g; 286 my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen, 287 $min_el,$max_el); 288 print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n"; 289 print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n"; 290 print C "${extb} const encode_t $sym = \n"; 291 # This is to make null encoding work -- dankogai 292 for (my $i = (scalar @info) - 1; $i >= 0; --$i){ 293 $info[$i] ||= 1; 294 } 295 # end of null tweak -- dankogai 296 print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n"; 297 } 298 299 foreach my $enc (sort cmp_name keys %encoding) 300 { 301 my $sym = "${enc}_encoding"; 302 $sym =~ s/\W+/_/g; 303 print H "extern encode_t $sym;\n"; 304 print D " Encode_XSEncoding(aTHX_ &$sym);\n"; 305 } 306 307 if ($cname =~ /(\w+)\.xs$/) 308 { 309 my $mod = $1; 310 print C <<'END'; 311 312static void 313Encode_XSEncoding(pTHX_ encode_t *enc) 314{ 315 dSP; 316 HV *stash = gv_stashpv("Encode::XS", TRUE); 317 SV *iv = newSViv(PTR2IV(enc)); 318 SV *sv = sv_bless(newRV_noinc(iv),stash); 319 int i = 0; 320 /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's 321 constness, in the hope that perl won't mess with it. */ 322 assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0); 323 SvFLAGS(iv) |= SVp_POK; 324 SvPVX(iv) = (char*) enc->name[0]; 325 PUSHMARK(sp); 326 XPUSHs(sv); 327 while (enc->name[i]) 328 { 329 const char *name = enc->name[i++]; 330 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); 331 } 332 PUTBACK; 333 call_pv("Encode::define_encoding",G_DISCARD); 334 SvREFCNT_dec(sv); 335} 336 337END 338 339 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n"; 340 print C "BOOT:\n{\n"; 341 print C "#include \"$dname\"\n"; 342 print C "}\n"; 343 } 344 # Close in void context is bad, m'kay 345 close(D) or warn "Error closing '$dname': $!"; 346 close(H) or warn "Error closing '$hname': $!"; 347 348 my $perc_saved = $saved/($strings + $saved) * 100; 349 my $perc_subsaved = $subsave/($strings + $subsave) * 100; 350 printf STDERR "%d bytes in string tables\n",$strings; 351 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n", 352 $saved, $perc_saved if $saved; 353 printf STDERR "%d bytes (%.3g%%) saved using substrings\n", 354 $subsave, $perc_subsaved if $subsave; 355 } 356elsif ($doEnc) 357 { 358 foreach my $name (sort cmp_name keys %encoding) 359 { 360 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 361 output_enc(\*C,$name,$e2u); 362 } 363 } 364elsif ($doUcm) 365 { 366 foreach my $name (sort cmp_name keys %encoding) 367 { 368 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 369 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el); 370 } 371 } 372 373# writing half meg files and then not checking to see if you just filled the 374# disk is bad, m'kay 375close(C) or die "Error closing '$cname': $!"; 376 377# End of the main program. 378 379sub compile_ucm 380{ 381 my ($fh,$name) = @_; 382 my $e2u = {}; 383 my $u2e = {}; 384 my $cs; 385 my %attr; 386 while (<$fh>) 387 { 388 s/#.*$//; 389 last if /^\s*CHARMAP\s*$/i; 390 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr 391 { 392 $attr{$1} = $2; 393 } 394 } 395 if (!defined($cs = $attr{'code_set_name'})) 396 { 397 warn "No <code_set_name> in $name\n"; 398 } 399 else 400 { 401 $name = $cs unless exists $opt{'n'}; 402 } 403 my $erep; 404 my $urep; 405 my $max_el; 406 my $min_el; 407 if (exists $attr{'subchar'}) 408 { 409 #my @byte; 410 #$attr{'subchar'} =~ /^\s*/cg; 411 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg; 412 #$erep = join('',map(chr(hex($_)),@byte)); 413 $erep = $attr{'subchar'}; 414 $erep =~ s/^\s+//; $erep =~ s/\s+$//; 415 } 416 print "Reading $name ($cs)\n"; 417 my $nfb = 0; 418 my $hfb = 0; 419 while (<$fh>) 420 { 421 s/#.*$//; 422 last if /^\s*END\s+CHARMAP\s*$/i; 423 next if /^\s*$/; 424 my (@uni, @byte) = (); 425 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o 426 or die "Bad line: $_"; 427 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){ 428 push @uni, map { substr($_, 1) } split(/\+/, $1); 429 } 430 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){ 431 push @byte, $1; 432 } 433 if (@uni) 434 { 435 my $uch = join('', map { encode_U(hex($_)) } @uni ); 436 my $ech = join('',map(chr(hex($_)),@byte)); 437 my $el = length($ech); 438 $max_el = $el if (!defined($max_el) || $el > $max_el); 439 $min_el = $el if (!defined($min_el) || $el < $min_el); 440 if (length($fb)) 441 { 442 $fb = substr($fb,1); 443 $hfb++; 444 } 445 else 446 { 447 $nfb++; 448 $fb = '0'; 449 } 450 # $fb is fallback flag 451 # 0 - round trip safe 452 # 1 - fallback for unicode -> enc 453 # 2 - skip sub-char mapping 454 # 3 - fallback enc -> unicode 455 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/); 456 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/); 457 } 458 else 459 { 460 warn $_; 461 } 462 } 463 if ($nfb && $hfb) 464 { 465 die "$nfb entries without fallback, $hfb entries with\n"; 466 } 467 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el]; 468} 469 470 471 472sub compile_enc 473{ 474 my ($fh,$name) = @_; 475 my $e2u = {}; 476 my $u2e = {}; 477 478 my $type; 479 while ($type = <$fh>) 480 { 481 last if $type !~ /^\s*#/; 482 } 483 chomp($type); 484 return if $type eq 'E'; 485 # Do the hash lookup once, rather than once per function call. 4% speedup. 486 my $type_func = $encode_types{$type}; 487 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>)); 488 warn "$type encoded $name\n"; 489 my $rep = ''; 490 # Save a defined test by setting these to defined values. 491 my $min_el = ~0; # A very big integer 492 my $max_el = 0; # Anything must be longer than 0 493 { 494 my $v = hex($def); 495 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe); 496 } 497 my $errors; 498 my $seen; 499 # use -Q to silence the seen test. Makefile.PL uses this by default. 500 $seen = {} unless $opt{Q}; 501 do 502 { 503 my $line = <$fh>; 504 chomp($line); 505 my $page = hex($line); 506 my $ch = 0; 507 my $i = 16; 508 do 509 { 510 # So why is it 1% faster to leave the my here? 511 my $line = <$fh>; 512 $line =~ s/\r\n$/\n/; 513 die "$.:${line}Line should be exactly 65 characters long including 514 newline (".length($line).")" unless length ($line) == 65; 515 # Split line into groups of 4 hex digits, convert groups to ints 516 # This takes 65.35 517 # map {hex $_} $line =~ /(....)/g 518 # This takes 63.75 (2.5% less time) 519 # unpack "n*", pack "H*", $line 520 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay 521 # Doing it as while ($line =~ /(....)/g) took 74.63 522 foreach my $val (unpack "n*", pack "H*", $line) 523 { 524 next if $val == 0xFFFD; 525 my $ech = &$type_func($ch,$page); 526 if ($val || (!$ch && !$page)) 527 { 528 my $el = length($ech); 529 $max_el = $el if $el > $max_el; 530 $min_el = $el if $el < $min_el; 531 my $uch = encode_U($val); 532 if ($seen) { 533 # We're doing the test. 534 # We don't need to read this quickly, so storing it as a scalar, 535 # rather than 3 (anon array, plus the 2 scalars it holds) saves 536 # RAM and may make us faster on low RAM systems. [see __END__] 537 if (exists $seen->{$uch}) 538 { 539 warn sprintf("U%04X is %02X%02X and %04X\n", 540 $val,$page,$ch,$seen->{$uch}); 541 $errors++; 542 } 543 else 544 { 545 $seen->{$uch} = $page << 8 | $ch; 546 } 547 } 548 # Passing 2 extra args each time is 3.6% slower! 549 # Even with having to add $fallback ||= 0 later 550 enter_fb0($e2u,$ech,$uch); 551 enter_fb0($u2e,$uch,$ech); 552 } 553 else 554 { 555 # No character at this position 556 # enter($e2u,$ech,undef,$e2u); 557 } 558 $ch++; 559 } 560 } while --$i; 561 } while --$pages; 562 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines" 563 if $min_el > $max_el; 564 die "$errors mapping conflicts\n" if ($errors && $opt{'S'}); 565 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el]; 566} 567 568# my ($a,$s,$d,$t,$fb) = @_; 569sub enter { 570 my ($current,$inbytes,$outbytes,$next,$fallback) = @_; 571 # state we shift to after this (multibyte) input character defaults to same 572 # as current state. 573 $next ||= $current; 574 # Making sure it is defined seems to be faster than {no warnings;} in 575 # &process, or passing it in as 0 explicitly. 576 # XXX $fallback ||= 0; 577 578 # Start at the beginning and work forwards through the string to zero. 579 # effectively we are removing 1 character from the front each time 580 # but we don't actually edit the string. [this alone seems to be 14% speedup] 581 # Hence -$pos is the length of the remaining string. 582 my $pos = -length $inbytes; 583 while (1) { 584 my $byte = substr $inbytes, $pos, 1; 585 # RAW_NEXT => 0, 586 # RAW_IN_LEN => 1, 587 # RAW_OUT_BYTES => 2, 588 # RAW_FALLBACK => 3, 589 # to unicode an array would seem to be better, because the pages are dense. 590 # from unicode can be very sparse, favouring a hash. 591 # hash using the bytes (all length 1) as keys rather than ord value, 592 # as it's easier to sort these in &process. 593 594 # It's faster to always add $fallback even if it's undef, rather than 595 # choosing between 3 and 4 element array. (hence why we set it defined 596 # above) 597 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback]; 598 # When $pos was -1 we were at the last input character. 599 unless (++$pos) { 600 $do_now->[RAW_OUT_BYTES] = $outbytes; 601 $do_now->[RAW_NEXT] = $next; 602 return; 603 } 604 # Tail recursion. The intermediate state may not have a name yet. 605 $current = $do_now->[RAW_NEXT]; 606 } 607} 608 609# This is purely for optimisation. It's just &enter hard coded for $fallback 610# of 0, using only a 3 entry array ref to save memory for every entry. 611sub enter_fb0 { 612 my ($current,$inbytes,$outbytes,$next) = @_; 613 $next ||= $current; 614 615 my $pos = -length $inbytes; 616 while (1) { 617 my $byte = substr $inbytes, $pos, 1; 618 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'']; 619 unless (++$pos) { 620 $do_now->[RAW_OUT_BYTES] = $outbytes; 621 $do_now->[RAW_NEXT] = $next; 622 return; 623 } 624 $current = $do_now->[RAW_NEXT]; 625 } 626} 627 628sub process 629{ 630 my ($name,$a) = @_; 631 $name =~ s/\W+/_/g; 632 $a->{Cname} = $name; 633 my $raw = $a->{Raw}; 634 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback); 635 my @ent; 636 $agg_max_in = 0; 637 foreach my $key (sort keys %$raw) { 638 # RAW_NEXT => 0, 639 # RAW_IN_LEN => 1, 640 # RAW_OUT_BYTES => 2, 641 # RAW_FALLBACK => 3, 642 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}}; 643 # Now we are converting from raw to aggregate, switch from 1 byte strings 644 # to numbers 645 my $b = ord $key; 646 $fallback ||= 0; 647 if ($l && 648 # If this == fails, we're going to reset $agg_max_in below anyway. 649 $b == ++$agg_max_in && 650 # References in numeric context give the pointer as an int. 651 $agg_next == $next && 652 $agg_in_len == $in_len && 653 $agg_out_len == length $out_bytes && 654 $agg_fallback == $fallback 655 # && length($l->[AGG_OUT_BYTES]) < 16 656 ) { 657 # my $i = ord($b)-ord($l->[AGG_MIN_IN]); 658 # we can aggregate this byte onto the end. 659 $l->[AGG_MAX_IN] = $b; 660 $l->[AGG_OUT_BYTES] .= $out_bytes; 661 } else { 662 # AGG_MIN_IN => 0, 663 # AGG_MAX_IN => 1, 664 # AGG_OUT_BYTES => 2, 665 # AGG_NEXT => 3, 666 # AGG_IN_LEN => 4, 667 # AGG_OUT_LEN => 5, 668 # AGG_FALLBACK => 6, 669 # Reset the last thing we saw, plus set 5 lexicals to save some derefs. 670 # (only gains .6% on euc-jp -- is it worth it?) 671 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next, 672 $agg_in_len = $in_len, $agg_out_len = length $out_bytes, 673 $agg_fallback = $fallback]; 674 } 675 if (exists $next->{Cname}) { 676 $next->{'Forward'} = 1 if $next != $a; 677 } else { 678 process(sprintf("%s_%02x",$name,$b),$next); 679 } 680 } 681 # encengine.c rules say that last entry must be for 255 682 if ($agg_max_in < 255) { 683 push @ent, [1+$agg_max_in, 255,undef,$a,0,0]; 684 } 685 $a->{'Entries'} = \@ent; 686} 687 688 689sub addstrings 690{ 691 my ($fh,$a) = @_; 692 my $name = $a->{'Cname'}; 693 # String tables 694 foreach my $b (@{$a->{'Entries'}}) 695 { 696 next unless $b->[AGG_OUT_LEN]; 697 $strings{$b->[AGG_OUT_BYTES]} = undef; 698 } 699 if ($a->{'Forward'}) 700 { 701 my $cpp = ($Config{d_cplusplus} || '') eq 'define'; 702 my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static'; 703 my $const = $cpp ? '' : 'const'; 704 print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n"; 705 } 706 $a->{'DoneStrings'} = 1; 707 foreach my $b (@{$a->{'Entries'}}) 708 { 709 my ($s,$e,$out,$t,$end,$l) = @$b; 710 addstrings($fh,$t) unless $t->{'DoneStrings'}; 711 } 712} 713 714sub outbigstring 715{ 716 my ($fh,$name) = @_; 717 718 $string_acc = ''; 719 720 # Make the big string in the string accumulator. Longest first, on the hope 721 # that this makes it more likely that we find the short strings later on. 722 # Not sure if it helps sorting strings of the same length lexically. 723 foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) { 724 my $index = index $string_acc, $s; 725 if ($index >= 0) { 726 $saved += length($s); 727 $strings_in_acc{$s} = $index; 728 } else { 729 OPTIMISER: { 730 if ($opt{'O'}) { 731 my $sublength = length $s; 732 while (--$sublength > 0) { 733 # progressively lop characters off the end, to see if the start of 734 # the new string overlaps the end of the accumulator. 735 if (substr ($string_acc, -$sublength) 736 eq substr ($s, 0, $sublength)) { 737 $subsave += $sublength; 738 $strings_in_acc{$s} = length ($string_acc) - $sublength; 739 # append the last bit on the end. 740 $string_acc .= substr ($s, $sublength); 741 last OPTIMISER; 742 } 743 # or if the end of the new string overlaps the start of the 744 # accumulator 745 next unless substr ($string_acc, 0, $sublength) 746 eq substr ($s, -$sublength); 747 # well, the last $sublength characters of the accumulator match. 748 # so as we're prepending to the accumulator, need to shift all our 749 # existing offsets forwards 750 $_ += $sublength foreach values %strings_in_acc; 751 $subsave += $sublength; 752 $strings_in_acc{$s} = 0; 753 # append the first bit on the start. 754 $string_acc = substr ($s, 0, -$sublength) . $string_acc; 755 last OPTIMISER; 756 } 757 } 758 # Optimiser (if it ran) found nothing, so just going have to tack the 759 # whole thing on the end. 760 $strings_in_acc{$s} = length $string_acc; 761 $string_acc .= $s; 762 }; 763 } 764 } 765 766 $strings = length $string_acc; 767 my $cpp = ($Config{d_cplusplus} || '') eq 'define'; 768 my $var = $cpp ? '' : 'static'; 769 my $definition = "\n$var const U8 $name\[$strings] = { " . 770 join(',',unpack "C*",$string_acc); 771 # We have a single long line. Split it at convenient commas. 772 print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs; 773 print $fh substr ($definition, pos $definition), " };\n"; 774} 775 776sub findstring { 777 my ($name,$s) = @_; 778 my $offset = $strings_in_acc{$s}; 779 die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator" 780 unless defined $offset; 781 "$name + $offset"; 782} 783 784sub outtable 785{ 786 my ($fh,$a,$bigname) = @_; 787 my $name = $a->{'Cname'}; 788 $a->{'Done'} = 1; 789 foreach my $b (@{$a->{'Entries'}}) 790 { 791 my ($s,$e,$out,$t,$end,$l) = @$b; 792 outtable($fh,$t,$bigname) unless $t->{'Done'}; 793 } 794 my $cpp = ($Config{d_cplusplus} || '') eq 'define'; 795 my $var = $cpp ? '' : 'static'; 796 my $const = $cpp ? '' : 'const'; 797 print $fh "\n$var $const encpage_t $name\[", 798 scalar(@{$a->{'Entries'}}), "] = {\n"; 799 foreach my $b (@{$a->{'Entries'}}) 800 { 801 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b; 802 # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan 803 print $fh "{"; 804 if ($l) 805 { 806 printf $fh findstring($bigname,$out); 807 } 808 else 809 { 810 print $fh "0"; 811 } 812 print $fh ",",$t->{Cname}; 813 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec; 814 } 815 print $fh "};\n"; 816} 817 818sub output_enc 819{ 820 my ($fh,$name,$a) = @_; 821 die "Changed - fix me for new structure"; 822 foreach my $b (sort keys %$a) 823 { 824 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}}; 825 } 826} 827 828sub decode_U 829{ 830 my $s = shift; 831} 832 833my @uname; 834sub char_names 835{ 836 my $s = do "unicore/Name.pl"; 837 die "char_names: unicore/Name.pl: $!\n" unless defined $s; 838 pos($s) = 0; 839 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc) 840 { 841 my $name = $3; 842 my $s = hex($1); 843 last if $s >= 0x10000; 844 my $e = length($2) ? hex($2) : $s; 845 for (my $i = $s; $i <= $e; $i++) 846 { 847 $uname[$i] = $name; 848# print sprintf("U%04X $name\n",$i); 849 } 850 } 851} 852 853sub output_ucm_page 854{ 855 my ($cmap,$a,$t,$pre) = @_; 856 # warn sprintf("Page %x\n",$pre); 857 my $raw = $t->{Raw}; 858 foreach my $key (sort keys %$raw) { 859 # RAW_NEXT => 0, 860 # RAW_IN_LEN => 1, 861 # RAW_OUT_BYTES => 2, 862 # RAW_FALLBACK => 3, 863 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}}; 864 my $u = ord $key; 865 $fallback ||= 0; 866 867 if ($next != $a && $next != $t) { 868 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF); 869 } elsif (length $out_bytes) { 870 if ($pre) { 871 $u = $pre|($u &0x3f); 872 } 873 my $s = sprintf "<U%04X> ",$u; 874 #foreach my $c (split(//,$out_bytes)) { 875 # $s .= sprintf "\\x%02X",ord($c); 876 #} 877 # 9.5% faster changing that loop to this: 878 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes; 879 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u]; 880 push(@$cmap,$s); 881 } else { 882 warn join(',',$u, @{$raw->{$key}},$a,$t); 883 } 884 } 885} 886 887sub output_ucm 888{ 889 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_; 890 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'}; 891 print $fh "<code_set_name> \"$name\"\n"; 892 char_names(); 893 if (defined $min_el) 894 { 895 print $fh "<mb_cur_min> $min_el\n"; 896 } 897 if (defined $max_el) 898 { 899 print $fh "<mb_cur_max> $max_el\n"; 900 } 901 if (defined $rep) 902 { 903 print $fh "<subchar> "; 904 foreach my $c (split(//,$rep)) 905 { 906 printf $fh "\\x%02X",ord($c); 907 } 908 print $fh "\n"; 909 } 910 my @cmap; 911 output_ucm_page(\@cmap,$h,$h,0); 912 print $fh "#\nCHARMAP\n"; 913 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap) 914 { 915 print $fh $line; 916 } 917 print $fh "END CHARMAP\n"; 918} 919 920use vars qw( 921 $_Enc2xs 922 $_Version 923 $_Inc 924 $_E2X 925 $_Name 926 $_TableFiles 927 $_Now 928); 929 930sub find_e2x{ 931 eval { require File::Find; }; 932 my (@inc, %e2x_dir); 933 for my $inc (@INC){ 934 push @inc, $inc unless $inc eq '.'; #skip current dir 935 } 936 File::Find::find( 937 sub { 938 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, 939 $atime,$mtime,$ctime,$blksize,$blocks) 940 = lstat($_) or return; 941 -f _ or return; 942 if (/^.*\.e2x$/o){ 943 no warnings 'once'; 944 $e2x_dir{$File::Find::dir} ||= $mtime; 945 } 946 return; 947 }, @inc); 948 warn join("\n", keys %e2x_dir), "\n"; 949 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){ 950 $_E2X = $d; 951 # warn "$_E2X => ", scalar localtime($e2x_dir{$d}); 952 return $_E2X; 953 } 954} 955 956sub make_makefile_pl 957{ 958 eval { require Encode; }; 959 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n"; 960 # our used for variable expansion 961 $_Enc2xs = $0; 962 $_Version = $VERSION; 963 $_E2X = find_e2x(); 964 $_Name = shift; 965 $_TableFiles = join(",", map {qq('$_')} @_); 966 $_Now = scalar localtime(); 967 968 eval { require File::Spec; }; 969 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL"); 970 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm"); 971 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t"); 972 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README"); 973 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes"); 974 exit; 975} 976 977use vars qw( 978 $_ModLines 979 $_LocalVer 980 ); 981 982sub make_configlocal_pm { 983 eval { require Encode; }; 984 $@ and die "Unable to require Encode: $@\n"; 985 eval { require File::Spec; }; 986 987 # our used for variable expantion 988 my %in_core = map { $_ => 1 } ( 989 'ascii', 'iso-8859-1', 'utf8', 990 'ascii-ctrl', 'null', 'utf-8-strict' 991 ); 992 my %LocalMod = (); 993 # check @enc; 994 use File::Find (); 995 my $wanted = sub{ 996 -f $_ or return; 997 $File::Find::name =~ /\A\./ and return; 998 $File::Find::name =~ /\.pm\z/ or return; 999 $File::Find::name =~ m/\bEncode\b/ or return; 1000 my $mod = $File::Find::name; 1001 $mod =~ s/.*\bEncode\b/Encode/o; 1002 $mod =~ s/\.pm\z//o; 1003 $mod =~ s,/,::,og; 1004 warn qq{ require $mod;\n}; 1005 eval qq{ require $mod; }; 1006 $@ and die "Can't require $mod: $@\n"; 1007 for my $enc ( Encode->encodings() ) { 1008 no warnings; 1009 $in_core{$enc} and next; 1010 $Encode::Config::ExtModule{$enc} and next; 1011 $LocalMod{$enc} ||= $mod; 1012 } 1013 }; 1014 File::Find::find({wanted => $wanted}, @INC); 1015 $_ModLines = ""; 1016 for my $enc ( sort keys %LocalMod ) { 1017 $_ModLines .= 1018 qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n); 1019 } 1020 warn $_ModLines; 1021 $_LocalVer = _mkversion(); 1022 $_E2X = find_e2x(); 1023 $_Inc = $INC{"Encode.pm"}; 1024 $_Inc =~ s/\.pm$//o; 1025 _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ), 1026 File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 ); 1027 exit; 1028} 1029 1030sub _mkversion{ 1031 # v-string is now depreciated; use time() instead; 1032 #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime(); 1033 #$yyyy += 1900, $mo +=1; 1034 #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm); 1035 return time(); 1036} 1037 1038sub _print_expand{ 1039 eval { require File::Basename; }; 1040 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n"; 1041 File::Basename->import(); 1042 my ($src, $dst, $clobber) = @_; 1043 if (!$clobber and -e $dst){ 1044 warn "$dst exists. skipping\n"; 1045 return; 1046 } 1047 warn "Generating $dst...\n"; 1048 open my $in, $src or die "$src : $!"; 1049 if ((my $d = dirname($dst)) ne '.'){ 1050 -d $d or mkdir $d, 0755 or die "mkdir $d : $!"; 1051 } 1052 open my $out, ">$dst" or die "$!"; 1053 my $asis = 0; 1054 while (<$in>){ 1055 if (/^#### END_OF_HEADER/){ 1056 $asis = 1; next; 1057 } 1058 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis; 1059 print $out $_; 1060 } 1061} 1062__END__ 1063 1064=head1 NAME 1065 1066enc2xs -- Perl Encode Module Generator 1067 1068=head1 SYNOPSIS 1069 1070 enc2xs -[options] 1071 enc2xs -M ModName mapfiles... 1072 enc2xs -C 1073 1074=head1 DESCRIPTION 1075 1076F<enc2xs> builds a Perl extension for use by Encode from either 1077Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc). 1078Besides being used internally during the build process of the Encode 1079module, you can use F<enc2xs> to add your own encoding to perl. 1080No knowledge of XS is necessary. 1081 1082=head1 Quick Guide 1083 1084If you want to know as little about Perl as possible but need to 1085add a new encoding, just read this chapter and forget the rest. 1086 1087=over 4 1088 1089=item 0.Z<> 1090 1091Have a .ucm file ready. You can get it from somewhere or you can write 1092your own from scratch or you can grab one from the Encode distribution 1093and customize it. For the UCM format, see the next Chapter. In the 1094example below, I'll call my theoretical encoding myascii, defined 1095in I<my.ucm>. C<$> is a shell prompt. 1096 1097 $ ls -F 1098 my.ucm 1099 1100=item 1.Z<> 1101 1102Issue a command as follows; 1103 1104 $ enc2xs -M My my.ucm 1105 generating Makefile.PL 1106 generating My.pm 1107 generating README 1108 generating Changes 1109 1110Now take a look at your current directory. It should look like this. 1111 1112 $ ls -F 1113 Makefile.PL My.pm my.ucm t/ 1114 1115The following files were created. 1116 1117 Makefile.PL - MakeMaker script 1118 My.pm - Encode submodule 1119 t/My.t - test file 1120 1121=over 4 1122 1123=item 1.1.Z<> 1124 1125If you want *.ucm installed together with the modules, do as follows; 1126 1127 $ mkdir Encode 1128 $ mv *.ucm Encode 1129 $ enc2xs -M My Encode/*ucm 1130 1131=back 1132 1133=item 2.Z<> 1134 1135Edit the files generated. You don't have to if you have no time AND no 1136intention to give it to someone else. But it is a good idea to edit 1137the pod and to add more tests. 1138 1139=item 3.Z<> 1140 1141Now issue a command all Perl Mongers love: 1142 1143 $ perl Makefile.PL 1144 Writing Makefile for Encode::My 1145 1146=item 4.Z<> 1147 1148Now all you have to do is make. 1149 1150 $ make 1151 cp My.pm blib/lib/Encode/My.pm 1152 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \ 1153 -o encode_t.c -f encode_t.fnm 1154 Reading myascii (myascii) 1155 Writing compiled form 1156 128 bytes in string tables 1157 384 bytes (75%) saved spotting duplicates 1158 1 bytes (0.775%) saved using substrings 1159 .... 1160 chmod 644 blib/arch/auto/Encode/My/My.bs 1161 $ 1162 1163The time it takes varies depending on how fast your machine is and 1164how large your encoding is. Unless you are working on something big 1165like euc-tw, it won't take too long. 1166 1167=item 5.Z<> 1168 1169You can "make install" already but you should test first. 1170 1171 $ make test 1172 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \ 1173 -e 'use Test::Harness qw(&runtests $verbose); \ 1174 $verbose=0; runtests @ARGV;' t/*.t 1175 t/My....ok 1176 All tests successful. 1177 Files=1, Tests=2, 0 wallclock secs 1178 ( 0.09 cusr + 0.01 csys = 0.09 CPU) 1179 1180=item 6.Z<> 1181 1182If you are content with the test result, just "make install" 1183 1184=item 7.Z<> 1185 1186If you want to add your encoding to Encode's demand-loading list 1187(so you don't have to "use Encode::YourEncoding"), run 1188 1189 enc2xs -C 1190 1191to update Encode::ConfigLocal, a module that controls local settings. 1192After that, "use Encode;" is enough to load your encodings on demand. 1193 1194=back 1195 1196=head1 The Unicode Character Map 1197 1198Encode uses the Unicode Character Map (UCM) format for source character 1199mappings. This format is used by IBM's ICU package and was adopted 1200by Nick Ing-Simmons for use with the Encode module. Since UCM is 1201more flexible than Tcl's Encoding Map and far more user-friendly, 1202this is the recommended format for Encode now. 1203 1204A UCM file looks like this. 1205 1206 # 1207 # Comments 1208 # 1209 <code_set_name> "US-ascii" # Required 1210 <code_set_alias> "ascii" # Optional 1211 <mb_cur_min> 1 # Required; usually 1 1212 <mb_cur_max> 1 # Max. # of bytes/char 1213 <subchar> \x3F # Substitution char 1214 # 1215 CHARMAP 1216 <U0000> \x00 |0 # <control> 1217 <U0001> \x01 |0 # <control> 1218 <U0002> \x02 |0 # <control> 1219 .... 1220 <U007C> \x7C |0 # VERTICAL LINE 1221 <U007D> \x7D |0 # RIGHT CURLY BRACKET 1222 <U007E> \x7E |0 # TILDE 1223 <U007F> \x7F |0 # <control> 1224 END CHARMAP 1225 1226=over 4 1227 1228=item * 1229 1230Anything that follows C<#> is treated as a comment. 1231 1232=item * 1233 1234The header section continues until a line containing the word 1235CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one 1236pair per line. Strings used as values must be quoted. Barewords are 1237treated as numbers. I<\xXX> represents a byte. 1238 1239Most of the keywords are self-explanatory. I<subchar> means 1240substitution character, not subcharacter. When you decode a Unicode 1241sequence to this encoding but no matching character is found, the byte 1242sequence defined here will be used. For most cases, the value here is 1243\x3F; in ASCII, this is a question mark. 1244 1245=item * 1246 1247CHARMAP starts the character map section. Each line has a form as 1248follows: 1249 1250 <UXXXX> \xXX.. |0 # comment 1251 ^ ^ ^ 1252 | | +- Fallback flag 1253 | +-------- Encoded byte sequence 1254 +-------------- Unicode Character ID in hex 1255 1256The format is roughly the same as a header section except for the 1257fallback flag: | followed by 0..3. The meaning of the possible 1258values is as follows: 1259 1260=over 4 1261 1262=item |0 1263 1264Round trip safe. A character decoded to Unicode encodes back to the 1265same byte sequence. Most characters have this flag. 1266 1267=item |1 1268 1269Fallback for unicode -> encoding. When seen, enc2xs adds this 1270character for the encode map only. 1271 1272=item |2 1273 1274Skip sub-char mapping should there be no code point. 1275 1276=item |3 1277 1278Fallback for encoding -> unicode. When seen, enc2xs adds this 1279character for the decode map only. 1280 1281=back 1282 1283=item * 1284 1285And finally, END OF CHARMAP ends the section. 1286 1287=back 1288 1289When you are manually creating a UCM file, you should copy ascii.ucm 1290or an existing encoding which is close to yours, rather than write 1291your own from scratch. 1292 1293When you do so, make sure you leave at least B<U0000> to B<U0020> as 1294is, unless your environment is EBCDIC. 1295 1296B<CAVEAT>: not all features in UCM are implemented. For example, 1297icu:state is not used. Because of that, you need to write a perl 1298module if you want to support algorithmical encodings, notably 1299the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>, 1300L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>. 1301 1302=head2 Coping with duplicate mappings 1303 1304When you create a map, you SHOULD make your mappings round-trip safe. 1305That is, C<encode('your-encoding', decode('your-encoding', $data)) eq 1306$data> stands for all characters that are marked as C<|0>. Here is 1307how to make sure: 1308 1309=over 4 1310 1311=item * 1312 1313Sort your map in Unicode order. 1314 1315=item * 1316 1317When you have a duplicate entry, mark either one with '|1' or '|3'. 1318 1319=item * 1320 1321And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry. 1322 1323=back 1324 1325Here is an example from big5-eten. 1326 1327 <U2550> \xF9\xF9 |0 1328 <U2550> \xA2\xA4 |3 1329 1330Internally Encoding -> Unicode and Unicode -> Encoding Map looks like 1331this; 1332 1333 E to U U to E 1334 -------------------------------------- 1335 \xF9\xF9 => U2550 U2550 => \xF9\xF9 1336 \xA2\xA4 => U2550 1337 1338So it is round-trip safe for \xF9\xF9. But if the line above is upside 1339down, here is what happens. 1340 1341 E to U U to E 1342 -------------------------------------- 1343 \xA2\xA4 => U2550 U2550 => \xF9\xF9 1344 (\xF9\xF9 => U2550 is now overwritten!) 1345 1346The Encode package comes with F<ucmlint>, a crude but sufficient 1347utility to check the integrity of a UCM file. Check under the 1348Encode/bin directory for this. 1349 1350When in doubt, you can use F<ucmsort>, yet another utility under 1351Encode/bin directory. 1352 1353=head1 Bookmarks 1354 1355=over 4 1356 1357=item * 1358 1359ICU Home Page 1360L<http://www.icu-project.org/> 1361 1362=item * 1363 1364ICU Character Mapping Tables 1365L<http://site.icu-project.org/charts/charset> 1366 1367=item * 1368 1369ICU:Conversion Data 1370L<http://www.icu-project.org/userguide/conversion-data.html> 1371 1372=back 1373 1374=head1 SEE ALSO 1375 1376L<Encode>, 1377L<perlmod>, 1378L<perlpod> 1379 1380=cut 1381 1382# -Q to disable the duplicate codepoint test 1383# -S make mapping errors fatal 1384# -q to remove comments written to output files 1385# -O to enable the (brute force) substring optimiser 1386# -o <output> to specify the output file name (else it's the first arg) 1387# -f <inlist> to give a file with a list of input files (else use the args) 1388# -n <name> to name the encoding (else use the basename of the input file. 1389 1390With %seen holding array refs: 1391 1392 865.66 real 28.80 user 8.79 sys 1393 7904 maximum resident set size 1394 1356 average shared memory size 1395 18566 average unshared data size 1396 229 average unshared stack size 1397 46080 page reclaims 1398 33373 page faults 1399 1400With %seen holding simple scalars: 1401 1402 342.16 real 27.11 user 3.54 sys 1403 8388 maximum resident set size 1404 1394 average shared memory size 1405 14969 average unshared data size 1406 236 average unshared stack size 1407 28159 page reclaims 1408 9839 page faults 1409 1410Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is 1411how %seen is storing things its seen. So it is pathalogically bad on a 16M 1412RAM machine, but it's going to help even on modern machines. 1413Swapping is bad, m'kay :-) 1414