1#!./perl 2BEGIN { 3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's 4 # with $ENV{PERL_CORE} set 5 # In case we need it in future... 6 require Config; import Config; 7 pop @INC if $INC[-1] eq '.'; 8} 9use strict; 10use warnings; 11use Getopt::Std; 12use Config; 13my @orig_ARGV = @ARGV; 14our $VERSION = do { my @r = (q$Revision: 2.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; 15 16# These may get re-ordered. 17# RAW is a do_now as inserted by &enter 18# AGG is an aggregated do_now, as built up by &process 19 20use constant { 21 RAW_NEXT => 0, 22 RAW_IN_LEN => 1, 23 RAW_OUT_BYTES => 2, 24 RAW_FALLBACK => 3, 25 26 AGG_MIN_IN => 0, 27 AGG_MAX_IN => 1, 28 AGG_OUT_BYTES => 2, 29 AGG_NEXT => 3, 30 AGG_IN_LEN => 4, 31 AGG_OUT_LEN => 5, 32 AGG_FALLBACK => 6, 33}; 34 35# (See the algorithm in encengine.c - we're building structures for it) 36 37# There are two sorts of structures. 38# "do_now" (an array, two variants of what needs storing) is whatever we need 39# to do now we've read an input byte. 40# It's housed in a "do_next" (which is how we got to it), and in turn points 41# to a "do_next" which contains all the "do_now"s for the next input byte. 42 43# There will be a "do_next" which is the start state. 44# For a single byte encoding it's the only "do_next" - each "do_now" points 45# back to it, and each "do_now" will cause bytes. There is no state. 46 47# For a multi-byte encoding where all characters in the input are the same 48# length, then there will be a tree of "do_now"->"do_next"->"do_now" 49# branching out from the start state, one step for each input byte. 50# The leaf "do_now"s will all be at the same distance from the start state, 51# only the leaf "do_now"s cause output bytes, and they in turn point back to 52# the start state. 53 54# For an encoding where there are variable length input byte sequences, you 55# will encounter a leaf "do_now" sooner for the shorter input sequences, but 56# as before the leaves will point back to the start state. 57 58# The system will cope with escape encodings (imagine them as a mostly 59# self-contained tree for each escape state, and cross links between trees 60# at the state-switching characters) but so far no input format defines these. 61 62# The system will also cope with having output "leaves" in the middle of 63# the bifurcating branches, not just at the extremities, but again no 64# input format does this yet. 65 66# There are two variants of the "do_now" structure. The first, smaller variant 67# is generated by &enter as the input file is read. There is one structure 68# for each input byte. Say we are mapping a single byte encoding to a 69# single byte encoding, with "ABCD" going "abcd". There will be 70# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...} 71 72# &process then walks the tree, building aggregate "do_now" structures for 73# adjacent bytes where possible. The aggregate is for a contiguous range of 74# bytes which each produce the same length of output, each move to the 75# same next state, and each have the same fallback flag. 76# So our 4 RAW "do_now"s above become replaced by a single structure 77# containing: 78# ["A", "D", "abcd", 1, ...] 79# ie, for an input byte $_ in "A".."D", output 1 byte, found as 80# substr ("abcd", (ord $_ - ord "A") * 1, 1) 81# which maps very nicely into pointer arithmetic in C for encengine.c 82 83sub encode_U 84{ 85 # UTF-8 encode long hand - only covers part of perl's range 86 ## my $uv = shift; 87 # chr() works in native space so convert value from table 88 # into that space before using chr(). 89 my $ch = chr(utf8::unicode_to_native($_[0])); 90 # Now get core perl to encode that the way it likes. 91 utf8::encode($ch); 92 return $ch; 93} 94 95sub encode_S 96{ 97 # encode single byte 98 ## my ($ch,$page) = @_; return chr($ch); 99 return chr $_[0]; 100} 101 102sub encode_D 103{ 104 # encode double byte MS byte first 105 ## my ($ch,$page) = @_; return chr($page).chr($ch); 106 return chr ($_[1]) . chr $_[0]; 107} 108 109sub encode_M 110{ 111 # encode Multi-byte - single for 0..255 otherwise double 112 ## my ($ch,$page) = @_; 113 ## return &encode_D if $page; 114 ## return &encode_S; 115 return chr ($_[1]) . chr $_[0] if $_[1]; 116 return chr $_[0]; 117} 118 119my %encode_types = (U => \&encode_U, 120 S => \&encode_S, 121 D => \&encode_D, 122 M => \&encode_M, 123 ); 124 125# Win32 does not expand globs on command line 126if ($^O eq 'MSWin32' and !$ENV{PERL_CORE}) { 127 eval "\@ARGV = map(glob(\$_),\@ARGV)"; 128 @ARGV = @orig_ARGV unless @ARGV; 129} 130 131my %opt; 132# I think these are: 133# -Q to disable the duplicate codepoint test 134# -S make mapping errors fatal 135# -q to remove comments written to output files 136# -O to enable the (brute force) substring optimiser 137# -o <output> to specify the output file name (else it's the first arg) 138# -f <inlist> to give a file with a list of input files (else use the args) 139# -n <name> to name the encoding (else use the basename of the input file. 140#Getopt::Long::Configure("bundling"); 141#GetOptions(\%opt, qw(C M=s S Q q O o=s f=s n=s v)); 142getopts('CM:SQqOo:f:n:v',\%opt); 143 144$opt{M} and make_makefile_pl($opt{M}, @ARGV); 145$opt{C} and make_configlocal_pm($opt{C}, @ARGV); 146$opt{v} ||= $ENV{ENC2XS_VERBOSE}; 147 148sub verbose { 149 print STDERR @_ if $opt{v}; 150} 151sub verbosef { 152 printf STDERR @_ if $opt{v}; 153} 154 155 156# ($cpp, $static, $sized) = compiler_info($declaration) 157# 158# return some information about the compiler and compile options we're using: 159# 160# $declaration - true if we're doing a declaration rather than a definition. 161# 162# $cpp - we're using C++ 163# $static - ok to declare the arrays as static 164# $sized - the array declarations should be sized 165 166sub compiler_info { 167 my ($declaration) = @_; 168 169 my $ccflags = $Config{ccflags}; 170 if (defined $Config{ccwarnflags}) { 171 $ccflags .= " " . $Config{ccwarnflags}; 172 } 173 my $compat = $ccflags =~ /\Q-Wc++-compat/; 174 my $pedantic = $ccflags =~ /-pedantic/; 175 176 my $cpp = ($Config{d_cplusplus} || '') eq 'define'; 177 178 # The encpage_t tables contain recursive and mutually recursive 179 # references. To allow them to compile under C++ and some restrictive 180 # cc options, it may be necessary to make the tables non-static/const 181 # (thus moving them from the text to the data segment) and/or not 182 # include the size in the declaration. 183 184 my $static = !( 185 $cpp 186 || ($compat && $pedantic) 187 || ($^O eq 'MacOS' && $declaration) 188 ); 189 190 # -Wc++-compat on its own warns if the array declaration is sized. 191 # The easiest way to avoid this warning is simply not to include 192 # the size in the declaration. 193 # With -pedantic as well, the issue doesn't arise because $static 194 # above becomes false. 195 my $sized = $declaration && !($compat && !$pedantic); 196 197 return ($cpp, $static, $sized); 198} 199 200 201# This really should go first, else the die here causes empty (non-erroneous) 202# output files to be written. 203my @encfiles; 204if (exists $opt{f}) { 205 # -F is followed by name of file containing list of filenames 206 my $flist = $opt{f}; 207 open(FLIST,$flist) || die "Cannot open $flist:$!"; 208 chomp(@encfiles = <FLIST>); 209 close(FLIST); 210} else { 211 @encfiles = @ARGV; 212} 213 214my $cname = $opt{o} ? $opt{o} : shift(@ARGV); 215unless ($cname) { #debuging a win32 nmake error-only. works via cmdline 216 print "\nARGV:"; 217 print "$_ " for @ARGV; 218 print "\nopt:"; 219 print " $_ => ",defined $opt{$_}?$opt{$_}:"undef","\n" for keys %opt; 220} 221chmod(0666,$cname) if -f $cname && !-w $cname; 222open(C,">", $cname) || die "Cannot open $cname:$!"; 223 224my $dname = $cname; 225my $hname = $cname; 226 227my ($doC,$doEnc,$doUcm,$doPet); 228 229if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined 230 { 231 $doC = 1; 232 $dname =~ s/(\.[^\.]*)?$/.exh/; 233 chmod(0666,$dname) if -f $cname && !-w $dname; 234 open(D,">", $dname) || die "Cannot open $dname:$!"; 235 $hname =~ s/(\.[^\.]*)?$/.h/; 236 chmod(0666,$hname) if -f $cname && !-w $hname; 237 open(H,">", $hname) || die "Cannot open $hname:$!"; 238 239 foreach my $fh (\*C,\*D,\*H) 240 { 241 print $fh <<"END" unless $opt{'q'}; 242/* 243 !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 244 This file was autogenerated by: 245 $^X $0 @orig_ARGV 246 enc2xs VERSION $VERSION 247*/ 248END 249 } 250 251 if ($cname =~ /(\w+)\.xs$/) 252 { 253 print C "#define PERL_NO_GET_CONTEXT\n"; 254 print C "#include <EXTERN.h>\n"; 255 print C "#include <perl.h>\n"; 256 print C "#include <XSUB.h>\n"; 257 } 258 print C "#include \"encode.h\"\n\n"; 259 260 } 261elsif ($cname =~ /\.enc$/) 262 { 263 $doEnc = 1; 264 } 265elsif ($cname =~ /\.ucm$/) 266 { 267 $doUcm = 1; 268 } 269elsif ($cname =~ /\.pet$/) 270 { 271 $doPet = 1; 272 } 273 274my %encoding; 275my %strings; 276my $string_acc; 277my %strings_in_acc; 278 279my $saved = 0; 280my $subsave = 0; 281my $strings = 0; 282 283sub cmp_name 284{ 285 if ($a =~ /^.*-(\d+)/) 286 { 287 my $an = $1; 288 if ($b =~ /^.*-(\d+)/) 289 { 290 my $r = $an <=> $1; 291 return $r if $r; 292 } 293 } 294 return $a cmp $b; 295} 296 297 298foreach my $enc (sort cmp_name @encfiles) 299 { 300 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/; 301 $name = $opt{'n'} if exists $opt{'n'}; 302 if (open(E,$enc)) 303 { 304 if ($sfx eq 'enc') 305 { 306 compile_enc(\*E,lc($name)); 307 } 308 else 309 { 310 compile_ucm(\*E,lc($name)); 311 } 312 } 313 else 314 { 315 warn "Cannot open $enc for $name:$!"; 316 } 317 } 318 319if ($doC) 320 { 321 verbose "Writing compiled form\n"; 322 foreach my $name (sort cmp_name keys %encoding) 323 { 324 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 325 process($name.'_utf8',$e2u); 326 addstrings(\*C,$e2u); 327 328 process('utf8_'.$name,$u2e); 329 addstrings(\*C,$u2e); 330 } 331 outbigstring(\*C,"enctable"); 332 foreach my $name (sort cmp_name keys %encoding) 333 { 334 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 335 outtable(\*C,$e2u, "enctable"); 336 outtable(\*C,$u2e, "enctable"); 337 338 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep)); 339 } 340 my ($cpp) = compiler_info(0); 341 my $ext = $cpp ? 'extern "C"' : "extern"; 342 my $exta = $cpp ? 'extern "C"' : "static"; 343 my $extb = $cpp ? 'extern "C"' : ""; 344 foreach my $enc (sort cmp_name keys %encoding) 345 { 346 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}}; 347 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}}; 348 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el); 349 my $replen = 0; 350 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g); 351 my $sym = "${enc}_encoding"; 352 $sym =~ s/\W+/_/g; 353 my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen, 354 $min_el,$max_el); 355 print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n"; 356 print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n"; 357 print C "${extb} const encode_t $sym = \n"; 358 # This is to make null encoding work -- dankogai 359 for (my $i = (scalar @info) - 1; $i >= 0; --$i){ 360 $info[$i] ||= 1; 361 } 362 # end of null tweak -- dankogai 363 print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n"; 364 } 365 366 foreach my $enc (sort cmp_name keys %encoding) 367 { 368 my $sym = "${enc}_encoding"; 369 $sym =~ s/\W+/_/g; 370 print H "${ext} encode_t $sym;\n"; 371 print D " Encode_XSEncoding(aTHX_ &$sym);\n"; 372 } 373 374 if ($cname =~ /(\w+)\.xs$/) 375 { 376 my $mod = $1; 377 print C <<'END'; 378 379static void 380Encode_XSEncoding(pTHX_ encode_t *enc) 381{ 382 dSP; 383 HV *stash = gv_stashpv("Encode::XS", TRUE); 384 SV *iv = newSViv(PTR2IV(enc)); 385 SV *sv = sv_bless(newRV_noinc(iv),stash); 386 int i = 0; 387 /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's 388 constness, in the hope that perl won't mess with it. */ 389 assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0); 390 SvFLAGS(iv) |= SVp_POK; 391 SvPVX(iv) = (char*) enc->name[0]; 392 PUSHMARK(sp); 393 XPUSHs(sv); 394 while (enc->name[i]) 395 { 396 const char *name = enc->name[i++]; 397 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); 398 } 399 PUTBACK; 400 call_pv("Encode::define_encoding",G_DISCARD); 401 SvREFCNT_dec(sv); 402} 403 404END 405 406 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n"; 407 print C "BOOT:\n{\n"; 408 print C "#include \"$dname\"\n"; 409 print C "}\n"; 410 } 411 # Close in void context is bad, m'kay 412 close(D) or warn "Error closing '$dname': $!"; 413 close(H) or warn "Error closing '$hname': $!"; 414 415 my $perc_saved = $saved/($strings + $saved) * 100; 416 my $perc_subsaved = $subsave/($strings + $subsave) * 100; 417 verbosef "%d bytes in string tables\n",$strings; 418 verbosef "%d bytes (%.3g%%) saved spotting duplicates\n", 419 $saved, $perc_saved if $saved; 420 verbosef "%d bytes (%.3g%%) saved using substrings\n", 421 $subsave, $perc_subsaved if $subsave; 422 } 423elsif ($doEnc) 424 { 425 foreach my $name (sort cmp_name keys %encoding) 426 { 427 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 428 output_enc(\*C,$name,$e2u); 429 } 430 } 431elsif ($doUcm) 432 { 433 foreach my $name (sort cmp_name keys %encoding) 434 { 435 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 436 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el); 437 } 438 } 439 440# writing half meg files and then not checking to see if you just filled the 441# disk is bad, m'kay 442close(C) or die "Error closing '$cname': $!"; 443 444# End of the main program. 445 446sub compile_ucm 447{ 448 my ($fh,$name) = @_; 449 my $e2u = {}; 450 my $u2e = {}; 451 my $cs; 452 my %attr; 453 while (<$fh>) 454 { 455 s/#.*$//; 456 last if /^\s*CHARMAP\s*$/i; 457 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr 458 { 459 $attr{$1} = $2; 460 } 461 } 462 if (!defined($cs = $attr{'code_set_name'})) 463 { 464 warn "No <code_set_name> in $name\n"; 465 } 466 else 467 { 468 $name = $cs unless exists $opt{'n'}; 469 } 470 my $erep; 471 my $urep; 472 my $max_el; 473 my $min_el; 474 if (exists $attr{'subchar'}) 475 { 476 #my @byte; 477 #$attr{'subchar'} =~ /^\s*/cg; 478 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg; 479 #$erep = join('',map(chr(hex($_)),@byte)); 480 $erep = $attr{'subchar'}; 481 $erep =~ s/^\s+//; $erep =~ s/\s+$//; 482 } 483 print "Reading $name ($cs)\n" 484 unless defined $ENV{MAKEFLAGS} 485 and $ENV{MAKEFLAGS} =~ /\b(s|silent|quiet)\b/; 486 my $nfb = 0; 487 my $hfb = 0; 488 while (<$fh>) 489 { 490 s/#.*$//; 491 last if /^\s*END\s+CHARMAP\s*$/i; 492 next if /^\s*$/; 493 my (@uni, @byte) = (); 494 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o 495 or die "Bad line: $_"; 496 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){ 497 push @uni, map { substr($_, 1) } split(/\+/, $1); 498 } 499 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){ 500 push @byte, $1; 501 } 502 if (@uni) 503 { 504 my $uch = join('', map { encode_U(hex($_)) } @uni ); 505 my $ech = join('',map(chr(hex($_)),@byte)); 506 my $el = length($ech); 507 $max_el = $el if (!defined($max_el) || $el > $max_el); 508 $min_el = $el if (!defined($min_el) || $el < $min_el); 509 if (length($fb)) 510 { 511 $fb = substr($fb,1); 512 $hfb++; 513 } 514 else 515 { 516 $nfb++; 517 $fb = '0'; 518 } 519 # $fb is fallback flag 520 # 0 - round trip safe 521 # 1 - fallback for unicode -> enc 522 # 2 - skip sub-char mapping 523 # 3 - fallback enc -> unicode 524 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/); 525 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/); 526 } 527 else 528 { 529 warn $_; 530 } 531 } 532 if ($nfb && $hfb) 533 { 534 die "$nfb entries without fallback, $hfb entries with\n"; 535 } 536 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el]; 537} 538 539 540 541sub compile_enc 542{ 543 my ($fh,$name) = @_; 544 my $e2u = {}; 545 my $u2e = {}; 546 547 my $type; 548 while ($type = <$fh>) 549 { 550 last if $type !~ /^\s*#/; 551 } 552 chomp($type); 553 return if $type eq 'E'; 554 # Do the hash lookup once, rather than once per function call. 4% speedup. 555 my $type_func = $encode_types{$type}; 556 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>)); 557 warn "$type encoded $name\n"; 558 my $rep = ''; 559 # Save a defined test by setting these to defined values. 560 my $min_el = ~0; # A very big integer 561 my $max_el = 0; # Anything must be longer than 0 562 { 563 my $v = hex($def); 564 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe); 565 } 566 my $errors; 567 my $seen; 568 # use -Q to silence the seen test. Makefile.PL uses this by default. 569 $seen = {} unless $opt{Q}; 570 do 571 { 572 my $line = <$fh>; 573 chomp($line); 574 my $page = hex($line); 575 my $ch = 0; 576 my $i = 16; 577 do 578 { 579 # So why is it 1% faster to leave the my here? 580 my $line = <$fh>; 581 $line =~ s/\r\n$/\n/; 582 die "$.:${line}Line should be exactly 65 characters long including 583 newline (".length($line).")" unless length ($line) == 65; 584 # Split line into groups of 4 hex digits, convert groups to ints 585 # This takes 65.35 586 # map {hex $_} $line =~ /(....)/g 587 # This takes 63.75 (2.5% less time) 588 # unpack "n*", pack "H*", $line 589 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay 590 # Doing it as while ($line =~ /(....)/g) took 74.63 591 foreach my $val (unpack "n*", pack "H*", $line) 592 { 593 next if $val == 0xFFFD; 594 my $ech = &$type_func($ch,$page); 595 if ($val || (!$ch && !$page)) 596 { 597 my $el = length($ech); 598 $max_el = $el if $el > $max_el; 599 $min_el = $el if $el < $min_el; 600 my $uch = encode_U($val); 601 if ($seen) { 602 # We're doing the test. 603 # We don't need to read this quickly, so storing it as a scalar, 604 # rather than 3 (anon array, plus the 2 scalars it holds) saves 605 # RAM and may make us faster on low RAM systems. [see __END__] 606 if (exists $seen->{$uch}) 607 { 608 warn sprintf("U%04X is %02X%02X and %04X\n", 609 $val,$page,$ch,$seen->{$uch}); 610 $errors++; 611 } 612 else 613 { 614 $seen->{$uch} = $page << 8 | $ch; 615 } 616 } 617 # Passing 2 extra args each time is 3.6% slower! 618 # Even with having to add $fallback ||= 0 later 619 enter_fb0($e2u,$ech,$uch); 620 enter_fb0($u2e,$uch,$ech); 621 } 622 else 623 { 624 # No character at this position 625 # enter($e2u,$ech,undef,$e2u); 626 } 627 $ch++; 628 } 629 } while --$i; 630 } while --$pages; 631 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines" 632 if $min_el > $max_el; 633 die "$errors mapping conflicts\n" if ($errors && $opt{'S'}); 634 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el]; 635} 636 637# my ($a,$s,$d,$t,$fb) = @_; 638sub enter { 639 my ($current,$inbytes,$outbytes,$next,$fallback) = @_; 640 # state we shift to after this (multibyte) input character defaults to same 641 # as current state. 642 $next ||= $current; 643 # Making sure it is defined seems to be faster than {no warnings;} in 644 # &process, or passing it in as 0 explicitly. 645 # XXX $fallback ||= 0; 646 647 # Start at the beginning and work forwards through the string to zero. 648 # effectively we are removing 1 character from the front each time 649 # but we don't actually edit the string. [this alone seems to be 14% speedup] 650 # Hence -$pos is the length of the remaining string. 651 my $pos = -length $inbytes; 652 while (1) { 653 my $byte = substr $inbytes, $pos, 1; 654 # RAW_NEXT => 0, 655 # RAW_IN_LEN => 1, 656 # RAW_OUT_BYTES => 2, 657 # RAW_FALLBACK => 3, 658 # to unicode an array would seem to be better, because the pages are dense. 659 # from unicode can be very sparse, favouring a hash. 660 # hash using the bytes (all length 1) as keys rather than ord value, 661 # as it's easier to sort these in &process. 662 663 # It's faster to always add $fallback even if it's undef, rather than 664 # choosing between 3 and 4 element array. (hence why we set it defined 665 # above) 666 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback]; 667 # When $pos was -1 we were at the last input character. 668 unless (++$pos) { 669 $do_now->[RAW_OUT_BYTES] = $outbytes; 670 $do_now->[RAW_NEXT] = $next; 671 return; 672 } 673 # Tail recursion. The intermediate state may not have a name yet. 674 $current = $do_now->[RAW_NEXT]; 675 } 676} 677 678# This is purely for optimisation. It's just &enter hard coded for $fallback 679# of 0, using only a 3 entry array ref to save memory for every entry. 680sub enter_fb0 { 681 my ($current,$inbytes,$outbytes,$next) = @_; 682 $next ||= $current; 683 684 my $pos = -length $inbytes; 685 while (1) { 686 my $byte = substr $inbytes, $pos, 1; 687 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'']; 688 unless (++$pos) { 689 $do_now->[RAW_OUT_BYTES] = $outbytes; 690 $do_now->[RAW_NEXT] = $next; 691 return; 692 } 693 $current = $do_now->[RAW_NEXT]; 694 } 695} 696 697sub process 698{ 699 my ($name,$a) = @_; 700 $name =~ s/\W+/_/g; 701 $a->{Cname} = $name; 702 my $raw = $a->{Raw}; 703 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback); 704 my @ent; 705 $agg_max_in = 0; 706 foreach my $key (sort keys %$raw) { 707 # RAW_NEXT => 0, 708 # RAW_IN_LEN => 1, 709 # RAW_OUT_BYTES => 2, 710 # RAW_FALLBACK => 3, 711 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}}; 712 # Now we are converting from raw to aggregate, switch from 1 byte strings 713 # to numbers 714 my $b = ord $key; 715 $fallback ||= 0; 716 if ($l && 717 # If this == fails, we're going to reset $agg_max_in below anyway. 718 $b == ++$agg_max_in && 719 # References in numeric context give the pointer as an int. 720 $agg_next == $next && 721 $agg_in_len == $in_len && 722 $agg_out_len == length $out_bytes && 723 $agg_fallback == $fallback 724 # && length($l->[AGG_OUT_BYTES]) < 16 725 ) { 726 # my $i = ord($b)-ord($l->[AGG_MIN_IN]); 727 # we can aggregate this byte onto the end. 728 $l->[AGG_MAX_IN] = $b; 729 $l->[AGG_OUT_BYTES] .= $out_bytes; 730 } else { 731 # AGG_MIN_IN => 0, 732 # AGG_MAX_IN => 1, 733 # AGG_OUT_BYTES => 2, 734 # AGG_NEXT => 3, 735 # AGG_IN_LEN => 4, 736 # AGG_OUT_LEN => 5, 737 # AGG_FALLBACK => 6, 738 # Reset the last thing we saw, plus set 5 lexicals to save some derefs. 739 # (only gains .6% on euc-jp -- is it worth it?) 740 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next, 741 $agg_in_len = $in_len, $agg_out_len = length $out_bytes, 742 $agg_fallback = $fallback]; 743 } 744 if (exists $next->{Cname}) { 745 $next->{'Forward'} = 1 if $next != $a; 746 } else { 747 process(sprintf("%s_%02x",$name,$b),$next); 748 } 749 } 750 # encengine.c rules say that last entry must be for 255 751 if ($agg_max_in < 255) { 752 push @ent, [1+$agg_max_in, 255,undef,$a,0,0]; 753 } 754 $a->{'Entries'} = \@ent; 755} 756 757 758sub addstrings 759{ 760 my ($fh,$a) = @_; 761 my $name = $a->{'Cname'}; 762 # String tables 763 foreach my $b (@{$a->{'Entries'}}) 764 { 765 next unless $b->[AGG_OUT_LEN]; 766 $strings{$b->[AGG_OUT_BYTES]} = undef; 767 } 768 if ($a->{'Forward'}) 769 { 770 my ($cpp, $static, $sized) = compiler_info(1); 771 my $count = $sized ? scalar(@{$a->{'Entries'}}) : ''; 772 if ($static) { 773 # we cannot ask Config for d_plusplus since we can override CC=g++-6 on the cmdline 774 print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6 775 print $fh "extern encpage_t $name\[$count];\n"; 776 print $fh "#else\n"; 777 print $fh "static const encpage_t $name\[$count];\n"; 778 print $fh "#endif\n"; 779 } else { 780 print $fh "extern encpage_t $name\[$count];\n"; 781 } 782 } 783 $a->{'DoneStrings'} = 1; 784 foreach my $b (@{$a->{'Entries'}}) 785 { 786 my ($s,$e,$out,$t,$end,$l) = @$b; 787 addstrings($fh,$t) unless $t->{'DoneStrings'}; 788 } 789} 790 791sub outbigstring 792{ 793 my ($fh,$name) = @_; 794 795 $string_acc = ''; 796 797 # Make the big string in the string accumulator. Longest first, on the hope 798 # that this makes it more likely that we find the short strings later on. 799 # Not sure if it helps sorting strings of the same length lexically. 800 foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) { 801 my $index = index $string_acc, $s; 802 if ($index >= 0) { 803 $saved += length($s); 804 $strings_in_acc{$s} = $index; 805 } else { 806 OPTIMISER: { 807 if ($opt{'O'}) { 808 my $sublength = length $s; 809 while (--$sublength > 0) { 810 # progressively lop characters off the end, to see if the start of 811 # the new string overlaps the end of the accumulator. 812 if (substr ($string_acc, -$sublength) 813 eq substr ($s, 0, $sublength)) { 814 $subsave += $sublength; 815 $strings_in_acc{$s} = length ($string_acc) - $sublength; 816 # append the last bit on the end. 817 $string_acc .= substr ($s, $sublength); 818 last OPTIMISER; 819 } 820 # or if the end of the new string overlaps the start of the 821 # accumulator 822 next unless substr ($string_acc, 0, $sublength) 823 eq substr ($s, -$sublength); 824 # well, the last $sublength characters of the accumulator match. 825 # so as we're prepending to the accumulator, need to shift all our 826 # existing offsets forwards 827 $_ += $sublength foreach values %strings_in_acc; 828 $subsave += $sublength; 829 $strings_in_acc{$s} = 0; 830 # append the first bit on the start. 831 $string_acc = substr ($s, 0, -$sublength) . $string_acc; 832 last OPTIMISER; 833 } 834 } 835 # Optimiser (if it ran) found nothing, so just going have to tack the 836 # whole thing on the end. 837 $strings_in_acc{$s} = length $string_acc; 838 $string_acc .= $s; 839 }; 840 } 841 } 842 843 $strings = length $string_acc; 844 my ($cpp) = compiler_info(0); 845 my $var = $cpp ? '' : 'static'; 846 my $definition = "\n$var const U8 $name\[$strings] = { " . 847 join(',',unpack "C*",$string_acc); 848 # We have a single long line. Split it at convenient commas. 849 print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs; 850 print $fh substr ($definition, pos $definition), " };\n"; 851} 852 853sub findstring { 854 my ($name,$s) = @_; 855 my $offset = $strings_in_acc{$s}; 856 die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator" 857 unless defined $offset; 858 "$name + $offset"; 859} 860 861sub outtable 862{ 863 my ($fh,$a,$bigname) = @_; 864 my $name = $a->{'Cname'}; 865 $a->{'Done'} = 1; 866 foreach my $b (@{$a->{'Entries'}}) 867 { 868 my ($s,$e,$out,$t,$end,$l) = @$b; 869 outtable($fh,$t,$bigname) unless $t->{'Done'}; 870 } 871 my ($cpp, $static) = compiler_info(0); 872 my $count = scalar(@{$a->{'Entries'}}); 873 if ($static) { 874 print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6 875 print $fh "encpage_t $name\[$count] = {\n"; 876 print $fh "#else\n"; 877 print $fh "static const encpage_t $name\[$count] = {\n"; 878 print $fh "#endif\n"; 879 } else { 880 print $fh "\nencpage_t $name\[$count] = {\n"; 881 } 882 foreach my $b (@{$a->{'Entries'}}) 883 { 884 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b; 885 # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan 886 print $fh "{"; 887 if ($l) 888 { 889 printf $fh findstring($bigname,$out); 890 } 891 else 892 { 893 print $fh "0"; 894 } 895 print $fh ",",$t->{Cname}; 896 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec; 897 } 898 print $fh "};\n"; 899} 900 901sub output_enc 902{ 903 my ($fh,$name,$a) = @_; 904 die "Changed - fix me for new structure"; 905 foreach my $b (sort keys %$a) 906 { 907 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}}; 908 } 909} 910 911sub decode_U 912{ 913 my $s = shift; 914} 915 916my @uname; 917sub char_names 918{ 919 my $s = do "unicore/Name.pl"; 920 die "char_names: unicore/Name.pl: $!\n" unless defined $s; 921 pos($s) = 0; 922 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc) 923 { 924 my $name = $3; 925 my $s = hex($1); 926 last if $s >= 0x10000; 927 my $e = length($2) ? hex($2) : $s; 928 for (my $i = $s; $i <= $e; $i++) 929 { 930 $uname[$i] = $name; 931# print sprintf("U%04X $name\n",$i); 932 } 933 } 934} 935 936sub output_ucm_page 937{ 938 my ($cmap,$a,$t,$pre) = @_; 939 # warn sprintf("Page %x\n",$pre); 940 my $raw = $t->{Raw}; 941 foreach my $key (sort keys %$raw) { 942 # RAW_NEXT => 0, 943 # RAW_IN_LEN => 1, 944 # RAW_OUT_BYTES => 2, 945 # RAW_FALLBACK => 3, 946 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}}; 947 my $u = ord $key; 948 $fallback ||= 0; 949 950 if ($next != $a && $next != $t) { 951 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF); 952 } elsif (length $out_bytes) { 953 if ($pre) { 954 $u = $pre|($u &0x3f); 955 } 956 my $s = sprintf "<U%04X> ",$u; 957 #foreach my $c (split(//,$out_bytes)) { 958 # $s .= sprintf "\\x%02X",ord($c); 959 #} 960 # 9.5% faster changing that loop to this: 961 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes; 962 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u]; 963 push(@$cmap,$s); 964 } else { 965 warn join(',',$u, @{$raw->{$key}},$a,$t); 966 } 967 } 968} 969 970sub output_ucm 971{ 972 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_; 973 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'}; 974 print $fh "<code_set_name> \"$name\"\n"; 975 char_names(); 976 if (defined $min_el) 977 { 978 print $fh "<mb_cur_min> $min_el\n"; 979 } 980 if (defined $max_el) 981 { 982 print $fh "<mb_cur_max> $max_el\n"; 983 } 984 if (defined $rep) 985 { 986 print $fh "<subchar> "; 987 foreach my $c (split(//,$rep)) 988 { 989 printf $fh "\\x%02X",ord($c); 990 } 991 print $fh "\n"; 992 } 993 my @cmap; 994 output_ucm_page(\@cmap,$h,$h,0); 995 print $fh "#\nCHARMAP\n"; 996 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap) 997 { 998 print $fh $line; 999 } 1000 print $fh "END CHARMAP\n"; 1001} 1002 1003use vars qw( 1004 $_Enc2xs 1005 $_Version 1006 $_Inc 1007 $_E2X 1008 $_Name 1009 $_TableFiles 1010 $_Now 1011); 1012 1013sub find_e2x{ 1014 eval { require File::Find; }; 1015 my (@inc, %e2x_dir); 1016 for my $inc (@INC){ 1017 push @inc, $inc unless $inc eq '.'; #skip current dir 1018 } 1019 File::Find::find( 1020 sub { 1021 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, 1022 $atime,$mtime,$ctime,$blksize,$blocks) 1023 = lstat($_) or return; 1024 -f _ or return; 1025 if (/^.*\.e2x$/o){ 1026 no warnings 'once'; 1027 $e2x_dir{$File::Find::dir} ||= $mtime; 1028 } 1029 return; 1030 }, @inc); 1031 warn join("\n", keys %e2x_dir), "\n"; 1032 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){ 1033 $_E2X = $d; 1034 # warn "$_E2X => ", scalar localtime($e2x_dir{$d}); 1035 return $_E2X; 1036 } 1037} 1038 1039sub make_makefile_pl 1040{ 1041 eval { require Encode } or die "You need to install Encode to use enc2xs -M\nerror: $@\n"; 1042 # our used for variable expansion 1043 $_Enc2xs = $0; 1044 $_Version = $VERSION; 1045 $_E2X = find_e2x(); 1046 $_Name = shift; 1047 $_TableFiles = join(",", map {qq('$_')} @_); 1048 $_Now = scalar localtime(); 1049 1050 eval { require File::Spec; }; 1051 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL"); 1052 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm"); 1053 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t"); 1054 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README"); 1055 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes"); 1056 exit; 1057} 1058 1059use vars qw( 1060 $_ModLines 1061 $_LocalVer 1062 ); 1063 1064sub make_configlocal_pm { 1065 eval { require Encode } or die "Unable to require Encode: $@\n"; 1066 eval { require File::Spec; }; 1067 1068 # our used for variable expantion 1069 my %in_core = map { $_ => 1 } ( 1070 'ascii', 'iso-8859-1', 'utf8', 1071 'ascii-ctrl', 'null', 'utf-8-strict' 1072 ); 1073 my %LocalMod = (); 1074 # check @enc; 1075 use File::Find (); 1076 my $wanted = sub{ 1077 -f $_ or return; 1078 $File::Find::name =~ /\A\./ and return; 1079 $File::Find::name =~ /\.pm\z/ or return; 1080 $File::Find::name =~ m/\bEncode\b/ or return; 1081 my $mod = $File::Find::name; 1082 $mod =~ s/.*\bEncode\b/Encode/o; 1083 $mod =~ s/\.pm\z//o; 1084 $mod =~ s,/,::,og; 1085 eval qq{ require $mod; } or return; 1086 warn qq{ require $mod;\n}; 1087 for my $enc ( Encode->encodings() ) { 1088 no warnings; 1089 $in_core{$enc} and next; 1090 $Encode::Config::ExtModule{$enc} and next; 1091 $LocalMod{$enc} ||= $mod; 1092 } 1093 }; 1094 File::Find::find({wanted => $wanted}, @INC); 1095 $_ModLines = ""; 1096 for my $enc ( sort keys %LocalMod ) { 1097 $_ModLines .= 1098 qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n); 1099 } 1100 warn $_ModLines if $_ModLines; 1101 $_LocalVer = _mkversion(); 1102 $_E2X = find_e2x(); 1103 $_Inc = $INC{"Encode.pm"}; 1104 $_Inc =~ s/\.pm$//o; 1105 _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ), 1106 File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 ); 1107 exit; 1108} 1109 1110sub _mkversion{ 1111 # v-string is now depreciated; use time() instead; 1112 #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime(); 1113 #$yyyy += 1900, $mo +=1; 1114 #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm); 1115 return time(); 1116} 1117 1118sub _print_expand{ 1119 eval { require File::Basename } or die "File::Basename needed. Are you on miniperl?;\nerror: $@\n"; 1120 File::Basename->import(); 1121 my ($src, $dst, $clobber) = @_; 1122 if (!$clobber and -e $dst){ 1123 warn "$dst exists. skipping\n"; 1124 return; 1125 } 1126 warn "Generating $dst...\n"; 1127 open my $in, $src or die "$src : $!"; 1128 if ((my $d = dirname($dst)) ne '.'){ 1129 -d $d or mkdir $d, 0755 or die "mkdir $d : $!"; 1130 } 1131 open my $out, ">", $dst or die "$!"; 1132 my $asis = 0; 1133 while (<$in>){ 1134 if (/^#### END_OF_HEADER/){ 1135 $asis = 1; next; 1136 } 1137 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis; 1138 print $out $_; 1139 } 1140} 1141__END__ 1142 1143=head1 NAME 1144 1145enc2xs -- Perl Encode Module Generator 1146 1147=head1 SYNOPSIS 1148 1149 enc2xs -[options] 1150 enc2xs -M ModName mapfiles... 1151 enc2xs -C 1152 1153=head1 DESCRIPTION 1154 1155F<enc2xs> builds a Perl extension for use by Encode from either 1156Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc). 1157Besides being used internally during the build process of the Encode 1158module, you can use F<enc2xs> to add your own encoding to perl. 1159No knowledge of XS is necessary. 1160 1161=head1 Quick Guide 1162 1163If you want to know as little about Perl as possible but need to 1164add a new encoding, just read this chapter and forget the rest. 1165 1166=over 4 1167 1168=item 0.Z<> 1169 1170Have a .ucm file ready. You can get it from somewhere or you can write 1171your own from scratch or you can grab one from the Encode distribution 1172and customize it. For the UCM format, see the next Chapter. In the 1173example below, I'll call my theoretical encoding myascii, defined 1174in I<my.ucm>. C<$> is a shell prompt. 1175 1176 $ ls -F 1177 my.ucm 1178 1179=item 1.Z<> 1180 1181Issue a command as follows; 1182 1183 $ enc2xs -M My my.ucm 1184 generating Makefile.PL 1185 generating My.pm 1186 generating README 1187 generating Changes 1188 1189Now take a look at your current directory. It should look like this. 1190 1191 $ ls -F 1192 Makefile.PL My.pm my.ucm t/ 1193 1194The following files were created. 1195 1196 Makefile.PL - MakeMaker script 1197 My.pm - Encode submodule 1198 t/My.t - test file 1199 1200=over 4 1201 1202=item 1.1.Z<> 1203 1204If you want *.ucm installed together with the modules, do as follows; 1205 1206 $ mkdir Encode 1207 $ mv *.ucm Encode 1208 $ enc2xs -M My Encode/*ucm 1209 1210=back 1211 1212=item 2.Z<> 1213 1214Edit the files generated. You don't have to if you have no time AND no 1215intention to give it to someone else. But it is a good idea to edit 1216the pod and to add more tests. 1217 1218=item 3.Z<> 1219 1220Now issue a command all Perl Mongers love: 1221 1222 $ perl Makefile.PL 1223 Writing Makefile for Encode::My 1224 1225=item 4.Z<> 1226 1227Now all you have to do is make. 1228 1229 $ make 1230 cp My.pm blib/lib/Encode/My.pm 1231 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \ 1232 -o encode_t.c -f encode_t.fnm 1233 Reading myascii (myascii) 1234 Writing compiled form 1235 128 bytes in string tables 1236 384 bytes (75%) saved spotting duplicates 1237 1 bytes (0.775%) saved using substrings 1238 .... 1239 chmod 644 blib/arch/auto/Encode/My/My.bs 1240 $ 1241 1242The time it takes varies depending on how fast your machine is and 1243how large your encoding is. Unless you are working on something big 1244like euc-tw, it won't take too long. 1245 1246=item 5.Z<> 1247 1248You can "make install" already but you should test first. 1249 1250 $ make test 1251 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \ 1252 -e 'use Test::Harness qw(&runtests $verbose); \ 1253 $verbose=0; runtests @ARGV;' t/*.t 1254 t/My....ok 1255 All tests successful. 1256 Files=1, Tests=2, 0 wallclock secs 1257 ( 0.09 cusr + 0.01 csys = 0.09 CPU) 1258 1259=item 6.Z<> 1260 1261If you are content with the test result, just "make install" 1262 1263=item 7.Z<> 1264 1265If you want to add your encoding to Encode's demand-loading list 1266(so you don't have to "use Encode::YourEncoding"), run 1267 1268 enc2xs -C 1269 1270to update Encode::ConfigLocal, a module that controls local settings. 1271After that, "use Encode;" is enough to load your encodings on demand. 1272 1273=back 1274 1275=head1 The Unicode Character Map 1276 1277Encode uses the Unicode Character Map (UCM) format for source character 1278mappings. This format is used by IBM's ICU package and was adopted 1279by Nick Ing-Simmons for use with the Encode module. Since UCM is 1280more flexible than Tcl's Encoding Map and far more user-friendly, 1281this is the recommended format for Encode now. 1282 1283A UCM file looks like this. 1284 1285 # 1286 # Comments 1287 # 1288 <code_set_name> "US-ascii" # Required 1289 <code_set_alias> "ascii" # Optional 1290 <mb_cur_min> 1 # Required; usually 1 1291 <mb_cur_max> 1 # Max. # of bytes/char 1292 <subchar> \x3F # Substitution char 1293 # 1294 CHARMAP 1295 <U0000> \x00 |0 # <control> 1296 <U0001> \x01 |0 # <control> 1297 <U0002> \x02 |0 # <control> 1298 .... 1299 <U007C> \x7C |0 # VERTICAL LINE 1300 <U007D> \x7D |0 # RIGHT CURLY BRACKET 1301 <U007E> \x7E |0 # TILDE 1302 <U007F> \x7F |0 # <control> 1303 END CHARMAP 1304 1305=over 4 1306 1307=item * 1308 1309Anything that follows C<#> is treated as a comment. 1310 1311=item * 1312 1313The header section continues until a line containing the word 1314CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one 1315pair per line. Strings used as values must be quoted. Barewords are 1316treated as numbers. I<\xXX> represents a byte. 1317 1318Most of the keywords are self-explanatory. I<subchar> means 1319substitution character, not subcharacter. When you decode a Unicode 1320sequence to this encoding but no matching character is found, the byte 1321sequence defined here will be used. For most cases, the value here is 1322\x3F; in ASCII, this is a question mark. 1323 1324=item * 1325 1326CHARMAP starts the character map section. Each line has a form as 1327follows: 1328 1329 <UXXXX> \xXX.. |0 # comment 1330 ^ ^ ^ 1331 | | +- Fallback flag 1332 | +-------- Encoded byte sequence 1333 +-------------- Unicode Character ID in hex 1334 1335The format is roughly the same as a header section except for the 1336fallback flag: | followed by 0..3. The meaning of the possible 1337values is as follows: 1338 1339=over 4 1340 1341=item |0 1342 1343Round trip safe. A character decoded to Unicode encodes back to the 1344same byte sequence. Most characters have this flag. 1345 1346=item |1 1347 1348Fallback for unicode -> encoding. When seen, enc2xs adds this 1349character for the encode map only. 1350 1351=item |2 1352 1353Skip sub-char mapping should there be no code point. 1354 1355=item |3 1356 1357Fallback for encoding -> unicode. When seen, enc2xs adds this 1358character for the decode map only. 1359 1360=back 1361 1362=item * 1363 1364And finally, END OF CHARMAP ends the section. 1365 1366=back 1367 1368When you are manually creating a UCM file, you should copy ascii.ucm 1369or an existing encoding which is close to yours, rather than write 1370your own from scratch. 1371 1372When you do so, make sure you leave at least B<U0000> to B<U0020> as 1373is, unless your environment is EBCDIC. 1374 1375B<CAVEAT>: not all features in UCM are implemented. For example, 1376icu:state is not used. Because of that, you need to write a perl 1377module if you want to support algorithmical encodings, notably 1378the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>, 1379L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>. 1380 1381=head2 Coping with duplicate mappings 1382 1383When you create a map, you SHOULD make your mappings round-trip safe. 1384That is, C<encode('your-encoding', decode('your-encoding', $data)) eq 1385$data> stands for all characters that are marked as C<|0>. Here is 1386how to make sure: 1387 1388=over 4 1389 1390=item * 1391 1392Sort your map in Unicode order. 1393 1394=item * 1395 1396When you have a duplicate entry, mark either one with '|1' or '|3'. 1397 1398=item * 1399 1400And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry. 1401 1402=back 1403 1404Here is an example from big5-eten. 1405 1406 <U2550> \xF9\xF9 |0 1407 <U2550> \xA2\xA4 |3 1408 1409Internally Encoding -> Unicode and Unicode -> Encoding Map looks like 1410this; 1411 1412 E to U U to E 1413 -------------------------------------- 1414 \xF9\xF9 => U2550 U2550 => \xF9\xF9 1415 \xA2\xA4 => U2550 1416 1417So it is round-trip safe for \xF9\xF9. But if the line above is upside 1418down, here is what happens. 1419 1420 E to U U to E 1421 -------------------------------------- 1422 \xA2\xA4 => U2550 U2550 => \xF9\xF9 1423 (\xF9\xF9 => U2550 is now overwritten!) 1424 1425The Encode package comes with F<ucmlint>, a crude but sufficient 1426utility to check the integrity of a UCM file. Check under the 1427Encode/bin directory for this. 1428 1429When in doubt, you can use F<ucmsort>, yet another utility under 1430Encode/bin directory. 1431 1432=head1 Bookmarks 1433 1434=over 4 1435 1436=item * 1437 1438ICU Home Page 1439L<http://www.icu-project.org/> 1440 1441=item * 1442 1443ICU Character Mapping Tables 1444L<http://site.icu-project.org/charts/charset> 1445 1446=item * 1447 1448ICU:Conversion Data 1449L<http://www.icu-project.org/userguide/conversion-data.html> 1450 1451=back 1452 1453=head1 SEE ALSO 1454 1455L<Encode>, 1456L<perlmod>, 1457L<perlpod> 1458 1459=cut 1460 1461# -Q to disable the duplicate codepoint test 1462# -S make mapping errors fatal 1463# -q to remove comments written to output files 1464# -O to enable the (brute force) substring optimiser 1465# -o <output> to specify the output file name (else it's the first arg) 1466# -f <inlist> to give a file with a list of input files (else use the args) 1467# -n <name> to name the encoding (else use the basename of the input file. 1468 1469With %seen holding array refs: 1470 1471 865.66 real 28.80 user 8.79 sys 1472 7904 maximum resident set size 1473 1356 average shared memory size 1474 18566 average unshared data size 1475 229 average unshared stack size 1476 46080 page reclaims 1477 33373 page faults 1478 1479With %seen holding simple scalars: 1480 1481 342.16 real 27.11 user 3.54 sys 1482 8388 maximum resident set size 1483 1394 average shared memory size 1484 14969 average unshared data size 1485 236 average unshared stack size 1486 28159 page reclaims 1487 9839 page faults 1488 1489Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is 1490how %seen is storing things its seen. So it is pathalogically bad on a 16M 1491RAM machine, but it's going to help even on modern machines. 1492Swapping is bad, m'kay :-) 1493