1*0Sstevel@tonic-gate#!./perl 2*0Sstevel@tonic-gateBEGIN { 3*0Sstevel@tonic-gate # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's 4*0Sstevel@tonic-gate # with $ENV{PERL_CORE} set 5*0Sstevel@tonic-gate # In case we need it in future... 6*0Sstevel@tonic-gate require Config; import Config; 7*0Sstevel@tonic-gate} 8*0Sstevel@tonic-gateuse strict; 9*0Sstevel@tonic-gateuse warnings; 10*0Sstevel@tonic-gateuse Getopt::Std; 11*0Sstevel@tonic-gatemy @orig_ARGV = @ARGV; 12*0Sstevel@tonic-gateour $VERSION = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; 13*0Sstevel@tonic-gate 14*0Sstevel@tonic-gate# These may get re-ordered. 15*0Sstevel@tonic-gate# RAW is a do_now as inserted by &enter 16*0Sstevel@tonic-gate# AGG is an aggreagated do_now, as built up by &process 17*0Sstevel@tonic-gate 18*0Sstevel@tonic-gateuse constant { 19*0Sstevel@tonic-gate RAW_NEXT => 0, 20*0Sstevel@tonic-gate RAW_IN_LEN => 1, 21*0Sstevel@tonic-gate RAW_OUT_BYTES => 2, 22*0Sstevel@tonic-gate RAW_FALLBACK => 3, 23*0Sstevel@tonic-gate 24*0Sstevel@tonic-gate AGG_MIN_IN => 0, 25*0Sstevel@tonic-gate AGG_MAX_IN => 1, 26*0Sstevel@tonic-gate AGG_OUT_BYTES => 2, 27*0Sstevel@tonic-gate AGG_NEXT => 3, 28*0Sstevel@tonic-gate AGG_IN_LEN => 4, 29*0Sstevel@tonic-gate AGG_OUT_LEN => 5, 30*0Sstevel@tonic-gate AGG_FALLBACK => 6, 31*0Sstevel@tonic-gate}; 32*0Sstevel@tonic-gate 33*0Sstevel@tonic-gate# (See the algorithm in encengine.c - we're building structures for it) 34*0Sstevel@tonic-gate 35*0Sstevel@tonic-gate# There are two sorts of structures. 36*0Sstevel@tonic-gate# "do_now" (an array, two variants of what needs storing) is whatever we need 37*0Sstevel@tonic-gate# to do now we've read an input byte. 38*0Sstevel@tonic-gate# It's housed in a "do_next" (which is how we got to it), and in turn points 39*0Sstevel@tonic-gate# to a "do_next" which contains all the "do_now"s for the next input byte. 40*0Sstevel@tonic-gate 41*0Sstevel@tonic-gate# There will be a "do_next" which is the start state. 42*0Sstevel@tonic-gate# For a single byte encoding it's the only "do_next" - each "do_now" points 43*0Sstevel@tonic-gate# back to it, and each "do_now" will cause bytes. There is no state. 44*0Sstevel@tonic-gate 45*0Sstevel@tonic-gate# For a multi-byte encoding where all characters in the input are the same 46*0Sstevel@tonic-gate# length, then there will be a tree of "do_now"->"do_next"->"do_now" 47*0Sstevel@tonic-gate# branching out from the start state, one step for each input byte. 48*0Sstevel@tonic-gate# The leaf "do_now"s will all be at the same distance from the start state, 49*0Sstevel@tonic-gate# only the leaf "do_now"s cause output bytes, and they in turn point back to 50*0Sstevel@tonic-gate# the start state. 51*0Sstevel@tonic-gate 52*0Sstevel@tonic-gate# For an encoding where there are varaible length input byte sequences, you 53*0Sstevel@tonic-gate# will encounter a leaf "do_now" sooner for the shorter input sequences, but 54*0Sstevel@tonic-gate# as before the leaves will point back to the start state. 55*0Sstevel@tonic-gate 56*0Sstevel@tonic-gate# The system will cope with escape encodings (imagine them as a mostly 57*0Sstevel@tonic-gate# self-contained tree for each escape state, and cross links between trees 58*0Sstevel@tonic-gate# at the state-switching characters) but so far no input format defines these. 59*0Sstevel@tonic-gate 60*0Sstevel@tonic-gate# The system will also cope with having output "leaves" in the middle of 61*0Sstevel@tonic-gate# the bifurcating branches, not just at the extremities, but again no 62*0Sstevel@tonic-gate# input format does this yet. 63*0Sstevel@tonic-gate 64*0Sstevel@tonic-gate# There are two variants of the "do_now" structure. The first, smaller variant 65*0Sstevel@tonic-gate# is generated by &enter as the input file is read. There is one structure 66*0Sstevel@tonic-gate# for each input byte. Say we are mapping a single byte encoding to a 67*0Sstevel@tonic-gate# single byte encoding, with "ABCD" going "abcd". There will be 68*0Sstevel@tonic-gate# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...} 69*0Sstevel@tonic-gate 70*0Sstevel@tonic-gate# &process then walks the tree, building aggregate "do_now" structres for 71*0Sstevel@tonic-gate# adjacent bytes where possible. The aggregate is for a contiguous range of 72*0Sstevel@tonic-gate# bytes which each produce the same length of output, each move to the 73*0Sstevel@tonic-gate# same next state, and each have the same fallback flag. 74*0Sstevel@tonic-gate# So our 4 RAW "do_now"s above become replaced by a single structure 75*0Sstevel@tonic-gate# containing: 76*0Sstevel@tonic-gate# ["A", "D", "abcd", 1, ...] 77*0Sstevel@tonic-gate# ie, for an input byte $_ in "A".."D", output 1 byte, found as 78*0Sstevel@tonic-gate# substr ("abcd", (ord $_ - ord "A") * 1, 1) 79*0Sstevel@tonic-gate# which maps very nicely into pointer arithmetic in C for encengine.c 80*0Sstevel@tonic-gate 81*0Sstevel@tonic-gatesub encode_U 82*0Sstevel@tonic-gate{ 83*0Sstevel@tonic-gate # UTF-8 encode long hand - only covers part of perl's range 84*0Sstevel@tonic-gate ## my $uv = shift; 85*0Sstevel@tonic-gate # chr() works in native space so convert value from table 86*0Sstevel@tonic-gate # into that space before using chr(). 87*0Sstevel@tonic-gate my $ch = chr(utf8::unicode_to_native($_[0])); 88*0Sstevel@tonic-gate # Now get core perl to encode that the way it likes. 89*0Sstevel@tonic-gate utf8::encode($ch); 90*0Sstevel@tonic-gate return $ch; 91*0Sstevel@tonic-gate} 92*0Sstevel@tonic-gate 93*0Sstevel@tonic-gatesub encode_S 94*0Sstevel@tonic-gate{ 95*0Sstevel@tonic-gate # encode single byte 96*0Sstevel@tonic-gate ## my ($ch,$page) = @_; return chr($ch); 97*0Sstevel@tonic-gate return chr $_[0]; 98*0Sstevel@tonic-gate} 99*0Sstevel@tonic-gate 100*0Sstevel@tonic-gatesub encode_D 101*0Sstevel@tonic-gate{ 102*0Sstevel@tonic-gate # encode double byte MS byte first 103*0Sstevel@tonic-gate ## my ($ch,$page) = @_; return chr($page).chr($ch); 104*0Sstevel@tonic-gate return chr ($_[1]) . chr $_[0]; 105*0Sstevel@tonic-gate} 106*0Sstevel@tonic-gate 107*0Sstevel@tonic-gatesub encode_M 108*0Sstevel@tonic-gate{ 109*0Sstevel@tonic-gate # encode Multi-byte - single for 0..255 otherwise double 110*0Sstevel@tonic-gate ## my ($ch,$page) = @_; 111*0Sstevel@tonic-gate ## return &encode_D if $page; 112*0Sstevel@tonic-gate ## return &encode_S; 113*0Sstevel@tonic-gate return chr ($_[1]) . chr $_[0] if $_[1]; 114*0Sstevel@tonic-gate return chr $_[0]; 115*0Sstevel@tonic-gate} 116*0Sstevel@tonic-gate 117*0Sstevel@tonic-gatemy %encode_types = (U => \&encode_U, 118*0Sstevel@tonic-gate S => \&encode_S, 119*0Sstevel@tonic-gate D => \&encode_D, 120*0Sstevel@tonic-gate M => \&encode_M, 121*0Sstevel@tonic-gate ); 122*0Sstevel@tonic-gate 123*0Sstevel@tonic-gate# Win32 does not expand globs on command line 124*0Sstevel@tonic-gateeval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32'); 125*0Sstevel@tonic-gate 126*0Sstevel@tonic-gatemy %opt; 127*0Sstevel@tonic-gate# I think these are: 128*0Sstevel@tonic-gate# -Q to disable the duplicate codepoint test 129*0Sstevel@tonic-gate# -S make mapping errors fatal 130*0Sstevel@tonic-gate# -q to remove comments written to output files 131*0Sstevel@tonic-gate# -O to enable the (brute force) substring optimiser 132*0Sstevel@tonic-gate# -o <output> to specify the output file name (else it's the first arg) 133*0Sstevel@tonic-gate# -f <inlist> to give a file with a list of input files (else use the args) 134*0Sstevel@tonic-gate# -n <name> to name the encoding (else use the basename of the input file. 135*0Sstevel@tonic-gategetopts('CM:SQqOo:f:n:',\%opt); 136*0Sstevel@tonic-gate 137*0Sstevel@tonic-gate$opt{M} and make_makefile_pl($opt{M}, @ARGV); 138*0Sstevel@tonic-gate$opt{C} and make_configlocal_pm($opt{C}, @ARGV); 139*0Sstevel@tonic-gate 140*0Sstevel@tonic-gate# This really should go first, else the die here causes empty (non-erroneous) 141*0Sstevel@tonic-gate# output files to be written. 142*0Sstevel@tonic-gatemy @encfiles; 143*0Sstevel@tonic-gateif (exists $opt{'f'}) { 144*0Sstevel@tonic-gate # -F is followed by name of file containing list of filenames 145*0Sstevel@tonic-gate my $flist = $opt{'f'}; 146*0Sstevel@tonic-gate open(FLIST,$flist) || die "Cannot open $flist:$!"; 147*0Sstevel@tonic-gate chomp(@encfiles = <FLIST>); 148*0Sstevel@tonic-gate close(FLIST); 149*0Sstevel@tonic-gate} else { 150*0Sstevel@tonic-gate @encfiles = @ARGV; 151*0Sstevel@tonic-gate} 152*0Sstevel@tonic-gate 153*0Sstevel@tonic-gatemy $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV); 154*0Sstevel@tonic-gatechmod(0666,$cname) if -f $cname && !-w $cname; 155*0Sstevel@tonic-gateopen(C,">$cname") || die "Cannot open $cname:$!"; 156*0Sstevel@tonic-gate 157*0Sstevel@tonic-gatemy $dname = $cname; 158*0Sstevel@tonic-gatemy $hname = $cname; 159*0Sstevel@tonic-gate 160*0Sstevel@tonic-gatemy ($doC,$doEnc,$doUcm,$doPet); 161*0Sstevel@tonic-gate 162*0Sstevel@tonic-gateif ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined 163*0Sstevel@tonic-gate { 164*0Sstevel@tonic-gate $doC = 1; 165*0Sstevel@tonic-gate $dname =~ s/(\.[^\.]*)?$/.exh/; 166*0Sstevel@tonic-gate chmod(0666,$dname) if -f $cname && !-w $dname; 167*0Sstevel@tonic-gate open(D,">$dname") || die "Cannot open $dname:$!"; 168*0Sstevel@tonic-gate $hname =~ s/(\.[^\.]*)?$/.h/; 169*0Sstevel@tonic-gate chmod(0666,$hname) if -f $cname && !-w $hname; 170*0Sstevel@tonic-gate open(H,">$hname") || die "Cannot open $hname:$!"; 171*0Sstevel@tonic-gate 172*0Sstevel@tonic-gate foreach my $fh (\*C,\*D,\*H) 173*0Sstevel@tonic-gate { 174*0Sstevel@tonic-gate print $fh <<"END" unless $opt{'q'}; 175*0Sstevel@tonic-gate/* 176*0Sstevel@tonic-gate !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 177*0Sstevel@tonic-gate This file was autogenerated by: 178*0Sstevel@tonic-gate $^X $0 @orig_ARGV 179*0Sstevel@tonic-gate*/ 180*0Sstevel@tonic-gateEND 181*0Sstevel@tonic-gate } 182*0Sstevel@tonic-gate 183*0Sstevel@tonic-gate if ($cname =~ /(\w+)\.xs$/) 184*0Sstevel@tonic-gate { 185*0Sstevel@tonic-gate print C "#include <EXTERN.h>\n"; 186*0Sstevel@tonic-gate print C "#include <perl.h>\n"; 187*0Sstevel@tonic-gate print C "#include <XSUB.h>\n"; 188*0Sstevel@tonic-gate print C "#define U8 U8\n"; 189*0Sstevel@tonic-gate } 190*0Sstevel@tonic-gate print C "#include \"encode.h\"\n\n"; 191*0Sstevel@tonic-gate 192*0Sstevel@tonic-gate } 193*0Sstevel@tonic-gateelsif ($cname =~ /\.enc$/) 194*0Sstevel@tonic-gate { 195*0Sstevel@tonic-gate $doEnc = 1; 196*0Sstevel@tonic-gate } 197*0Sstevel@tonic-gateelsif ($cname =~ /\.ucm$/) 198*0Sstevel@tonic-gate { 199*0Sstevel@tonic-gate $doUcm = 1; 200*0Sstevel@tonic-gate } 201*0Sstevel@tonic-gateelsif ($cname =~ /\.pet$/) 202*0Sstevel@tonic-gate { 203*0Sstevel@tonic-gate $doPet = 1; 204*0Sstevel@tonic-gate } 205*0Sstevel@tonic-gate 206*0Sstevel@tonic-gatemy %encoding; 207*0Sstevel@tonic-gatemy %strings; 208*0Sstevel@tonic-gatemy $string_acc; 209*0Sstevel@tonic-gatemy %strings_in_acc; 210*0Sstevel@tonic-gate 211*0Sstevel@tonic-gatemy $saved = 0; 212*0Sstevel@tonic-gatemy $subsave = 0; 213*0Sstevel@tonic-gatemy $strings = 0; 214*0Sstevel@tonic-gate 215*0Sstevel@tonic-gatesub cmp_name 216*0Sstevel@tonic-gate{ 217*0Sstevel@tonic-gate if ($a =~ /^.*-(\d+)/) 218*0Sstevel@tonic-gate { 219*0Sstevel@tonic-gate my $an = $1; 220*0Sstevel@tonic-gate if ($b =~ /^.*-(\d+)/) 221*0Sstevel@tonic-gate { 222*0Sstevel@tonic-gate my $r = $an <=> $1; 223*0Sstevel@tonic-gate return $r if $r; 224*0Sstevel@tonic-gate } 225*0Sstevel@tonic-gate } 226*0Sstevel@tonic-gate return $a cmp $b; 227*0Sstevel@tonic-gate} 228*0Sstevel@tonic-gate 229*0Sstevel@tonic-gate 230*0Sstevel@tonic-gateforeach my $enc (sort cmp_name @encfiles) 231*0Sstevel@tonic-gate { 232*0Sstevel@tonic-gate my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/; 233*0Sstevel@tonic-gate $name = $opt{'n'} if exists $opt{'n'}; 234*0Sstevel@tonic-gate if (open(E,$enc)) 235*0Sstevel@tonic-gate { 236*0Sstevel@tonic-gate if ($sfx eq 'enc') 237*0Sstevel@tonic-gate { 238*0Sstevel@tonic-gate compile_enc(\*E,lc($name)); 239*0Sstevel@tonic-gate } 240*0Sstevel@tonic-gate else 241*0Sstevel@tonic-gate { 242*0Sstevel@tonic-gate compile_ucm(\*E,lc($name)); 243*0Sstevel@tonic-gate } 244*0Sstevel@tonic-gate } 245*0Sstevel@tonic-gate else 246*0Sstevel@tonic-gate { 247*0Sstevel@tonic-gate warn "Cannot open $enc for $name:$!"; 248*0Sstevel@tonic-gate } 249*0Sstevel@tonic-gate } 250*0Sstevel@tonic-gate 251*0Sstevel@tonic-gateif ($doC) 252*0Sstevel@tonic-gate { 253*0Sstevel@tonic-gate print STDERR "Writing compiled form\n"; 254*0Sstevel@tonic-gate foreach my $name (sort cmp_name keys %encoding) 255*0Sstevel@tonic-gate { 256*0Sstevel@tonic-gate my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 257*0Sstevel@tonic-gate process($name.'_utf8',$e2u); 258*0Sstevel@tonic-gate addstrings(\*C,$e2u); 259*0Sstevel@tonic-gate 260*0Sstevel@tonic-gate process('utf8_'.$name,$u2e); 261*0Sstevel@tonic-gate addstrings(\*C,$u2e); 262*0Sstevel@tonic-gate } 263*0Sstevel@tonic-gate outbigstring(\*C,"enctable"); 264*0Sstevel@tonic-gate foreach my $name (sort cmp_name keys %encoding) 265*0Sstevel@tonic-gate { 266*0Sstevel@tonic-gate my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 267*0Sstevel@tonic-gate outtable(\*C,$e2u, "enctable"); 268*0Sstevel@tonic-gate outtable(\*C,$u2e, "enctable"); 269*0Sstevel@tonic-gate 270*0Sstevel@tonic-gate # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep)); 271*0Sstevel@tonic-gate } 272*0Sstevel@tonic-gate foreach my $enc (sort cmp_name keys %encoding) 273*0Sstevel@tonic-gate { 274*0Sstevel@tonic-gate # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}}; 275*0Sstevel@tonic-gate my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}}; 276*0Sstevel@tonic-gate #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el); 277*0Sstevel@tonic-gate my $replen = 0; 278*0Sstevel@tonic-gate $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g); 279*0Sstevel@tonic-gate my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el); 280*0Sstevel@tonic-gate my $sym = "${enc}_encoding"; 281*0Sstevel@tonic-gate $sym =~ s/\W+/_/g; 282*0Sstevel@tonic-gate print C "encode_t $sym = \n"; 283*0Sstevel@tonic-gate # This is to make null encoding work -- dankogai 284*0Sstevel@tonic-gate for (my $i = (scalar @info) - 1; $i >= 0; --$i){ 285*0Sstevel@tonic-gate $info[$i] ||= 1; 286*0Sstevel@tonic-gate } 287*0Sstevel@tonic-gate # end of null tweak -- dankogai 288*0Sstevel@tonic-gate print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n"; 289*0Sstevel@tonic-gate } 290*0Sstevel@tonic-gate 291*0Sstevel@tonic-gate foreach my $enc (sort cmp_name keys %encoding) 292*0Sstevel@tonic-gate { 293*0Sstevel@tonic-gate my $sym = "${enc}_encoding"; 294*0Sstevel@tonic-gate $sym =~ s/\W+/_/g; 295*0Sstevel@tonic-gate print H "extern encode_t $sym;\n"; 296*0Sstevel@tonic-gate print D " Encode_XSEncoding(aTHX_ &$sym);\n"; 297*0Sstevel@tonic-gate } 298*0Sstevel@tonic-gate 299*0Sstevel@tonic-gate if ($cname =~ /(\w+)\.xs$/) 300*0Sstevel@tonic-gate { 301*0Sstevel@tonic-gate my $mod = $1; 302*0Sstevel@tonic-gate print C <<'END'; 303*0Sstevel@tonic-gate 304*0Sstevel@tonic-gatestatic void 305*0Sstevel@tonic-gateEncode_XSEncoding(pTHX_ encode_t *enc) 306*0Sstevel@tonic-gate{ 307*0Sstevel@tonic-gate dSP; 308*0Sstevel@tonic-gate HV *stash = gv_stashpv("Encode::XS", TRUE); 309*0Sstevel@tonic-gate SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash); 310*0Sstevel@tonic-gate int i = 0; 311*0Sstevel@tonic-gate PUSHMARK(sp); 312*0Sstevel@tonic-gate XPUSHs(sv); 313*0Sstevel@tonic-gate while (enc->name[i]) 314*0Sstevel@tonic-gate { 315*0Sstevel@tonic-gate const char *name = enc->name[i++]; 316*0Sstevel@tonic-gate XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); 317*0Sstevel@tonic-gate } 318*0Sstevel@tonic-gate PUTBACK; 319*0Sstevel@tonic-gate call_pv("Encode::define_encoding",G_DISCARD); 320*0Sstevel@tonic-gate SvREFCNT_dec(sv); 321*0Sstevel@tonic-gate} 322*0Sstevel@tonic-gate 323*0Sstevel@tonic-gateEND 324*0Sstevel@tonic-gate 325*0Sstevel@tonic-gate print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n"; 326*0Sstevel@tonic-gate print C "BOOT:\n{\n"; 327*0Sstevel@tonic-gate print C "#include \"$dname\"\n"; 328*0Sstevel@tonic-gate print C "}\n"; 329*0Sstevel@tonic-gate } 330*0Sstevel@tonic-gate # Close in void context is bad, m'kay 331*0Sstevel@tonic-gate close(D) or warn "Error closing '$dname': $!"; 332*0Sstevel@tonic-gate close(H) or warn "Error closing '$hname': $!"; 333*0Sstevel@tonic-gate 334*0Sstevel@tonic-gate my $perc_saved = $strings/($strings + $saved) * 100; 335*0Sstevel@tonic-gate my $perc_subsaved = $strings/($strings + $subsave) * 100; 336*0Sstevel@tonic-gate printf STDERR "%d bytes in string tables\n",$strings; 337*0Sstevel@tonic-gate printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n", 338*0Sstevel@tonic-gate $saved, $perc_saved if $saved; 339*0Sstevel@tonic-gate printf STDERR "%d bytes (%.3g%%) saved using substrings\n", 340*0Sstevel@tonic-gate $subsave, $perc_subsaved if $subsave; 341*0Sstevel@tonic-gate } 342*0Sstevel@tonic-gateelsif ($doEnc) 343*0Sstevel@tonic-gate { 344*0Sstevel@tonic-gate foreach my $name (sort cmp_name keys %encoding) 345*0Sstevel@tonic-gate { 346*0Sstevel@tonic-gate my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 347*0Sstevel@tonic-gate output_enc(\*C,$name,$e2u); 348*0Sstevel@tonic-gate } 349*0Sstevel@tonic-gate } 350*0Sstevel@tonic-gateelsif ($doUcm) 351*0Sstevel@tonic-gate { 352*0Sstevel@tonic-gate foreach my $name (sort cmp_name keys %encoding) 353*0Sstevel@tonic-gate { 354*0Sstevel@tonic-gate my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 355*0Sstevel@tonic-gate output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el); 356*0Sstevel@tonic-gate } 357*0Sstevel@tonic-gate } 358*0Sstevel@tonic-gate 359*0Sstevel@tonic-gate# writing half meg files and then not checking to see if you just filled the 360*0Sstevel@tonic-gate# disk is bad, m'kay 361*0Sstevel@tonic-gateclose(C) or die "Error closing '$cname': $!"; 362*0Sstevel@tonic-gate 363*0Sstevel@tonic-gate# End of the main program. 364*0Sstevel@tonic-gate 365*0Sstevel@tonic-gatesub compile_ucm 366*0Sstevel@tonic-gate{ 367*0Sstevel@tonic-gate my ($fh,$name) = @_; 368*0Sstevel@tonic-gate my $e2u = {}; 369*0Sstevel@tonic-gate my $u2e = {}; 370*0Sstevel@tonic-gate my $cs; 371*0Sstevel@tonic-gate my %attr; 372*0Sstevel@tonic-gate while (<$fh>) 373*0Sstevel@tonic-gate { 374*0Sstevel@tonic-gate s/#.*$//; 375*0Sstevel@tonic-gate last if /^\s*CHARMAP\s*$/i; 376*0Sstevel@tonic-gate if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr 377*0Sstevel@tonic-gate { 378*0Sstevel@tonic-gate $attr{$1} = $2; 379*0Sstevel@tonic-gate } 380*0Sstevel@tonic-gate } 381*0Sstevel@tonic-gate if (!defined($cs = $attr{'code_set_name'})) 382*0Sstevel@tonic-gate { 383*0Sstevel@tonic-gate warn "No <code_set_name> in $name\n"; 384*0Sstevel@tonic-gate } 385*0Sstevel@tonic-gate else 386*0Sstevel@tonic-gate { 387*0Sstevel@tonic-gate $name = $cs unless exists $opt{'n'}; 388*0Sstevel@tonic-gate } 389*0Sstevel@tonic-gate my $erep; 390*0Sstevel@tonic-gate my $urep; 391*0Sstevel@tonic-gate my $max_el; 392*0Sstevel@tonic-gate my $min_el; 393*0Sstevel@tonic-gate if (exists $attr{'subchar'}) 394*0Sstevel@tonic-gate { 395*0Sstevel@tonic-gate #my @byte; 396*0Sstevel@tonic-gate #$attr{'subchar'} =~ /^\s*/cg; 397*0Sstevel@tonic-gate #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg; 398*0Sstevel@tonic-gate #$erep = join('',map(chr(hex($_)),@byte)); 399*0Sstevel@tonic-gate $erep = $attr{'subchar'}; 400*0Sstevel@tonic-gate $erep =~ s/^\s+//; $erep =~ s/\s+$//; 401*0Sstevel@tonic-gate } 402*0Sstevel@tonic-gate print "Reading $name ($cs)\n"; 403*0Sstevel@tonic-gate my $nfb = 0; 404*0Sstevel@tonic-gate my $hfb = 0; 405*0Sstevel@tonic-gate while (<$fh>) 406*0Sstevel@tonic-gate { 407*0Sstevel@tonic-gate s/#.*$//; 408*0Sstevel@tonic-gate last if /^\s*END\s+CHARMAP\s*$/i; 409*0Sstevel@tonic-gate next if /^\s*$/; 410*0Sstevel@tonic-gate my (@uni, @byte) = (); 411*0Sstevel@tonic-gate my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o 412*0Sstevel@tonic-gate or die "Bad line: $_"; 413*0Sstevel@tonic-gate while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){ 414*0Sstevel@tonic-gate push @uni, map { substr($_, 1) } split(/\+/, $1); 415*0Sstevel@tonic-gate } 416*0Sstevel@tonic-gate while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){ 417*0Sstevel@tonic-gate push @byte, $1; 418*0Sstevel@tonic-gate } 419*0Sstevel@tonic-gate if (@uni) 420*0Sstevel@tonic-gate { 421*0Sstevel@tonic-gate my $uch = join('', map { encode_U(hex($_)) } @uni ); 422*0Sstevel@tonic-gate my $ech = join('',map(chr(hex($_)),@byte)); 423*0Sstevel@tonic-gate my $el = length($ech); 424*0Sstevel@tonic-gate $max_el = $el if (!defined($max_el) || $el > $max_el); 425*0Sstevel@tonic-gate $min_el = $el if (!defined($min_el) || $el < $min_el); 426*0Sstevel@tonic-gate if (length($fb)) 427*0Sstevel@tonic-gate { 428*0Sstevel@tonic-gate $fb = substr($fb,1); 429*0Sstevel@tonic-gate $hfb++; 430*0Sstevel@tonic-gate } 431*0Sstevel@tonic-gate else 432*0Sstevel@tonic-gate { 433*0Sstevel@tonic-gate $nfb++; 434*0Sstevel@tonic-gate $fb = '0'; 435*0Sstevel@tonic-gate } 436*0Sstevel@tonic-gate # $fb is fallback flag 437*0Sstevel@tonic-gate # 0 - round trip safe 438*0Sstevel@tonic-gate # 1 - fallback for unicode -> enc 439*0Sstevel@tonic-gate # 2 - skip sub-char mapping 440*0Sstevel@tonic-gate # 3 - fallback enc -> unicode 441*0Sstevel@tonic-gate enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/); 442*0Sstevel@tonic-gate enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/); 443*0Sstevel@tonic-gate } 444*0Sstevel@tonic-gate else 445*0Sstevel@tonic-gate { 446*0Sstevel@tonic-gate warn $_; 447*0Sstevel@tonic-gate } 448*0Sstevel@tonic-gate } 449*0Sstevel@tonic-gate if ($nfb && $hfb) 450*0Sstevel@tonic-gate { 451*0Sstevel@tonic-gate die "$nfb entries without fallback, $hfb entries with\n"; 452*0Sstevel@tonic-gate } 453*0Sstevel@tonic-gate $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el]; 454*0Sstevel@tonic-gate} 455*0Sstevel@tonic-gate 456*0Sstevel@tonic-gate 457*0Sstevel@tonic-gate 458*0Sstevel@tonic-gatesub compile_enc 459*0Sstevel@tonic-gate{ 460*0Sstevel@tonic-gate my ($fh,$name) = @_; 461*0Sstevel@tonic-gate my $e2u = {}; 462*0Sstevel@tonic-gate my $u2e = {}; 463*0Sstevel@tonic-gate 464*0Sstevel@tonic-gate my $type; 465*0Sstevel@tonic-gate while ($type = <$fh>) 466*0Sstevel@tonic-gate { 467*0Sstevel@tonic-gate last if $type !~ /^\s*#/; 468*0Sstevel@tonic-gate } 469*0Sstevel@tonic-gate chomp($type); 470*0Sstevel@tonic-gate return if $type eq 'E'; 471*0Sstevel@tonic-gate # Do the hash lookup once, rather than once per function call. 4% speedup. 472*0Sstevel@tonic-gate my $type_func = $encode_types{$type}; 473*0Sstevel@tonic-gate my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>)); 474*0Sstevel@tonic-gate warn "$type encoded $name\n"; 475*0Sstevel@tonic-gate my $rep = ''; 476*0Sstevel@tonic-gate # Save a defined test by setting these to defined values. 477*0Sstevel@tonic-gate my $min_el = ~0; # A very big integer 478*0Sstevel@tonic-gate my $max_el = 0; # Anything must be longer than 0 479*0Sstevel@tonic-gate { 480*0Sstevel@tonic-gate my $v = hex($def); 481*0Sstevel@tonic-gate $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe); 482*0Sstevel@tonic-gate } 483*0Sstevel@tonic-gate my $errors; 484*0Sstevel@tonic-gate my $seen; 485*0Sstevel@tonic-gate # use -Q to silence the seen test. Makefile.PL uses this by default. 486*0Sstevel@tonic-gate $seen = {} unless $opt{Q}; 487*0Sstevel@tonic-gate do 488*0Sstevel@tonic-gate { 489*0Sstevel@tonic-gate my $line = <$fh>; 490*0Sstevel@tonic-gate chomp($line); 491*0Sstevel@tonic-gate my $page = hex($line); 492*0Sstevel@tonic-gate my $ch = 0; 493*0Sstevel@tonic-gate my $i = 16; 494*0Sstevel@tonic-gate do 495*0Sstevel@tonic-gate { 496*0Sstevel@tonic-gate # So why is it 1% faster to leave the my here? 497*0Sstevel@tonic-gate my $line = <$fh>; 498*0Sstevel@tonic-gate $line =~ s/\r\n$/\n/; 499*0Sstevel@tonic-gate die "$.:${line}Line should be exactly 65 characters long including 500*0Sstevel@tonic-gate newline (".length($line).")" unless length ($line) == 65; 501*0Sstevel@tonic-gate # Split line into groups of 4 hex digits, convert groups to ints 502*0Sstevel@tonic-gate # This takes 65.35 503*0Sstevel@tonic-gate # map {hex $_} $line =~ /(....)/g 504*0Sstevel@tonic-gate # This takes 63.75 (2.5% less time) 505*0Sstevel@tonic-gate # unpack "n*", pack "H*", $line 506*0Sstevel@tonic-gate # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay 507*0Sstevel@tonic-gate # Doing it as while ($line =~ /(....)/g) took 74.63 508*0Sstevel@tonic-gate foreach my $val (unpack "n*", pack "H*", $line) 509*0Sstevel@tonic-gate { 510*0Sstevel@tonic-gate next if $val == 0xFFFD; 511*0Sstevel@tonic-gate my $ech = &$type_func($ch,$page); 512*0Sstevel@tonic-gate if ($val || (!$ch && !$page)) 513*0Sstevel@tonic-gate { 514*0Sstevel@tonic-gate my $el = length($ech); 515*0Sstevel@tonic-gate $max_el = $el if $el > $max_el; 516*0Sstevel@tonic-gate $min_el = $el if $el < $min_el; 517*0Sstevel@tonic-gate my $uch = encode_U($val); 518*0Sstevel@tonic-gate if ($seen) { 519*0Sstevel@tonic-gate # We're doing the test. 520*0Sstevel@tonic-gate # We don't need to read this quickly, so storing it as a scalar, 521*0Sstevel@tonic-gate # rather than 3 (anon array, plus the 2 scalars it holds) saves 522*0Sstevel@tonic-gate # RAM and may make us faster on low RAM systems. [see __END__] 523*0Sstevel@tonic-gate if (exists $seen->{$uch}) 524*0Sstevel@tonic-gate { 525*0Sstevel@tonic-gate warn sprintf("U%04X is %02X%02X and %04X\n", 526*0Sstevel@tonic-gate $val,$page,$ch,$seen->{$uch}); 527*0Sstevel@tonic-gate $errors++; 528*0Sstevel@tonic-gate } 529*0Sstevel@tonic-gate else 530*0Sstevel@tonic-gate { 531*0Sstevel@tonic-gate $seen->{$uch} = $page << 8 | $ch; 532*0Sstevel@tonic-gate } 533*0Sstevel@tonic-gate } 534*0Sstevel@tonic-gate # Passing 2 extra args each time is 3.6% slower! 535*0Sstevel@tonic-gate # Even with having to add $fallback ||= 0 later 536*0Sstevel@tonic-gate enter_fb0($e2u,$ech,$uch); 537*0Sstevel@tonic-gate enter_fb0($u2e,$uch,$ech); 538*0Sstevel@tonic-gate } 539*0Sstevel@tonic-gate else 540*0Sstevel@tonic-gate { 541*0Sstevel@tonic-gate # No character at this position 542*0Sstevel@tonic-gate # enter($e2u,$ech,undef,$e2u); 543*0Sstevel@tonic-gate } 544*0Sstevel@tonic-gate $ch++; 545*0Sstevel@tonic-gate } 546*0Sstevel@tonic-gate } while --$i; 547*0Sstevel@tonic-gate } while --$pages; 548*0Sstevel@tonic-gate die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines" 549*0Sstevel@tonic-gate if $min_el > $max_el; 550*0Sstevel@tonic-gate die "$errors mapping conflicts\n" if ($errors && $opt{'S'}); 551*0Sstevel@tonic-gate $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el]; 552*0Sstevel@tonic-gate} 553*0Sstevel@tonic-gate 554*0Sstevel@tonic-gate# my ($a,$s,$d,$t,$fb) = @_; 555*0Sstevel@tonic-gatesub enter { 556*0Sstevel@tonic-gate my ($current,$inbytes,$outbytes,$next,$fallback) = @_; 557*0Sstevel@tonic-gate # state we shift to after this (multibyte) input character defaults to same 558*0Sstevel@tonic-gate # as current state. 559*0Sstevel@tonic-gate $next ||= $current; 560*0Sstevel@tonic-gate # Making sure it is defined seems to be faster than {no warnings;} in 561*0Sstevel@tonic-gate # &process, or passing it in as 0 explicity. 562*0Sstevel@tonic-gate # XXX $fallback ||= 0; 563*0Sstevel@tonic-gate 564*0Sstevel@tonic-gate # Start at the beginning and work forwards through the string to zero. 565*0Sstevel@tonic-gate # effectively we are removing 1 character from the front each time 566*0Sstevel@tonic-gate # but we don't actually edit the string. [this alone seems to be 14% speedup] 567*0Sstevel@tonic-gate # Hence -$pos is the length of the remaining string. 568*0Sstevel@tonic-gate my $pos = -length $inbytes; 569*0Sstevel@tonic-gate while (1) { 570*0Sstevel@tonic-gate my $byte = substr $inbytes, $pos, 1; 571*0Sstevel@tonic-gate # RAW_NEXT => 0, 572*0Sstevel@tonic-gate # RAW_IN_LEN => 1, 573*0Sstevel@tonic-gate # RAW_OUT_BYTES => 2, 574*0Sstevel@tonic-gate # RAW_FALLBACK => 3, 575*0Sstevel@tonic-gate # to unicode an array would seem to be better, because the pages are dense. 576*0Sstevel@tonic-gate # from unicode can be very sparse, favouring a hash. 577*0Sstevel@tonic-gate # hash using the bytes (all length 1) as keys rather than ord value, 578*0Sstevel@tonic-gate # as it's easier to sort these in &process. 579*0Sstevel@tonic-gate 580*0Sstevel@tonic-gate # It's faster to always add $fallback even if it's undef, rather than 581*0Sstevel@tonic-gate # choosing between 3 and 4 element array. (hence why we set it defined 582*0Sstevel@tonic-gate # above) 583*0Sstevel@tonic-gate my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback]; 584*0Sstevel@tonic-gate # When $pos was -1 we were at the last input character. 585*0Sstevel@tonic-gate unless (++$pos) { 586*0Sstevel@tonic-gate $do_now->[RAW_OUT_BYTES] = $outbytes; 587*0Sstevel@tonic-gate $do_now->[RAW_NEXT] = $next; 588*0Sstevel@tonic-gate return; 589*0Sstevel@tonic-gate } 590*0Sstevel@tonic-gate # Tail recursion. The intermdiate state may not have a name yet. 591*0Sstevel@tonic-gate $current = $do_now->[RAW_NEXT]; 592*0Sstevel@tonic-gate } 593*0Sstevel@tonic-gate} 594*0Sstevel@tonic-gate 595*0Sstevel@tonic-gate# This is purely for optimistation. It's just &enter hard coded for $fallback 596*0Sstevel@tonic-gate# of 0, using only a 3 entry array ref to save memory for every entry. 597*0Sstevel@tonic-gatesub enter_fb0 { 598*0Sstevel@tonic-gate my ($current,$inbytes,$outbytes,$next) = @_; 599*0Sstevel@tonic-gate $next ||= $current; 600*0Sstevel@tonic-gate 601*0Sstevel@tonic-gate my $pos = -length $inbytes; 602*0Sstevel@tonic-gate while (1) { 603*0Sstevel@tonic-gate my $byte = substr $inbytes, $pos, 1; 604*0Sstevel@tonic-gate my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'']; 605*0Sstevel@tonic-gate unless (++$pos) { 606*0Sstevel@tonic-gate $do_now->[RAW_OUT_BYTES] = $outbytes; 607*0Sstevel@tonic-gate $do_now->[RAW_NEXT] = $next; 608*0Sstevel@tonic-gate return; 609*0Sstevel@tonic-gate } 610*0Sstevel@tonic-gate $current = $do_now->[RAW_NEXT]; 611*0Sstevel@tonic-gate } 612*0Sstevel@tonic-gate} 613*0Sstevel@tonic-gate 614*0Sstevel@tonic-gatesub process 615*0Sstevel@tonic-gate{ 616*0Sstevel@tonic-gate my ($name,$a) = @_; 617*0Sstevel@tonic-gate $name =~ s/\W+/_/g; 618*0Sstevel@tonic-gate $a->{Cname} = $name; 619*0Sstevel@tonic-gate my $raw = $a->{Raw}; 620*0Sstevel@tonic-gate my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback); 621*0Sstevel@tonic-gate my @ent; 622*0Sstevel@tonic-gate $agg_max_in = 0; 623*0Sstevel@tonic-gate foreach my $key (sort keys %$raw) { 624*0Sstevel@tonic-gate # RAW_NEXT => 0, 625*0Sstevel@tonic-gate # RAW_IN_LEN => 1, 626*0Sstevel@tonic-gate # RAW_OUT_BYTES => 2, 627*0Sstevel@tonic-gate # RAW_FALLBACK => 3, 628*0Sstevel@tonic-gate my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}}; 629*0Sstevel@tonic-gate # Now we are converting from raw to aggregate, switch from 1 byte strings 630*0Sstevel@tonic-gate # to numbers 631*0Sstevel@tonic-gate my $b = ord $key; 632*0Sstevel@tonic-gate $fallback ||= 0; 633*0Sstevel@tonic-gate if ($l && 634*0Sstevel@tonic-gate # If this == fails, we're going to reset $agg_max_in below anyway. 635*0Sstevel@tonic-gate $b == ++$agg_max_in && 636*0Sstevel@tonic-gate # References in numeric context give the pointer as an int. 637*0Sstevel@tonic-gate $agg_next == $next && 638*0Sstevel@tonic-gate $agg_in_len == $in_len && 639*0Sstevel@tonic-gate $agg_out_len == length $out_bytes && 640*0Sstevel@tonic-gate $agg_fallback == $fallback 641*0Sstevel@tonic-gate # && length($l->[AGG_OUT_BYTES]) < 16 642*0Sstevel@tonic-gate ) { 643*0Sstevel@tonic-gate # my $i = ord($b)-ord($l->[AGG_MIN_IN]); 644*0Sstevel@tonic-gate # we can aggregate this byte onto the end. 645*0Sstevel@tonic-gate $l->[AGG_MAX_IN] = $b; 646*0Sstevel@tonic-gate $l->[AGG_OUT_BYTES] .= $out_bytes; 647*0Sstevel@tonic-gate } else { 648*0Sstevel@tonic-gate # AGG_MIN_IN => 0, 649*0Sstevel@tonic-gate # AGG_MAX_IN => 1, 650*0Sstevel@tonic-gate # AGG_OUT_BYTES => 2, 651*0Sstevel@tonic-gate # AGG_NEXT => 3, 652*0Sstevel@tonic-gate # AGG_IN_LEN => 4, 653*0Sstevel@tonic-gate # AGG_OUT_LEN => 5, 654*0Sstevel@tonic-gate # AGG_FALLBACK => 6, 655*0Sstevel@tonic-gate # Reset the last thing we saw, plus set 5 lexicals to save some derefs. 656*0Sstevel@tonic-gate # (only gains .6% on euc-jp -- is it worth it?) 657*0Sstevel@tonic-gate push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next, 658*0Sstevel@tonic-gate $agg_in_len = $in_len, $agg_out_len = length $out_bytes, 659*0Sstevel@tonic-gate $agg_fallback = $fallback]; 660*0Sstevel@tonic-gate } 661*0Sstevel@tonic-gate if (exists $next->{Cname}) { 662*0Sstevel@tonic-gate $next->{'Forward'} = 1 if $next != $a; 663*0Sstevel@tonic-gate } else { 664*0Sstevel@tonic-gate process(sprintf("%s_%02x",$name,$b),$next); 665*0Sstevel@tonic-gate } 666*0Sstevel@tonic-gate } 667*0Sstevel@tonic-gate # encengine.c rules say that last entry must be for 255 668*0Sstevel@tonic-gate if ($agg_max_in < 255) { 669*0Sstevel@tonic-gate push @ent, [1+$agg_max_in, 255,undef,$a,0,0]; 670*0Sstevel@tonic-gate } 671*0Sstevel@tonic-gate $a->{'Entries'} = \@ent; 672*0Sstevel@tonic-gate} 673*0Sstevel@tonic-gate 674*0Sstevel@tonic-gate 675*0Sstevel@tonic-gatesub addstrings 676*0Sstevel@tonic-gate{ 677*0Sstevel@tonic-gate my ($fh,$a) = @_; 678*0Sstevel@tonic-gate my $name = $a->{'Cname'}; 679*0Sstevel@tonic-gate # String tables 680*0Sstevel@tonic-gate foreach my $b (@{$a->{'Entries'}}) 681*0Sstevel@tonic-gate { 682*0Sstevel@tonic-gate next unless $b->[AGG_OUT_LEN]; 683*0Sstevel@tonic-gate $strings{$b->[AGG_OUT_BYTES]} = undef; 684*0Sstevel@tonic-gate } 685*0Sstevel@tonic-gate if ($a->{'Forward'}) 686*0Sstevel@tonic-gate { 687*0Sstevel@tonic-gate my $var = $^O eq 'MacOS' ? 'extern' : 'static'; 688*0Sstevel@tonic-gate print $fh "$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n"; 689*0Sstevel@tonic-gate } 690*0Sstevel@tonic-gate $a->{'DoneStrings'} = 1; 691*0Sstevel@tonic-gate foreach my $b (@{$a->{'Entries'}}) 692*0Sstevel@tonic-gate { 693*0Sstevel@tonic-gate my ($s,$e,$out,$t,$end,$l) = @$b; 694*0Sstevel@tonic-gate addstrings($fh,$t) unless $t->{'DoneStrings'}; 695*0Sstevel@tonic-gate } 696*0Sstevel@tonic-gate} 697*0Sstevel@tonic-gate 698*0Sstevel@tonic-gatesub outbigstring 699*0Sstevel@tonic-gate{ 700*0Sstevel@tonic-gate my ($fh,$name) = @_; 701*0Sstevel@tonic-gate 702*0Sstevel@tonic-gate $string_acc = ''; 703*0Sstevel@tonic-gate 704*0Sstevel@tonic-gate # Make the big string in the string accumulator. Longest first, on the hope 705*0Sstevel@tonic-gate # that this makes it more likely that we find the short strings later on. 706*0Sstevel@tonic-gate # Not sure if it helps sorting strings of the same length lexcically. 707*0Sstevel@tonic-gate foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) { 708*0Sstevel@tonic-gate my $index = index $string_acc, $s; 709*0Sstevel@tonic-gate if ($index >= 0) { 710*0Sstevel@tonic-gate $saved += length($s); 711*0Sstevel@tonic-gate $strings_in_acc{$s} = $index; 712*0Sstevel@tonic-gate } else { 713*0Sstevel@tonic-gate OPTIMISER: { 714*0Sstevel@tonic-gate if ($opt{'O'}) { 715*0Sstevel@tonic-gate my $sublength = length $s; 716*0Sstevel@tonic-gate while (--$sublength > 0) { 717*0Sstevel@tonic-gate # progressively lop characters off the end, to see if the start of 718*0Sstevel@tonic-gate # the new string overlaps the end of the accumulator. 719*0Sstevel@tonic-gate if (substr ($string_acc, -$sublength) 720*0Sstevel@tonic-gate eq substr ($s, 0, $sublength)) { 721*0Sstevel@tonic-gate $subsave += $sublength; 722*0Sstevel@tonic-gate $strings_in_acc{$s} = length ($string_acc) - $sublength; 723*0Sstevel@tonic-gate # append the last bit on the end. 724*0Sstevel@tonic-gate $string_acc .= substr ($s, $sublength); 725*0Sstevel@tonic-gate last OPTIMISER; 726*0Sstevel@tonic-gate } 727*0Sstevel@tonic-gate # or if the end of the new string overlaps the start of the 728*0Sstevel@tonic-gate # accumulator 729*0Sstevel@tonic-gate next unless substr ($string_acc, 0, $sublength) 730*0Sstevel@tonic-gate eq substr ($s, -$sublength); 731*0Sstevel@tonic-gate # well, the last $sublength characters of the accumulator match. 732*0Sstevel@tonic-gate # so as we're prepending to the accumulator, need to shift all our 733*0Sstevel@tonic-gate # existing offsets forwards 734*0Sstevel@tonic-gate $_ += $sublength foreach values %strings_in_acc; 735*0Sstevel@tonic-gate $subsave += $sublength; 736*0Sstevel@tonic-gate $strings_in_acc{$s} = 0; 737*0Sstevel@tonic-gate # append the first bit on the start. 738*0Sstevel@tonic-gate $string_acc = substr ($s, 0, -$sublength) . $string_acc; 739*0Sstevel@tonic-gate last OPTIMISER; 740*0Sstevel@tonic-gate } 741*0Sstevel@tonic-gate } 742*0Sstevel@tonic-gate # Optimiser (if it ran) found nothing, so just going have to tack the 743*0Sstevel@tonic-gate # whole thing on the end. 744*0Sstevel@tonic-gate $strings_in_acc{$s} = length $string_acc; 745*0Sstevel@tonic-gate $string_acc .= $s; 746*0Sstevel@tonic-gate }; 747*0Sstevel@tonic-gate } 748*0Sstevel@tonic-gate } 749*0Sstevel@tonic-gate 750*0Sstevel@tonic-gate $strings = length $string_acc; 751*0Sstevel@tonic-gate my $definition = "\nstatic const U8 $name\[$strings] = { " . 752*0Sstevel@tonic-gate join(',',unpack "C*",$string_acc); 753*0Sstevel@tonic-gate # We have a single long line. Split it at convenient commas. 754*0Sstevel@tonic-gate print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs; 755*0Sstevel@tonic-gate print $fh substr ($definition, pos $definition), " };\n"; 756*0Sstevel@tonic-gate} 757*0Sstevel@tonic-gate 758*0Sstevel@tonic-gatesub findstring { 759*0Sstevel@tonic-gate my ($name,$s) = @_; 760*0Sstevel@tonic-gate my $offset = $strings_in_acc{$s}; 761*0Sstevel@tonic-gate die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator" 762*0Sstevel@tonic-gate unless defined $offset; 763*0Sstevel@tonic-gate "$name + $offset"; 764*0Sstevel@tonic-gate} 765*0Sstevel@tonic-gate 766*0Sstevel@tonic-gatesub outtable 767*0Sstevel@tonic-gate{ 768*0Sstevel@tonic-gate my ($fh,$a,$bigname) = @_; 769*0Sstevel@tonic-gate my $name = $a->{'Cname'}; 770*0Sstevel@tonic-gate $a->{'Done'} = 1; 771*0Sstevel@tonic-gate foreach my $b (@{$a->{'Entries'}}) 772*0Sstevel@tonic-gate { 773*0Sstevel@tonic-gate my ($s,$e,$out,$t,$end,$l) = @$b; 774*0Sstevel@tonic-gate outtable($fh,$t,$bigname) unless $t->{'Done'}; 775*0Sstevel@tonic-gate } 776*0Sstevel@tonic-gate print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n"; 777*0Sstevel@tonic-gate foreach my $b (@{$a->{'Entries'}}) 778*0Sstevel@tonic-gate { 779*0Sstevel@tonic-gate my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b; 780*0Sstevel@tonic-gate # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan 781*0Sstevel@tonic-gate print $fh "{"; 782*0Sstevel@tonic-gate if ($l) 783*0Sstevel@tonic-gate { 784*0Sstevel@tonic-gate printf $fh findstring($bigname,$out); 785*0Sstevel@tonic-gate } 786*0Sstevel@tonic-gate else 787*0Sstevel@tonic-gate { 788*0Sstevel@tonic-gate print $fh "0"; 789*0Sstevel@tonic-gate } 790*0Sstevel@tonic-gate print $fh ",",$t->{Cname}; 791*0Sstevel@tonic-gate printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec; 792*0Sstevel@tonic-gate } 793*0Sstevel@tonic-gate print $fh "};\n"; 794*0Sstevel@tonic-gate} 795*0Sstevel@tonic-gate 796*0Sstevel@tonic-gatesub output_enc 797*0Sstevel@tonic-gate{ 798*0Sstevel@tonic-gate my ($fh,$name,$a) = @_; 799*0Sstevel@tonic-gate die "Changed - fix me for new structure"; 800*0Sstevel@tonic-gate foreach my $b (sort keys %$a) 801*0Sstevel@tonic-gate { 802*0Sstevel@tonic-gate my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}}; 803*0Sstevel@tonic-gate } 804*0Sstevel@tonic-gate} 805*0Sstevel@tonic-gate 806*0Sstevel@tonic-gatesub decode_U 807*0Sstevel@tonic-gate{ 808*0Sstevel@tonic-gate my $s = shift; 809*0Sstevel@tonic-gate} 810*0Sstevel@tonic-gate 811*0Sstevel@tonic-gatemy @uname; 812*0Sstevel@tonic-gatesub char_names 813*0Sstevel@tonic-gate{ 814*0Sstevel@tonic-gate my $s = do "unicore/Name.pl"; 815*0Sstevel@tonic-gate die "char_names: unicore/Name.pl: $!\n" unless defined $s; 816*0Sstevel@tonic-gate pos($s) = 0; 817*0Sstevel@tonic-gate while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc) 818*0Sstevel@tonic-gate { 819*0Sstevel@tonic-gate my $name = $3; 820*0Sstevel@tonic-gate my $s = hex($1); 821*0Sstevel@tonic-gate last if $s >= 0x10000; 822*0Sstevel@tonic-gate my $e = length($2) ? hex($2) : $s; 823*0Sstevel@tonic-gate for (my $i = $s; $i <= $e; $i++) 824*0Sstevel@tonic-gate { 825*0Sstevel@tonic-gate $uname[$i] = $name; 826*0Sstevel@tonic-gate# print sprintf("U%04X $name\n",$i); 827*0Sstevel@tonic-gate } 828*0Sstevel@tonic-gate } 829*0Sstevel@tonic-gate} 830*0Sstevel@tonic-gate 831*0Sstevel@tonic-gatesub output_ucm_page 832*0Sstevel@tonic-gate{ 833*0Sstevel@tonic-gate my ($cmap,$a,$t,$pre) = @_; 834*0Sstevel@tonic-gate # warn sprintf("Page %x\n",$pre); 835*0Sstevel@tonic-gate my $raw = $t->{Raw}; 836*0Sstevel@tonic-gate foreach my $key (sort keys %$raw) { 837*0Sstevel@tonic-gate # RAW_NEXT => 0, 838*0Sstevel@tonic-gate # RAW_IN_LEN => 1, 839*0Sstevel@tonic-gate # RAW_OUT_BYTES => 2, 840*0Sstevel@tonic-gate # RAW_FALLBACK => 3, 841*0Sstevel@tonic-gate my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}}; 842*0Sstevel@tonic-gate my $u = ord $key; 843*0Sstevel@tonic-gate $fallback ||= 0; 844*0Sstevel@tonic-gate 845*0Sstevel@tonic-gate if ($next != $a && $next != $t) { 846*0Sstevel@tonic-gate output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF); 847*0Sstevel@tonic-gate } elsif (length $out_bytes) { 848*0Sstevel@tonic-gate if ($pre) { 849*0Sstevel@tonic-gate $u = $pre|($u &0x3f); 850*0Sstevel@tonic-gate } 851*0Sstevel@tonic-gate my $s = sprintf "<U%04X> ",$u; 852*0Sstevel@tonic-gate #foreach my $c (split(//,$out_bytes)) { 853*0Sstevel@tonic-gate # $s .= sprintf "\\x%02X",ord($c); 854*0Sstevel@tonic-gate #} 855*0Sstevel@tonic-gate # 9.5% faster changing that loop to this: 856*0Sstevel@tonic-gate $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes; 857*0Sstevel@tonic-gate $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u]; 858*0Sstevel@tonic-gate push(@$cmap,$s); 859*0Sstevel@tonic-gate } else { 860*0Sstevel@tonic-gate warn join(',',$u, @{$raw->{$key}},$a,$t); 861*0Sstevel@tonic-gate } 862*0Sstevel@tonic-gate } 863*0Sstevel@tonic-gate} 864*0Sstevel@tonic-gate 865*0Sstevel@tonic-gatesub output_ucm 866*0Sstevel@tonic-gate{ 867*0Sstevel@tonic-gate my ($fh,$name,$h,$rep,$min_el,$max_el) = @_; 868*0Sstevel@tonic-gate print $fh "# $0 @orig_ARGV\n" unless $opt{'q'}; 869*0Sstevel@tonic-gate print $fh "<code_set_name> \"$name\"\n"; 870*0Sstevel@tonic-gate char_names(); 871*0Sstevel@tonic-gate if (defined $min_el) 872*0Sstevel@tonic-gate { 873*0Sstevel@tonic-gate print $fh "<mb_cur_min> $min_el\n"; 874*0Sstevel@tonic-gate } 875*0Sstevel@tonic-gate if (defined $max_el) 876*0Sstevel@tonic-gate { 877*0Sstevel@tonic-gate print $fh "<mb_cur_max> $max_el\n"; 878*0Sstevel@tonic-gate } 879*0Sstevel@tonic-gate if (defined $rep) 880*0Sstevel@tonic-gate { 881*0Sstevel@tonic-gate print $fh "<subchar> "; 882*0Sstevel@tonic-gate foreach my $c (split(//,$rep)) 883*0Sstevel@tonic-gate { 884*0Sstevel@tonic-gate printf $fh "\\x%02X",ord($c); 885*0Sstevel@tonic-gate } 886*0Sstevel@tonic-gate print $fh "\n"; 887*0Sstevel@tonic-gate } 888*0Sstevel@tonic-gate my @cmap; 889*0Sstevel@tonic-gate output_ucm_page(\@cmap,$h,$h,0); 890*0Sstevel@tonic-gate print $fh "#\nCHARMAP\n"; 891*0Sstevel@tonic-gate foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap) 892*0Sstevel@tonic-gate { 893*0Sstevel@tonic-gate print $fh $line; 894*0Sstevel@tonic-gate } 895*0Sstevel@tonic-gate print $fh "END CHARMAP\n"; 896*0Sstevel@tonic-gate} 897*0Sstevel@tonic-gate 898*0Sstevel@tonic-gateuse vars qw( 899*0Sstevel@tonic-gate $_Enc2xs 900*0Sstevel@tonic-gate $_Version 901*0Sstevel@tonic-gate $_Inc 902*0Sstevel@tonic-gate $_E2X 903*0Sstevel@tonic-gate $_Name 904*0Sstevel@tonic-gate $_TableFiles 905*0Sstevel@tonic-gate $_Now 906*0Sstevel@tonic-gate); 907*0Sstevel@tonic-gate 908*0Sstevel@tonic-gatesub find_e2x{ 909*0Sstevel@tonic-gate eval { require File::Find; }; 910*0Sstevel@tonic-gate my (@inc, %e2x_dir); 911*0Sstevel@tonic-gate for my $inc (@INC){ 912*0Sstevel@tonic-gate push @inc, $inc unless $inc eq '.'; #skip current dir 913*0Sstevel@tonic-gate } 914*0Sstevel@tonic-gate File::Find::find( 915*0Sstevel@tonic-gate sub { 916*0Sstevel@tonic-gate my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, 917*0Sstevel@tonic-gate $atime,$mtime,$ctime,$blksize,$blocks) 918*0Sstevel@tonic-gate = lstat($_) or return; 919*0Sstevel@tonic-gate -f _ or return; 920*0Sstevel@tonic-gate if (/^.*\.e2x$/o){ 921*0Sstevel@tonic-gate no warnings 'once'; 922*0Sstevel@tonic-gate $e2x_dir{$File::Find::dir} ||= $mtime; 923*0Sstevel@tonic-gate } 924*0Sstevel@tonic-gate return; 925*0Sstevel@tonic-gate }, @inc); 926*0Sstevel@tonic-gate warn join("\n", keys %e2x_dir), "\n"; 927*0Sstevel@tonic-gate for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){ 928*0Sstevel@tonic-gate $_E2X = $d; 929*0Sstevel@tonic-gate # warn "$_E2X => ", scalar localtime($e2x_dir{$d}); 930*0Sstevel@tonic-gate return $_E2X; 931*0Sstevel@tonic-gate } 932*0Sstevel@tonic-gate} 933*0Sstevel@tonic-gate 934*0Sstevel@tonic-gatesub make_makefile_pl 935*0Sstevel@tonic-gate{ 936*0Sstevel@tonic-gate eval { require Encode; }; 937*0Sstevel@tonic-gate $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n"; 938*0Sstevel@tonic-gate # our used for variable expanstion 939*0Sstevel@tonic-gate $_Enc2xs = $0; 940*0Sstevel@tonic-gate $_Version = $VERSION; 941*0Sstevel@tonic-gate $_E2X = find_e2x(); 942*0Sstevel@tonic-gate $_Name = shift; 943*0Sstevel@tonic-gate $_TableFiles = join(",", map {qq('$_')} @_); 944*0Sstevel@tonic-gate $_Now = scalar localtime(); 945*0Sstevel@tonic-gate 946*0Sstevel@tonic-gate eval { require File::Spec; }; 947*0Sstevel@tonic-gate _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL"); 948*0Sstevel@tonic-gate _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm"); 949*0Sstevel@tonic-gate _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t"); 950*0Sstevel@tonic-gate _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README"); 951*0Sstevel@tonic-gate _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes"); 952*0Sstevel@tonic-gate exit; 953*0Sstevel@tonic-gate} 954*0Sstevel@tonic-gate 955*0Sstevel@tonic-gateuse vars qw( 956*0Sstevel@tonic-gate $_ModLines 957*0Sstevel@tonic-gate $_LocalVer 958*0Sstevel@tonic-gate ); 959*0Sstevel@tonic-gate 960*0Sstevel@tonic-gatesub make_configlocal_pm 961*0Sstevel@tonic-gate{ 962*0Sstevel@tonic-gate eval { require Encode; }; 963*0Sstevel@tonic-gate $@ and die "Unable to require Encode: $@\n"; 964*0Sstevel@tonic-gate eval { require File::Spec; }; 965*0Sstevel@tonic-gate # our used for variable expanstion 966*0Sstevel@tonic-gate my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8'); 967*0Sstevel@tonic-gate my %LocalMod = (); 968*0Sstevel@tonic-gate for my $d (@INC){ 969*0Sstevel@tonic-gate my $inc = File::Spec->catfile($d, "Encode"); 970*0Sstevel@tonic-gate -d $inc or next; 971*0Sstevel@tonic-gate opendir my $dh, $inc or die "$inc:$!"; 972*0Sstevel@tonic-gate warn "Checking $inc...\n"; 973*0Sstevel@tonic-gate for my $f (grep /\.pm$/o, readdir($dh)){ 974*0Sstevel@tonic-gate -f File::Spec->catfile($inc, "$f") or next; 975*0Sstevel@tonic-gate $INC{"Encode/$f"} and next; 976*0Sstevel@tonic-gate warn "require Encode/$f;\n"; 977*0Sstevel@tonic-gate eval { require "Encode/$f"; }; 978*0Sstevel@tonic-gate $@ and die "Can't require Encode/$f: $@\n"; 979*0Sstevel@tonic-gate for my $enc (Encode->encodings()){ 980*0Sstevel@tonic-gate no warnings 'once'; 981*0Sstevel@tonic-gate $in_core{$enc} and next; 982*0Sstevel@tonic-gate $Encode::Config::ExtModule{$enc} and next; 983*0Sstevel@tonic-gate my $mod = "Encode/$f"; 984*0Sstevel@tonic-gate $mod =~ s/\.pm$//o; $mod =~ s,/,::,og; 985*0Sstevel@tonic-gate $LocalMod{$enc} ||= $mod; 986*0Sstevel@tonic-gate } 987*0Sstevel@tonic-gate } 988*0Sstevel@tonic-gate } 989*0Sstevel@tonic-gate $_ModLines = ""; 990*0Sstevel@tonic-gate for my $enc (sort keys %LocalMod){ 991*0Sstevel@tonic-gate $_ModLines .= 992*0Sstevel@tonic-gate qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n); 993*0Sstevel@tonic-gate } 994*0Sstevel@tonic-gate warn $_ModLines; 995*0Sstevel@tonic-gate $_LocalVer = _mkversion(); 996*0Sstevel@tonic-gate $_E2X = find_e2x(); 997*0Sstevel@tonic-gate $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o; 998*0Sstevel@tonic-gate _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"), 999*0Sstevel@tonic-gate File::Spec->catfile($_Inc,"ConfigLocal.pm"), 1000*0Sstevel@tonic-gate 1); 1001*0Sstevel@tonic-gate exit; 1002*0Sstevel@tonic-gate} 1003*0Sstevel@tonic-gate 1004*0Sstevel@tonic-gatesub _mkversion{ 1005*0Sstevel@tonic-gate my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime(); 1006*0Sstevel@tonic-gate $yyyy += 1900, $mo +=1; 1007*0Sstevel@tonic-gate return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm); 1008*0Sstevel@tonic-gate} 1009*0Sstevel@tonic-gate 1010*0Sstevel@tonic-gatesub _print_expand{ 1011*0Sstevel@tonic-gate eval { require File::Basename; }; 1012*0Sstevel@tonic-gate $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n"; 1013*0Sstevel@tonic-gate File::Basename->import(); 1014*0Sstevel@tonic-gate my ($src, $dst, $clobber) = @_; 1015*0Sstevel@tonic-gate if (!$clobber and -e $dst){ 1016*0Sstevel@tonic-gate warn "$dst exists. skipping\n"; 1017*0Sstevel@tonic-gate return; 1018*0Sstevel@tonic-gate } 1019*0Sstevel@tonic-gate warn "Generating $dst...\n"; 1020*0Sstevel@tonic-gate open my $in, $src or die "$src : $!"; 1021*0Sstevel@tonic-gate if ((my $d = dirname($dst)) ne '.'){ 1022*0Sstevel@tonic-gate -d $d or mkdir $d, 0755 or die "mkdir $d : $!"; 1023*0Sstevel@tonic-gate } 1024*0Sstevel@tonic-gate open my $out, ">$dst" or die "$!"; 1025*0Sstevel@tonic-gate my $asis = 0; 1026*0Sstevel@tonic-gate while (<$in>){ 1027*0Sstevel@tonic-gate if (/^#### END_OF_HEADER/){ 1028*0Sstevel@tonic-gate $asis = 1; next; 1029*0Sstevel@tonic-gate } 1030*0Sstevel@tonic-gate s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis; 1031*0Sstevel@tonic-gate print $out $_; 1032*0Sstevel@tonic-gate } 1033*0Sstevel@tonic-gate} 1034*0Sstevel@tonic-gate__END__ 1035*0Sstevel@tonic-gate 1036*0Sstevel@tonic-gate=head1 NAME 1037*0Sstevel@tonic-gate 1038*0Sstevel@tonic-gateenc2xs -- Perl Encode Module Generator 1039*0Sstevel@tonic-gate 1040*0Sstevel@tonic-gate=head1 SYNOPSIS 1041*0Sstevel@tonic-gate 1042*0Sstevel@tonic-gate enc2xs -[options] 1043*0Sstevel@tonic-gate enc2xs -M ModName mapfiles... 1044*0Sstevel@tonic-gate enc2xs -C 1045*0Sstevel@tonic-gate 1046*0Sstevel@tonic-gate=head1 DESCRIPTION 1047*0Sstevel@tonic-gate 1048*0Sstevel@tonic-gateF<enc2xs> builds a Perl extension for use by Encode from either 1049*0Sstevel@tonic-gateUnicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc). 1050*0Sstevel@tonic-gateBesides being used internally during the build process of the Encode 1051*0Sstevel@tonic-gatemodule, you can use F<enc2xs> to add your own encoding to perl. 1052*0Sstevel@tonic-gateNo knowledge of XS is necessary. 1053*0Sstevel@tonic-gate 1054*0Sstevel@tonic-gate=head1 Quick Guide 1055*0Sstevel@tonic-gate 1056*0Sstevel@tonic-gateIf you want to know as little about Perl as possible but need to 1057*0Sstevel@tonic-gateadd a new encoding, just read this chapter and forget the rest. 1058*0Sstevel@tonic-gate 1059*0Sstevel@tonic-gate=over 4 1060*0Sstevel@tonic-gate 1061*0Sstevel@tonic-gate=item 0. 1062*0Sstevel@tonic-gate 1063*0Sstevel@tonic-gateHave a .ucm file ready. You can get it from somewhere or you can write 1064*0Sstevel@tonic-gateyour own from scratch or you can grab one from the Encode distribution 1065*0Sstevel@tonic-gateand customize it. For the UCM format, see the next Chapter. In the 1066*0Sstevel@tonic-gateexample below, I'll call my theoretical encoding myascii, defined 1067*0Sstevel@tonic-gatein I<my.ucm>. C<$> is a shell prompt. 1068*0Sstevel@tonic-gate 1069*0Sstevel@tonic-gate $ ls -F 1070*0Sstevel@tonic-gate my.ucm 1071*0Sstevel@tonic-gate 1072*0Sstevel@tonic-gate=item 1. 1073*0Sstevel@tonic-gate 1074*0Sstevel@tonic-gateIssue a command as follows; 1075*0Sstevel@tonic-gate 1076*0Sstevel@tonic-gate $ enc2xs -M My my.ucm 1077*0Sstevel@tonic-gate generating Makefile.PL 1078*0Sstevel@tonic-gate generating My.pm 1079*0Sstevel@tonic-gate generating README 1080*0Sstevel@tonic-gate generating Changes 1081*0Sstevel@tonic-gate 1082*0Sstevel@tonic-gateNow take a look at your current directory. It should look like this. 1083*0Sstevel@tonic-gate 1084*0Sstevel@tonic-gate $ ls -F 1085*0Sstevel@tonic-gate Makefile.PL My.pm my.ucm t/ 1086*0Sstevel@tonic-gate 1087*0Sstevel@tonic-gateThe following files were created. 1088*0Sstevel@tonic-gate 1089*0Sstevel@tonic-gate Makefile.PL - MakeMaker script 1090*0Sstevel@tonic-gate My.pm - Encode submodule 1091*0Sstevel@tonic-gate t/My.t - test file 1092*0Sstevel@tonic-gate 1093*0Sstevel@tonic-gate=over 4 1094*0Sstevel@tonic-gate 1095*0Sstevel@tonic-gate=item 1.1. 1096*0Sstevel@tonic-gate 1097*0Sstevel@tonic-gateIf you want *.ucm installed together with the modules, do as follows; 1098*0Sstevel@tonic-gate 1099*0Sstevel@tonic-gate $ mkdir Encode 1100*0Sstevel@tonic-gate $ mv *.ucm Encode 1101*0Sstevel@tonic-gate $ enc2xs -M My Encode/*ucm 1102*0Sstevel@tonic-gate 1103*0Sstevel@tonic-gate=back 1104*0Sstevel@tonic-gate 1105*0Sstevel@tonic-gate=item 2. 1106*0Sstevel@tonic-gate 1107*0Sstevel@tonic-gateEdit the files generated. You don't have to if you have no time AND no 1108*0Sstevel@tonic-gateintention to give it to someone else. But it is a good idea to edit 1109*0Sstevel@tonic-gatethe pod and to add more tests. 1110*0Sstevel@tonic-gate 1111*0Sstevel@tonic-gate=item 3. 1112*0Sstevel@tonic-gate 1113*0Sstevel@tonic-gateNow issue a command all Perl Mongers love: 1114*0Sstevel@tonic-gate 1115*0Sstevel@tonic-gate $ perl Makefile.PL 1116*0Sstevel@tonic-gate Writing Makefile for Encode::My 1117*0Sstevel@tonic-gate 1118*0Sstevel@tonic-gate=item 4. 1119*0Sstevel@tonic-gate 1120*0Sstevel@tonic-gateNow all you have to do is make. 1121*0Sstevel@tonic-gate 1122*0Sstevel@tonic-gate $ make 1123*0Sstevel@tonic-gate cp My.pm blib/lib/Encode/My.pm 1124*0Sstevel@tonic-gate /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \ 1125*0Sstevel@tonic-gate -o encode_t.c -f encode_t.fnm 1126*0Sstevel@tonic-gate Reading myascii (myascii) 1127*0Sstevel@tonic-gate Writing compiled form 1128*0Sstevel@tonic-gate 128 bytes in string tables 1129*0Sstevel@tonic-gate 384 bytes (25%) saved spotting duplicates 1130*0Sstevel@tonic-gate 1 bytes (99.2%) saved using substrings 1131*0Sstevel@tonic-gate .... 1132*0Sstevel@tonic-gate chmod 644 blib/arch/auto/Encode/My/My.bs 1133*0Sstevel@tonic-gate $ 1134*0Sstevel@tonic-gate 1135*0Sstevel@tonic-gateThe time it takes varies depending on how fast your machine is and 1136*0Sstevel@tonic-gatehow large your encoding is. Unless you are working on something big 1137*0Sstevel@tonic-gatelike euc-tw, it won't take too long. 1138*0Sstevel@tonic-gate 1139*0Sstevel@tonic-gate=item 5. 1140*0Sstevel@tonic-gate 1141*0Sstevel@tonic-gateYou can "make install" already but you should test first. 1142*0Sstevel@tonic-gate 1143*0Sstevel@tonic-gate $ make test 1144*0Sstevel@tonic-gate PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \ 1145*0Sstevel@tonic-gate -e 'use Test::Harness qw(&runtests $verbose); \ 1146*0Sstevel@tonic-gate $verbose=0; runtests @ARGV;' t/*.t 1147*0Sstevel@tonic-gate t/My....ok 1148*0Sstevel@tonic-gate All tests successful. 1149*0Sstevel@tonic-gate Files=1, Tests=2, 0 wallclock secs 1150*0Sstevel@tonic-gate ( 0.09 cusr + 0.01 csys = 0.09 CPU) 1151*0Sstevel@tonic-gate 1152*0Sstevel@tonic-gate=item 6. 1153*0Sstevel@tonic-gate 1154*0Sstevel@tonic-gateIf you are content with the test result, just "make install" 1155*0Sstevel@tonic-gate 1156*0Sstevel@tonic-gate=item 7. 1157*0Sstevel@tonic-gate 1158*0Sstevel@tonic-gateIf you want to add your encoding to Encode's demand-loading list 1159*0Sstevel@tonic-gate(so you don't have to "use Encode::YourEncoding"), run 1160*0Sstevel@tonic-gate 1161*0Sstevel@tonic-gate enc2xs -C 1162*0Sstevel@tonic-gate 1163*0Sstevel@tonic-gateto update Encode::ConfigLocal, a module that controls local settings. 1164*0Sstevel@tonic-gateAfter that, "use Encode;" is enough to load your encodings on demand. 1165*0Sstevel@tonic-gate 1166*0Sstevel@tonic-gate=back 1167*0Sstevel@tonic-gate 1168*0Sstevel@tonic-gate=head1 The Unicode Character Map 1169*0Sstevel@tonic-gate 1170*0Sstevel@tonic-gateEncode uses the Unicode Character Map (UCM) format for source character 1171*0Sstevel@tonic-gatemappings. This format is used by IBM's ICU package and was adopted 1172*0Sstevel@tonic-gateby Nick Ing-Simmons for use with the Encode module. Since UCM is 1173*0Sstevel@tonic-gatemore flexible than Tcl's Encoding Map and far more user-friendly, 1174*0Sstevel@tonic-gatethis is the recommended formet for Encode now. 1175*0Sstevel@tonic-gate 1176*0Sstevel@tonic-gateA UCM file looks like this. 1177*0Sstevel@tonic-gate 1178*0Sstevel@tonic-gate # 1179*0Sstevel@tonic-gate # Comments 1180*0Sstevel@tonic-gate # 1181*0Sstevel@tonic-gate <code_set_name> "US-ascii" # Required 1182*0Sstevel@tonic-gate <code_set_alias> "ascii" # Optional 1183*0Sstevel@tonic-gate <mb_cur_min> 1 # Required; usually 1 1184*0Sstevel@tonic-gate <mb_cur_max> 1 # Max. # of bytes/char 1185*0Sstevel@tonic-gate <subchar> \x3F # Substitution char 1186*0Sstevel@tonic-gate # 1187*0Sstevel@tonic-gate CHARMAP 1188*0Sstevel@tonic-gate <U0000> \x00 |0 # <control> 1189*0Sstevel@tonic-gate <U0001> \x01 |0 # <control> 1190*0Sstevel@tonic-gate <U0002> \x02 |0 # <control> 1191*0Sstevel@tonic-gate .... 1192*0Sstevel@tonic-gate <U007C> \x7C |0 # VERTICAL LINE 1193*0Sstevel@tonic-gate <U007D> \x7D |0 # RIGHT CURLY BRACKET 1194*0Sstevel@tonic-gate <U007E> \x7E |0 # TILDE 1195*0Sstevel@tonic-gate <U007F> \x7F |0 # <control> 1196*0Sstevel@tonic-gate END CHARMAP 1197*0Sstevel@tonic-gate 1198*0Sstevel@tonic-gate=over 4 1199*0Sstevel@tonic-gate 1200*0Sstevel@tonic-gate=item * 1201*0Sstevel@tonic-gate 1202*0Sstevel@tonic-gateAnything that follows C<#> is treated as a comment. 1203*0Sstevel@tonic-gate 1204*0Sstevel@tonic-gate=item * 1205*0Sstevel@tonic-gate 1206*0Sstevel@tonic-gateThe header section continues until a line containing the word 1207*0Sstevel@tonic-gateCHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one 1208*0Sstevel@tonic-gatepair per line. Strings used as values must be quoted. Barewords are 1209*0Sstevel@tonic-gatetreated as numbers. I<\xXX> represents a byte. 1210*0Sstevel@tonic-gate 1211*0Sstevel@tonic-gateMost of the keywords are self-explanatory. I<subchar> means 1212*0Sstevel@tonic-gatesubstitution character, not subcharacter. When you decode a Unicode 1213*0Sstevel@tonic-gatesequence to this encoding but no matching character is found, the byte 1214*0Sstevel@tonic-gatesequence defined here will be used. For most cases, the value here is 1215*0Sstevel@tonic-gate\x3F; in ASCII, this is a question mark. 1216*0Sstevel@tonic-gate 1217*0Sstevel@tonic-gate=item * 1218*0Sstevel@tonic-gate 1219*0Sstevel@tonic-gateCHARMAP starts the character map section. Each line has a form as 1220*0Sstevel@tonic-gatefollows: 1221*0Sstevel@tonic-gate 1222*0Sstevel@tonic-gate <UXXXX> \xXX.. |0 # comment 1223*0Sstevel@tonic-gate ^ ^ ^ 1224*0Sstevel@tonic-gate | | +- Fallback flag 1225*0Sstevel@tonic-gate | +-------- Encoded byte sequence 1226*0Sstevel@tonic-gate +-------------- Unicode Character ID in hex 1227*0Sstevel@tonic-gate 1228*0Sstevel@tonic-gateThe format is roughly the same as a header section except for the 1229*0Sstevel@tonic-gatefallback flag: | followed by 0..3. The meaning of the possible 1230*0Sstevel@tonic-gatevalues is as follows: 1231*0Sstevel@tonic-gate 1232*0Sstevel@tonic-gate=over 4 1233*0Sstevel@tonic-gate 1234*0Sstevel@tonic-gate=item |0 1235*0Sstevel@tonic-gate 1236*0Sstevel@tonic-gateRound trip safe. A character decoded to Unicode encodes back to the 1237*0Sstevel@tonic-gatesame byte sequence. Most characters have this flag. 1238*0Sstevel@tonic-gate 1239*0Sstevel@tonic-gate=item |1 1240*0Sstevel@tonic-gate 1241*0Sstevel@tonic-gateFallback for unicode -> encoding. When seen, enc2xs adds this 1242*0Sstevel@tonic-gatecharacter for the encode map only. 1243*0Sstevel@tonic-gate 1244*0Sstevel@tonic-gate=item |2 1245*0Sstevel@tonic-gate 1246*0Sstevel@tonic-gateSkip sub-char mapping should there be no code point. 1247*0Sstevel@tonic-gate 1248*0Sstevel@tonic-gate=item |3 1249*0Sstevel@tonic-gate 1250*0Sstevel@tonic-gateFallback for encoding -> unicode. When seen, enc2xs adds this 1251*0Sstevel@tonic-gatecharacter for the decode map only. 1252*0Sstevel@tonic-gate 1253*0Sstevel@tonic-gate=back 1254*0Sstevel@tonic-gate 1255*0Sstevel@tonic-gate=item * 1256*0Sstevel@tonic-gate 1257*0Sstevel@tonic-gateAnd finally, END OF CHARMAP ends the section. 1258*0Sstevel@tonic-gate 1259*0Sstevel@tonic-gate=back 1260*0Sstevel@tonic-gate 1261*0Sstevel@tonic-gateWhen you are manually creating a UCM file, you should copy ascii.ucm 1262*0Sstevel@tonic-gateor an existing encoding which is close to yours, rather than write 1263*0Sstevel@tonic-gateyour own from scratch. 1264*0Sstevel@tonic-gate 1265*0Sstevel@tonic-gateWhen you do so, make sure you leave at least B<U0000> to B<U0020> as 1266*0Sstevel@tonic-gateis, unless your environment is EBCDIC. 1267*0Sstevel@tonic-gate 1268*0Sstevel@tonic-gateB<CAVEAT>: not all features in UCM are implemented. For example, 1269*0Sstevel@tonic-gateicu:state is not used. Because of that, you need to write a perl 1270*0Sstevel@tonic-gatemodule if you want to support algorithmical encodings, notably 1271*0Sstevel@tonic-gatethe ISO-2022 series. Such modules include L<Encode::JP::2022_JP>, 1272*0Sstevel@tonic-gateL<Encode::KR::2022_KR>, and L<Encode::TW::HZ>. 1273*0Sstevel@tonic-gate 1274*0Sstevel@tonic-gate=head2 Coping with duplicate mappings 1275*0Sstevel@tonic-gate 1276*0Sstevel@tonic-gateWhen you create a map, you SHOULD make your mappings round-trip safe. 1277*0Sstevel@tonic-gateThat is, C<encode('your-encoding', decode('your-encoding', $data)) eq 1278*0Sstevel@tonic-gate$data> stands for all characters that are marked as C<|0>. Here is 1279*0Sstevel@tonic-gatehow to make sure: 1280*0Sstevel@tonic-gate 1281*0Sstevel@tonic-gate=over 4 1282*0Sstevel@tonic-gate 1283*0Sstevel@tonic-gate=item * 1284*0Sstevel@tonic-gate 1285*0Sstevel@tonic-gateSort your map in Unicode order. 1286*0Sstevel@tonic-gate 1287*0Sstevel@tonic-gate=item * 1288*0Sstevel@tonic-gate 1289*0Sstevel@tonic-gateWhen you have a duplicate entry, mark either one with '|1' or '|3'. 1290*0Sstevel@tonic-gate 1291*0Sstevel@tonic-gate=item * 1292*0Sstevel@tonic-gate 1293*0Sstevel@tonic-gateAnd make sure the '|1' or '|3' entry FOLLOWS the '|0' entry. 1294*0Sstevel@tonic-gate 1295*0Sstevel@tonic-gate=back 1296*0Sstevel@tonic-gate 1297*0Sstevel@tonic-gateHere is an example from big5-eten. 1298*0Sstevel@tonic-gate 1299*0Sstevel@tonic-gate <U2550> \xF9\xF9 |0 1300*0Sstevel@tonic-gate <U2550> \xA2\xA4 |3 1301*0Sstevel@tonic-gate 1302*0Sstevel@tonic-gateInternally Encoding -> Unicode and Unicode -> Encoding Map looks like 1303*0Sstevel@tonic-gatethis; 1304*0Sstevel@tonic-gate 1305*0Sstevel@tonic-gate E to U U to E 1306*0Sstevel@tonic-gate -------------------------------------- 1307*0Sstevel@tonic-gate \xF9\xF9 => U2550 U2550 => \xF9\xF9 1308*0Sstevel@tonic-gate \xA2\xA4 => U2550 1309*0Sstevel@tonic-gate 1310*0Sstevel@tonic-gateSo it is round-trip safe for \xF9\xF9. But if the line above is upside 1311*0Sstevel@tonic-gatedown, here is what happens. 1312*0Sstevel@tonic-gate 1313*0Sstevel@tonic-gate E to U U to E 1314*0Sstevel@tonic-gate -------------------------------------- 1315*0Sstevel@tonic-gate \xA2\xA4 => U2550 U2550 => \xF9\xF9 1316*0Sstevel@tonic-gate (\xF9\xF9 => U2550 is now overwritten!) 1317*0Sstevel@tonic-gate 1318*0Sstevel@tonic-gateThe Encode package comes with F<ucmlint>, a crude but sufficient 1319*0Sstevel@tonic-gateutility to check the integrity of a UCM file. Check under the 1320*0Sstevel@tonic-gateEncode/bin directory for this. 1321*0Sstevel@tonic-gate 1322*0Sstevel@tonic-gate 1323*0Sstevel@tonic-gate=head1 Bookmarks 1324*0Sstevel@tonic-gate 1325*0Sstevel@tonic-gate=over 4 1326*0Sstevel@tonic-gate 1327*0Sstevel@tonic-gate=item * 1328*0Sstevel@tonic-gate 1329*0Sstevel@tonic-gateICU Home Page 1330*0Sstevel@tonic-gateL<http://oss.software.ibm.com/icu/> 1331*0Sstevel@tonic-gate 1332*0Sstevel@tonic-gate=item * 1333*0Sstevel@tonic-gate 1334*0Sstevel@tonic-gateICU Character Mapping Tables 1335*0Sstevel@tonic-gateL<http://oss.software.ibm.com/icu/charset/> 1336*0Sstevel@tonic-gate 1337*0Sstevel@tonic-gate=item * 1338*0Sstevel@tonic-gate 1339*0Sstevel@tonic-gateICU:Conversion Data 1340*0Sstevel@tonic-gateL<http://oss.software.ibm.com/icu/userguide/conversion-data.html> 1341*0Sstevel@tonic-gate 1342*0Sstevel@tonic-gate=back 1343*0Sstevel@tonic-gate 1344*0Sstevel@tonic-gate=head1 SEE ALSO 1345*0Sstevel@tonic-gate 1346*0Sstevel@tonic-gateL<Encode>, 1347*0Sstevel@tonic-gateL<perlmod>, 1348*0Sstevel@tonic-gateL<perlpod> 1349*0Sstevel@tonic-gate 1350*0Sstevel@tonic-gate=cut 1351*0Sstevel@tonic-gate 1352*0Sstevel@tonic-gate# -Q to disable the duplicate codepoint test 1353*0Sstevel@tonic-gate# -S make mapping errors fatal 1354*0Sstevel@tonic-gate# -q to remove comments written to output files 1355*0Sstevel@tonic-gate# -O to enable the (brute force) substring optimiser 1356*0Sstevel@tonic-gate# -o <output> to specify the output file name (else it's the first arg) 1357*0Sstevel@tonic-gate# -f <inlist> to give a file with a list of input files (else use the args) 1358*0Sstevel@tonic-gate# -n <name> to name the encoding (else use the basename of the input file. 1359*0Sstevel@tonic-gate 1360*0Sstevel@tonic-gateWith %seen holding array refs: 1361*0Sstevel@tonic-gate 1362*0Sstevel@tonic-gate 865.66 real 28.80 user 8.79 sys 1363*0Sstevel@tonic-gate 7904 maximum resident set size 1364*0Sstevel@tonic-gate 1356 average shared memory size 1365*0Sstevel@tonic-gate 18566 average unshared data size 1366*0Sstevel@tonic-gate 229 average unshared stack size 1367*0Sstevel@tonic-gate 46080 page reclaims 1368*0Sstevel@tonic-gate 33373 page faults 1369*0Sstevel@tonic-gate 1370*0Sstevel@tonic-gateWith %seen holding simple scalars: 1371*0Sstevel@tonic-gate 1372*0Sstevel@tonic-gate 342.16 real 27.11 user 3.54 sys 1373*0Sstevel@tonic-gate 8388 maximum resident set size 1374*0Sstevel@tonic-gate 1394 average shared memory size 1375*0Sstevel@tonic-gate 14969 average unshared data size 1376*0Sstevel@tonic-gate 236 average unshared stack size 1377*0Sstevel@tonic-gate 28159 page reclaims 1378*0Sstevel@tonic-gate 9839 page faults 1379*0Sstevel@tonic-gate 1380*0Sstevel@tonic-gateYes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is 1381*0Sstevel@tonic-gatehow %seen is storing things its seen. So it is pathalogically bad on a 16M 1382*0Sstevel@tonic-gateRAM machine, but it's going to help even on modern machines. 1383*0Sstevel@tonic-gateSwapping is bad, m'kay :-) 1384