1*0Sstevel@tonic-gatepackage Unicode::UCD; 2*0Sstevel@tonic-gate 3*0Sstevel@tonic-gateuse strict; 4*0Sstevel@tonic-gateuse warnings; 5*0Sstevel@tonic-gate 6*0Sstevel@tonic-gateour $VERSION = '0.22'; 7*0Sstevel@tonic-gate 8*0Sstevel@tonic-gateuse Storable qw(dclone); 9*0Sstevel@tonic-gate 10*0Sstevel@tonic-gaterequire Exporter; 11*0Sstevel@tonic-gate 12*0Sstevel@tonic-gateour @ISA = qw(Exporter); 13*0Sstevel@tonic-gate 14*0Sstevel@tonic-gateour @EXPORT_OK = qw(charinfo 15*0Sstevel@tonic-gate charblock charscript 16*0Sstevel@tonic-gate charblocks charscripts 17*0Sstevel@tonic-gate charinrange 18*0Sstevel@tonic-gate compexcl 19*0Sstevel@tonic-gate casefold casespec); 20*0Sstevel@tonic-gate 21*0Sstevel@tonic-gateuse Carp; 22*0Sstevel@tonic-gate 23*0Sstevel@tonic-gate=head1 NAME 24*0Sstevel@tonic-gate 25*0Sstevel@tonic-gateUnicode::UCD - Unicode character database 26*0Sstevel@tonic-gate 27*0Sstevel@tonic-gate=head1 SYNOPSIS 28*0Sstevel@tonic-gate 29*0Sstevel@tonic-gate use Unicode::UCD 'charinfo'; 30*0Sstevel@tonic-gate my $charinfo = charinfo($codepoint); 31*0Sstevel@tonic-gate 32*0Sstevel@tonic-gate use Unicode::UCD 'charblock'; 33*0Sstevel@tonic-gate my $charblock = charblock($codepoint); 34*0Sstevel@tonic-gate 35*0Sstevel@tonic-gate use Unicode::UCD 'charscript'; 36*0Sstevel@tonic-gate my $charscript = charscript($codepoint); 37*0Sstevel@tonic-gate 38*0Sstevel@tonic-gate use Unicode::UCD 'charblocks'; 39*0Sstevel@tonic-gate my $charblocks = charblocks(); 40*0Sstevel@tonic-gate 41*0Sstevel@tonic-gate use Unicode::UCD 'charscripts'; 42*0Sstevel@tonic-gate my %charscripts = charscripts(); 43*0Sstevel@tonic-gate 44*0Sstevel@tonic-gate use Unicode::UCD qw(charscript charinrange); 45*0Sstevel@tonic-gate my $range = charscript($script); 46*0Sstevel@tonic-gate print "looks like $script\n" if charinrange($range, $codepoint); 47*0Sstevel@tonic-gate 48*0Sstevel@tonic-gate use Unicode::UCD 'compexcl'; 49*0Sstevel@tonic-gate my $compexcl = compexcl($codepoint); 50*0Sstevel@tonic-gate 51*0Sstevel@tonic-gate my $unicode_version = Unicode::UCD::UnicodeVersion(); 52*0Sstevel@tonic-gate 53*0Sstevel@tonic-gate=head1 DESCRIPTION 54*0Sstevel@tonic-gate 55*0Sstevel@tonic-gateThe Unicode::UCD module offers a simple interface to the Unicode 56*0Sstevel@tonic-gateCharacter Database. 57*0Sstevel@tonic-gate 58*0Sstevel@tonic-gate=cut 59*0Sstevel@tonic-gate 60*0Sstevel@tonic-gatemy $UNICODEFH; 61*0Sstevel@tonic-gatemy $BLOCKSFH; 62*0Sstevel@tonic-gatemy $SCRIPTSFH; 63*0Sstevel@tonic-gatemy $VERSIONFH; 64*0Sstevel@tonic-gatemy $COMPEXCLFH; 65*0Sstevel@tonic-gatemy $CASEFOLDFH; 66*0Sstevel@tonic-gatemy $CASESPECFH; 67*0Sstevel@tonic-gate 68*0Sstevel@tonic-gatesub openunicode { 69*0Sstevel@tonic-gate my ($rfh, @path) = @_; 70*0Sstevel@tonic-gate my $f; 71*0Sstevel@tonic-gate unless (defined $$rfh) { 72*0Sstevel@tonic-gate for my $d (@INC) { 73*0Sstevel@tonic-gate use File::Spec; 74*0Sstevel@tonic-gate $f = File::Spec->catfile($d, "unicore", @path); 75*0Sstevel@tonic-gate last if open($$rfh, $f); 76*0Sstevel@tonic-gate undef $f; 77*0Sstevel@tonic-gate } 78*0Sstevel@tonic-gate croak __PACKAGE__, ": failed to find ", 79*0Sstevel@tonic-gate File::Spec->catfile(@path), " in @INC" 80*0Sstevel@tonic-gate unless defined $f; 81*0Sstevel@tonic-gate } 82*0Sstevel@tonic-gate return $f; 83*0Sstevel@tonic-gate} 84*0Sstevel@tonic-gate 85*0Sstevel@tonic-gate=head2 charinfo 86*0Sstevel@tonic-gate 87*0Sstevel@tonic-gate use Unicode::UCD 'charinfo'; 88*0Sstevel@tonic-gate 89*0Sstevel@tonic-gate my $charinfo = charinfo(0x41); 90*0Sstevel@tonic-gate 91*0Sstevel@tonic-gatecharinfo() returns a reference to a hash that has the following fields 92*0Sstevel@tonic-gateas defined by the Unicode standard: 93*0Sstevel@tonic-gate 94*0Sstevel@tonic-gate key 95*0Sstevel@tonic-gate 96*0Sstevel@tonic-gate code code point with at least four hexdigits 97*0Sstevel@tonic-gate name name of the character IN UPPER CASE 98*0Sstevel@tonic-gate category general category of the character 99*0Sstevel@tonic-gate combining classes used in the Canonical Ordering Algorithm 100*0Sstevel@tonic-gate bidi bidirectional category 101*0Sstevel@tonic-gate decomposition character decomposition mapping 102*0Sstevel@tonic-gate decimal if decimal digit this is the integer numeric value 103*0Sstevel@tonic-gate digit if digit this is the numeric value 104*0Sstevel@tonic-gate numeric if numeric is the integer or rational numeric value 105*0Sstevel@tonic-gate mirrored if mirrored in bidirectional text 106*0Sstevel@tonic-gate unicode10 Unicode 1.0 name if existed and different 107*0Sstevel@tonic-gate comment ISO 10646 comment field 108*0Sstevel@tonic-gate upper uppercase equivalent mapping 109*0Sstevel@tonic-gate lower lowercase equivalent mapping 110*0Sstevel@tonic-gate title titlecase equivalent mapping 111*0Sstevel@tonic-gate 112*0Sstevel@tonic-gate block block the character belongs to (used in \p{In...}) 113*0Sstevel@tonic-gate script script the character belongs to 114*0Sstevel@tonic-gate 115*0Sstevel@tonic-gateIf no match is found, a reference to an empty hash is returned. 116*0Sstevel@tonic-gate 117*0Sstevel@tonic-gateThe C<block> property is the same as returned by charinfo(). It is 118*0Sstevel@tonic-gatenot defined in the Unicode Character Database proper (Chapter 4 of the 119*0Sstevel@tonic-gateUnicode 3.0 Standard, aka TUS3) but instead in an auxiliary database 120*0Sstevel@tonic-gate(Chapter 14 of TUS3). Similarly for the C<script> property. 121*0Sstevel@tonic-gate 122*0Sstevel@tonic-gateNote that you cannot do (de)composition and casing based solely on the 123*0Sstevel@tonic-gateabove C<decomposition> and C<lower>, C<upper>, C<title>, properties, 124*0Sstevel@tonic-gateyou will need also the compexcl(), casefold(), and casespec() functions. 125*0Sstevel@tonic-gate 126*0Sstevel@tonic-gate=cut 127*0Sstevel@tonic-gate 128*0Sstevel@tonic-gate# NB: This function is duplicated in charnames.pm 129*0Sstevel@tonic-gatesub _getcode { 130*0Sstevel@tonic-gate my $arg = shift; 131*0Sstevel@tonic-gate 132*0Sstevel@tonic-gate if ($arg =~ /^[1-9]\d*$/) { 133*0Sstevel@tonic-gate return $arg; 134*0Sstevel@tonic-gate } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { 135*0Sstevel@tonic-gate return hex($1); 136*0Sstevel@tonic-gate } 137*0Sstevel@tonic-gate 138*0Sstevel@tonic-gate return; 139*0Sstevel@tonic-gate} 140*0Sstevel@tonic-gate 141*0Sstevel@tonic-gate# Lingua::KO::Hangul::Util not part of the standard distribution 142*0Sstevel@tonic-gate# but it will be used if available. 143*0Sstevel@tonic-gate 144*0Sstevel@tonic-gateeval { require Lingua::KO::Hangul::Util }; 145*0Sstevel@tonic-gatemy $hasHangulUtil = ! $@; 146*0Sstevel@tonic-gateif ($hasHangulUtil) { 147*0Sstevel@tonic-gate Lingua::KO::Hangul::Util->import(); 148*0Sstevel@tonic-gate} 149*0Sstevel@tonic-gate 150*0Sstevel@tonic-gatesub hangul_decomp { # internal: called from charinfo 151*0Sstevel@tonic-gate if ($hasHangulUtil) { 152*0Sstevel@tonic-gate my @tmp = decomposeHangul(shift); 153*0Sstevel@tonic-gate return sprintf("%04X %04X", @tmp) if @tmp == 2; 154*0Sstevel@tonic-gate return sprintf("%04X %04X %04X", @tmp) if @tmp == 3; 155*0Sstevel@tonic-gate } 156*0Sstevel@tonic-gate return; 157*0Sstevel@tonic-gate} 158*0Sstevel@tonic-gate 159*0Sstevel@tonic-gatesub hangul_charname { # internal: called from charinfo 160*0Sstevel@tonic-gate return sprintf("HANGUL SYLLABLE-%04X", shift); 161*0Sstevel@tonic-gate} 162*0Sstevel@tonic-gate 163*0Sstevel@tonic-gatesub han_charname { # internal: called from charinfo 164*0Sstevel@tonic-gate return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift); 165*0Sstevel@tonic-gate} 166*0Sstevel@tonic-gate 167*0Sstevel@tonic-gatemy @CharinfoRanges = ( 168*0Sstevel@tonic-gate# block name 169*0Sstevel@tonic-gate# [ first, last, coderef to name, coderef to decompose ], 170*0Sstevel@tonic-gate# CJK Ideographs Extension A 171*0Sstevel@tonic-gate [ 0x3400, 0x4DB5, \&han_charname, undef ], 172*0Sstevel@tonic-gate# CJK Ideographs 173*0Sstevel@tonic-gate [ 0x4E00, 0x9FA5, \&han_charname, undef ], 174*0Sstevel@tonic-gate# Hangul Syllables 175*0Sstevel@tonic-gate [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ], 176*0Sstevel@tonic-gate# Non-Private Use High Surrogates 177*0Sstevel@tonic-gate [ 0xD800, 0xDB7F, undef, undef ], 178*0Sstevel@tonic-gate# Private Use High Surrogates 179*0Sstevel@tonic-gate [ 0xDB80, 0xDBFF, undef, undef ], 180*0Sstevel@tonic-gate# Low Surrogates 181*0Sstevel@tonic-gate [ 0xDC00, 0xDFFF, undef, undef ], 182*0Sstevel@tonic-gate# The Private Use Area 183*0Sstevel@tonic-gate [ 0xE000, 0xF8FF, undef, undef ], 184*0Sstevel@tonic-gate# CJK Ideographs Extension B 185*0Sstevel@tonic-gate [ 0x20000, 0x2A6D6, \&han_charname, undef ], 186*0Sstevel@tonic-gate# Plane 15 Private Use Area 187*0Sstevel@tonic-gate [ 0xF0000, 0xFFFFD, undef, undef ], 188*0Sstevel@tonic-gate# Plane 16 Private Use Area 189*0Sstevel@tonic-gate [ 0x100000, 0x10FFFD, undef, undef ], 190*0Sstevel@tonic-gate); 191*0Sstevel@tonic-gate 192*0Sstevel@tonic-gatesub charinfo { 193*0Sstevel@tonic-gate my $arg = shift; 194*0Sstevel@tonic-gate my $code = _getcode($arg); 195*0Sstevel@tonic-gate croak __PACKAGE__, "::charinfo: unknown code '$arg'" 196*0Sstevel@tonic-gate unless defined $code; 197*0Sstevel@tonic-gate my $hexk = sprintf("%06X", $code); 198*0Sstevel@tonic-gate my($rcode,$rname,$rdec); 199*0Sstevel@tonic-gate foreach my $range (@CharinfoRanges){ 200*0Sstevel@tonic-gate if ($range->[0] <= $code && $code <= $range->[1]) { 201*0Sstevel@tonic-gate $rcode = $hexk; 202*0Sstevel@tonic-gate $rcode =~ s/^0+//; 203*0Sstevel@tonic-gate $rcode = sprintf("%04X", hex($rcode)); 204*0Sstevel@tonic-gate $rname = $range->[2] ? $range->[2]->($code) : ''; 205*0Sstevel@tonic-gate $rdec = $range->[3] ? $range->[3]->($code) : ''; 206*0Sstevel@tonic-gate $hexk = sprintf("%06X", $range->[0]); # replace by the first 207*0Sstevel@tonic-gate last; 208*0Sstevel@tonic-gate } 209*0Sstevel@tonic-gate } 210*0Sstevel@tonic-gate openunicode(\$UNICODEFH, "UnicodeData.txt"); 211*0Sstevel@tonic-gate if (defined $UNICODEFH) { 212*0Sstevel@tonic-gate use Search::Dict 1.02; 213*0Sstevel@tonic-gate if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) { 214*0Sstevel@tonic-gate my $line = <$UNICODEFH>; 215*0Sstevel@tonic-gate return unless defined $line; 216*0Sstevel@tonic-gate chomp $line; 217*0Sstevel@tonic-gate my %prop; 218*0Sstevel@tonic-gate @prop{qw( 219*0Sstevel@tonic-gate code name category 220*0Sstevel@tonic-gate combining bidi decomposition 221*0Sstevel@tonic-gate decimal digit numeric 222*0Sstevel@tonic-gate mirrored unicode10 comment 223*0Sstevel@tonic-gate upper lower title 224*0Sstevel@tonic-gate )} = split(/;/, $line, -1); 225*0Sstevel@tonic-gate $hexk =~ s/^0+//; 226*0Sstevel@tonic-gate $hexk = sprintf("%04X", hex($hexk)); 227*0Sstevel@tonic-gate if ($prop{code} eq $hexk) { 228*0Sstevel@tonic-gate $prop{block} = charblock($code); 229*0Sstevel@tonic-gate $prop{script} = charscript($code); 230*0Sstevel@tonic-gate if(defined $rname){ 231*0Sstevel@tonic-gate $prop{code} = $rcode; 232*0Sstevel@tonic-gate $prop{name} = $rname; 233*0Sstevel@tonic-gate $prop{decomposition} = $rdec; 234*0Sstevel@tonic-gate } 235*0Sstevel@tonic-gate return \%prop; 236*0Sstevel@tonic-gate } 237*0Sstevel@tonic-gate } 238*0Sstevel@tonic-gate } 239*0Sstevel@tonic-gate return; 240*0Sstevel@tonic-gate} 241*0Sstevel@tonic-gate 242*0Sstevel@tonic-gatesub _search { # Binary search in a [[lo,hi,prop],[...],...] table. 243*0Sstevel@tonic-gate my ($table, $lo, $hi, $code) = @_; 244*0Sstevel@tonic-gate 245*0Sstevel@tonic-gate return if $lo > $hi; 246*0Sstevel@tonic-gate 247*0Sstevel@tonic-gate my $mid = int(($lo+$hi) / 2); 248*0Sstevel@tonic-gate 249*0Sstevel@tonic-gate if ($table->[$mid]->[0] < $code) { 250*0Sstevel@tonic-gate if ($table->[$mid]->[1] >= $code) { 251*0Sstevel@tonic-gate return $table->[$mid]->[2]; 252*0Sstevel@tonic-gate } else { 253*0Sstevel@tonic-gate _search($table, $mid + 1, $hi, $code); 254*0Sstevel@tonic-gate } 255*0Sstevel@tonic-gate } elsif ($table->[$mid]->[0] > $code) { 256*0Sstevel@tonic-gate _search($table, $lo, $mid - 1, $code); 257*0Sstevel@tonic-gate } else { 258*0Sstevel@tonic-gate return $table->[$mid]->[2]; 259*0Sstevel@tonic-gate } 260*0Sstevel@tonic-gate} 261*0Sstevel@tonic-gate 262*0Sstevel@tonic-gatesub charinrange { 263*0Sstevel@tonic-gate my ($range, $arg) = @_; 264*0Sstevel@tonic-gate my $code = _getcode($arg); 265*0Sstevel@tonic-gate croak __PACKAGE__, "::charinrange: unknown code '$arg'" 266*0Sstevel@tonic-gate unless defined $code; 267*0Sstevel@tonic-gate _search($range, 0, $#$range, $code); 268*0Sstevel@tonic-gate} 269*0Sstevel@tonic-gate 270*0Sstevel@tonic-gate=head2 charblock 271*0Sstevel@tonic-gate 272*0Sstevel@tonic-gate use Unicode::UCD 'charblock'; 273*0Sstevel@tonic-gate 274*0Sstevel@tonic-gate my $charblock = charblock(0x41); 275*0Sstevel@tonic-gate my $charblock = charblock(1234); 276*0Sstevel@tonic-gate my $charblock = charblock("0x263a"); 277*0Sstevel@tonic-gate my $charblock = charblock("U+263a"); 278*0Sstevel@tonic-gate 279*0Sstevel@tonic-gate my $range = charblock('Armenian'); 280*0Sstevel@tonic-gate 281*0Sstevel@tonic-gateWith a B<code point argument> charblock() returns the I<block> the character 282*0Sstevel@tonic-gatebelongs to, e.g. C<Basic Latin>. Note that not all the character 283*0Sstevel@tonic-gatepositions within all blocks are defined. 284*0Sstevel@tonic-gate 285*0Sstevel@tonic-gateSee also L</Blocks versus Scripts>. 286*0Sstevel@tonic-gate 287*0Sstevel@tonic-gateIf supplied with an argument that can't be a code point, charblock() tries 288*0Sstevel@tonic-gateto do the opposite and interpret the argument as a character block. The 289*0Sstevel@tonic-gatereturn value is a I<range>: an anonymous list of lists that contain 290*0Sstevel@tonic-gateI<start-of-range>, I<end-of-range> code point pairs. You can test whether a 291*0Sstevel@tonic-gatecode point is in a range using the L</charinrange> function. If the 292*0Sstevel@tonic-gateargument is not a known charater block, C<undef> is returned. 293*0Sstevel@tonic-gate 294*0Sstevel@tonic-gate=cut 295*0Sstevel@tonic-gate 296*0Sstevel@tonic-gatemy @BLOCKS; 297*0Sstevel@tonic-gatemy %BLOCKS; 298*0Sstevel@tonic-gate 299*0Sstevel@tonic-gatesub _charblocks { 300*0Sstevel@tonic-gate unless (@BLOCKS) { 301*0Sstevel@tonic-gate if (openunicode(\$BLOCKSFH, "Blocks.txt")) { 302*0Sstevel@tonic-gate local $_; 303*0Sstevel@tonic-gate while (<$BLOCKSFH>) { 304*0Sstevel@tonic-gate if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { 305*0Sstevel@tonic-gate my ($lo, $hi) = (hex($1), hex($2)); 306*0Sstevel@tonic-gate my $subrange = [ $lo, $hi, $3 ]; 307*0Sstevel@tonic-gate push @BLOCKS, $subrange; 308*0Sstevel@tonic-gate push @{$BLOCKS{$3}}, $subrange; 309*0Sstevel@tonic-gate } 310*0Sstevel@tonic-gate } 311*0Sstevel@tonic-gate close($BLOCKSFH); 312*0Sstevel@tonic-gate } 313*0Sstevel@tonic-gate } 314*0Sstevel@tonic-gate} 315*0Sstevel@tonic-gate 316*0Sstevel@tonic-gatesub charblock { 317*0Sstevel@tonic-gate my $arg = shift; 318*0Sstevel@tonic-gate 319*0Sstevel@tonic-gate _charblocks() unless @BLOCKS; 320*0Sstevel@tonic-gate 321*0Sstevel@tonic-gate my $code = _getcode($arg); 322*0Sstevel@tonic-gate 323*0Sstevel@tonic-gate if (defined $code) { 324*0Sstevel@tonic-gate _search(\@BLOCKS, 0, $#BLOCKS, $code); 325*0Sstevel@tonic-gate } else { 326*0Sstevel@tonic-gate if (exists $BLOCKS{$arg}) { 327*0Sstevel@tonic-gate return dclone $BLOCKS{$arg}; 328*0Sstevel@tonic-gate } else { 329*0Sstevel@tonic-gate return; 330*0Sstevel@tonic-gate } 331*0Sstevel@tonic-gate } 332*0Sstevel@tonic-gate} 333*0Sstevel@tonic-gate 334*0Sstevel@tonic-gate=head2 charscript 335*0Sstevel@tonic-gate 336*0Sstevel@tonic-gate use Unicode::UCD 'charscript'; 337*0Sstevel@tonic-gate 338*0Sstevel@tonic-gate my $charscript = charscript(0x41); 339*0Sstevel@tonic-gate my $charscript = charscript(1234); 340*0Sstevel@tonic-gate my $charscript = charscript("U+263a"); 341*0Sstevel@tonic-gate 342*0Sstevel@tonic-gate my $range = charscript('Thai'); 343*0Sstevel@tonic-gate 344*0Sstevel@tonic-gateWith a B<code point argument> charscript() returns the I<script> the 345*0Sstevel@tonic-gatecharacter belongs to, e.g. C<Latin>, C<Greek>, C<Han>. 346*0Sstevel@tonic-gate 347*0Sstevel@tonic-gateSee also L</Blocks versus Scripts>. 348*0Sstevel@tonic-gate 349*0Sstevel@tonic-gateIf supplied with an argument that can't be a code point, charscript() tries 350*0Sstevel@tonic-gateto do the opposite and interpret the argument as a character script. The 351*0Sstevel@tonic-gatereturn value is a I<range>: an anonymous list of lists that contain 352*0Sstevel@tonic-gateI<start-of-range>, I<end-of-range> code point pairs. You can test whether a 353*0Sstevel@tonic-gatecode point is in a range using the L</charinrange> function. If the 354*0Sstevel@tonic-gateargument is not a known charater script, C<undef> is returned. 355*0Sstevel@tonic-gate 356*0Sstevel@tonic-gate=cut 357*0Sstevel@tonic-gate 358*0Sstevel@tonic-gatemy @SCRIPTS; 359*0Sstevel@tonic-gatemy %SCRIPTS; 360*0Sstevel@tonic-gate 361*0Sstevel@tonic-gatesub _charscripts { 362*0Sstevel@tonic-gate unless (@SCRIPTS) { 363*0Sstevel@tonic-gate if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { 364*0Sstevel@tonic-gate local $_; 365*0Sstevel@tonic-gate while (<$SCRIPTSFH>) { 366*0Sstevel@tonic-gate if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { 367*0Sstevel@tonic-gate my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); 368*0Sstevel@tonic-gate my $script = lc($3); 369*0Sstevel@tonic-gate $script =~ s/\b(\w)/uc($1)/ge; 370*0Sstevel@tonic-gate my $subrange = [ $lo, $hi, $script ]; 371*0Sstevel@tonic-gate push @SCRIPTS, $subrange; 372*0Sstevel@tonic-gate push @{$SCRIPTS{$script}}, $subrange; 373*0Sstevel@tonic-gate } 374*0Sstevel@tonic-gate } 375*0Sstevel@tonic-gate close($SCRIPTSFH); 376*0Sstevel@tonic-gate @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS; 377*0Sstevel@tonic-gate } 378*0Sstevel@tonic-gate } 379*0Sstevel@tonic-gate} 380*0Sstevel@tonic-gate 381*0Sstevel@tonic-gatesub charscript { 382*0Sstevel@tonic-gate my $arg = shift; 383*0Sstevel@tonic-gate 384*0Sstevel@tonic-gate _charscripts() unless @SCRIPTS; 385*0Sstevel@tonic-gate 386*0Sstevel@tonic-gate my $code = _getcode($arg); 387*0Sstevel@tonic-gate 388*0Sstevel@tonic-gate if (defined $code) { 389*0Sstevel@tonic-gate _search(\@SCRIPTS, 0, $#SCRIPTS, $code); 390*0Sstevel@tonic-gate } else { 391*0Sstevel@tonic-gate if (exists $SCRIPTS{$arg}) { 392*0Sstevel@tonic-gate return dclone $SCRIPTS{$arg}; 393*0Sstevel@tonic-gate } else { 394*0Sstevel@tonic-gate return; 395*0Sstevel@tonic-gate } 396*0Sstevel@tonic-gate } 397*0Sstevel@tonic-gate} 398*0Sstevel@tonic-gate 399*0Sstevel@tonic-gate=head2 charblocks 400*0Sstevel@tonic-gate 401*0Sstevel@tonic-gate use Unicode::UCD 'charblocks'; 402*0Sstevel@tonic-gate 403*0Sstevel@tonic-gate my $charblocks = charblocks(); 404*0Sstevel@tonic-gate 405*0Sstevel@tonic-gatecharblocks() returns a reference to a hash with the known block names 406*0Sstevel@tonic-gateas the keys, and the code point ranges (see L</charblock>) as the values. 407*0Sstevel@tonic-gate 408*0Sstevel@tonic-gateSee also L</Blocks versus Scripts>. 409*0Sstevel@tonic-gate 410*0Sstevel@tonic-gate=cut 411*0Sstevel@tonic-gate 412*0Sstevel@tonic-gatesub charblocks { 413*0Sstevel@tonic-gate _charblocks() unless %BLOCKS; 414*0Sstevel@tonic-gate return dclone \%BLOCKS; 415*0Sstevel@tonic-gate} 416*0Sstevel@tonic-gate 417*0Sstevel@tonic-gate=head2 charscripts 418*0Sstevel@tonic-gate 419*0Sstevel@tonic-gate use Unicode::UCD 'charscripts'; 420*0Sstevel@tonic-gate 421*0Sstevel@tonic-gate my %charscripts = charscripts(); 422*0Sstevel@tonic-gate 423*0Sstevel@tonic-gatecharscripts() returns a hash with the known script names as the keys, 424*0Sstevel@tonic-gateand the code point ranges (see L</charscript>) as the values. 425*0Sstevel@tonic-gate 426*0Sstevel@tonic-gateSee also L</Blocks versus Scripts>. 427*0Sstevel@tonic-gate 428*0Sstevel@tonic-gate=cut 429*0Sstevel@tonic-gate 430*0Sstevel@tonic-gatesub charscripts { 431*0Sstevel@tonic-gate _charscripts() unless %SCRIPTS; 432*0Sstevel@tonic-gate return dclone \%SCRIPTS; 433*0Sstevel@tonic-gate} 434*0Sstevel@tonic-gate 435*0Sstevel@tonic-gate=head2 Blocks versus Scripts 436*0Sstevel@tonic-gate 437*0Sstevel@tonic-gateThe difference between a block and a script is that scripts are closer 438*0Sstevel@tonic-gateto the linguistic notion of a set of characters required to present 439*0Sstevel@tonic-gatelanguages, while block is more of an artifact of the Unicode character 440*0Sstevel@tonic-gatenumbering and separation into blocks of (mostly) 256 characters. 441*0Sstevel@tonic-gate 442*0Sstevel@tonic-gateFor example the Latin B<script> is spread over several B<blocks>, such 443*0Sstevel@tonic-gateas C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and 444*0Sstevel@tonic-gateC<Latin Extended-B>. On the other hand, the Latin script does not 445*0Sstevel@tonic-gatecontain all the characters of the C<Basic Latin> block (also known as 446*0Sstevel@tonic-gatethe ASCII): it includes only the letters, and not, for example, the digits 447*0Sstevel@tonic-gateor the punctuation. 448*0Sstevel@tonic-gate 449*0Sstevel@tonic-gateFor blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt 450*0Sstevel@tonic-gate 451*0Sstevel@tonic-gateFor scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/ 452*0Sstevel@tonic-gate 453*0Sstevel@tonic-gate=head2 Matching Scripts and Blocks 454*0Sstevel@tonic-gate 455*0Sstevel@tonic-gateScripts are matched with the regular-expression construct 456*0Sstevel@tonic-gateC<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script), 457*0Sstevel@tonic-gatewhile C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches 458*0Sstevel@tonic-gateany of the 256 code points in the Tibetan block). 459*0Sstevel@tonic-gate 460*0Sstevel@tonic-gate=head2 Code Point Arguments 461*0Sstevel@tonic-gate 462*0Sstevel@tonic-gateA I<code point argument> is either a decimal or a hexadecimal scalar 463*0Sstevel@tonic-gatedesignating a Unicode character, or C<U+> followed by hexadecimals 464*0Sstevel@tonic-gatedesignating a Unicode character. In other words, if you want a code 465*0Sstevel@tonic-gatepoint to be interpreted as a hexadecimal number, you must prefix it 466*0Sstevel@tonic-gatewith either C<0x> or C<U+>, because a string like e.g. C<123> will 467*0Sstevel@tonic-gatebe interpreted as a decimal code point. Also note that Unicode is 468*0Sstevel@tonic-gateB<not> limited to 16 bits (the number of Unicode characters is 469*0Sstevel@tonic-gateopen-ended, in theory unlimited): you may have more than 4 hexdigits. 470*0Sstevel@tonic-gate 471*0Sstevel@tonic-gate=head2 charinrange 472*0Sstevel@tonic-gate 473*0Sstevel@tonic-gateIn addition to using the C<\p{In...}> and C<\P{In...}> constructs, you 474*0Sstevel@tonic-gatecan also test whether a code point is in the I<range> as returned by 475*0Sstevel@tonic-gateL</charblock> and L</charscript> or as the values of the hash returned 476*0Sstevel@tonic-gateby L</charblocks> and L</charscripts> by using charinrange(): 477*0Sstevel@tonic-gate 478*0Sstevel@tonic-gate use Unicode::UCD qw(charscript charinrange); 479*0Sstevel@tonic-gate 480*0Sstevel@tonic-gate $range = charscript('Hiragana'); 481*0Sstevel@tonic-gate print "looks like hiragana\n" if charinrange($range, $codepoint); 482*0Sstevel@tonic-gate 483*0Sstevel@tonic-gate=cut 484*0Sstevel@tonic-gate 485*0Sstevel@tonic-gate=head2 compexcl 486*0Sstevel@tonic-gate 487*0Sstevel@tonic-gate use Unicode::UCD 'compexcl'; 488*0Sstevel@tonic-gate 489*0Sstevel@tonic-gate my $compexcl = compexcl("09dc"); 490*0Sstevel@tonic-gate 491*0Sstevel@tonic-gateThe compexcl() returns the composition exclusion (that is, if the 492*0Sstevel@tonic-gatecharacter should not be produced during a precomposition) of the 493*0Sstevel@tonic-gatecharacter specified by a B<code point argument>. 494*0Sstevel@tonic-gate 495*0Sstevel@tonic-gateIf there is a composition exclusion for the character, true is 496*0Sstevel@tonic-gatereturned. Otherwise, false is returned. 497*0Sstevel@tonic-gate 498*0Sstevel@tonic-gate=cut 499*0Sstevel@tonic-gate 500*0Sstevel@tonic-gatemy %COMPEXCL; 501*0Sstevel@tonic-gate 502*0Sstevel@tonic-gatesub _compexcl { 503*0Sstevel@tonic-gate unless (%COMPEXCL) { 504*0Sstevel@tonic-gate if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) { 505*0Sstevel@tonic-gate local $_; 506*0Sstevel@tonic-gate while (<$COMPEXCLFH>) { 507*0Sstevel@tonic-gate if (/^([0-9A-F]+)\s+\#\s+/) { 508*0Sstevel@tonic-gate my $code = hex($1); 509*0Sstevel@tonic-gate $COMPEXCL{$code} = undef; 510*0Sstevel@tonic-gate } 511*0Sstevel@tonic-gate } 512*0Sstevel@tonic-gate close($COMPEXCLFH); 513*0Sstevel@tonic-gate } 514*0Sstevel@tonic-gate } 515*0Sstevel@tonic-gate} 516*0Sstevel@tonic-gate 517*0Sstevel@tonic-gatesub compexcl { 518*0Sstevel@tonic-gate my $arg = shift; 519*0Sstevel@tonic-gate my $code = _getcode($arg); 520*0Sstevel@tonic-gate croak __PACKAGE__, "::compexcl: unknown code '$arg'" 521*0Sstevel@tonic-gate unless defined $code; 522*0Sstevel@tonic-gate 523*0Sstevel@tonic-gate _compexcl() unless %COMPEXCL; 524*0Sstevel@tonic-gate 525*0Sstevel@tonic-gate return exists $COMPEXCL{$code}; 526*0Sstevel@tonic-gate} 527*0Sstevel@tonic-gate 528*0Sstevel@tonic-gate=head2 casefold 529*0Sstevel@tonic-gate 530*0Sstevel@tonic-gate use Unicode::UCD 'casefold'; 531*0Sstevel@tonic-gate 532*0Sstevel@tonic-gate my $casefold = casefold("00DF"); 533*0Sstevel@tonic-gate 534*0Sstevel@tonic-gateThe casefold() returns the locale-independent case folding of the 535*0Sstevel@tonic-gatecharacter specified by a B<code point argument>. 536*0Sstevel@tonic-gate 537*0Sstevel@tonic-gateIf there is a case folding for that character, a reference to a hash 538*0Sstevel@tonic-gatewith the following fields is returned: 539*0Sstevel@tonic-gate 540*0Sstevel@tonic-gate key 541*0Sstevel@tonic-gate 542*0Sstevel@tonic-gate code code point with at least four hexdigits 543*0Sstevel@tonic-gate status "C", "F", "S", or "I" 544*0Sstevel@tonic-gate mapping one or more codes separated by spaces 545*0Sstevel@tonic-gate 546*0Sstevel@tonic-gateThe meaning of the I<status> is as follows: 547*0Sstevel@tonic-gate 548*0Sstevel@tonic-gate C common case folding, common mappings shared 549*0Sstevel@tonic-gate by both simple and full mappings 550*0Sstevel@tonic-gate F full case folding, mappings that cause strings 551*0Sstevel@tonic-gate to grow in length. Multiple characters are separated 552*0Sstevel@tonic-gate by spaces 553*0Sstevel@tonic-gate S simple case folding, mappings to single characters 554*0Sstevel@tonic-gate where different from F 555*0Sstevel@tonic-gate I special case for dotted uppercase I and 556*0Sstevel@tonic-gate dotless lowercase i 557*0Sstevel@tonic-gate - If this mapping is included, the result is 558*0Sstevel@tonic-gate case-insensitive, but dotless and dotted I's 559*0Sstevel@tonic-gate are not distinguished 560*0Sstevel@tonic-gate - If this mapping is excluded, the result is not 561*0Sstevel@tonic-gate fully case-insensitive, but dotless and dotted 562*0Sstevel@tonic-gate I's are distinguished 563*0Sstevel@tonic-gate 564*0Sstevel@tonic-gateIf there is no case folding for that character, C<undef> is returned. 565*0Sstevel@tonic-gate 566*0Sstevel@tonic-gateFor more information about case mappings see 567*0Sstevel@tonic-gatehttp://www.unicode.org/unicode/reports/tr21/ 568*0Sstevel@tonic-gate 569*0Sstevel@tonic-gate=cut 570*0Sstevel@tonic-gate 571*0Sstevel@tonic-gatemy %CASEFOLD; 572*0Sstevel@tonic-gate 573*0Sstevel@tonic-gatesub _casefold { 574*0Sstevel@tonic-gate unless (%CASEFOLD) { 575*0Sstevel@tonic-gate if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) { 576*0Sstevel@tonic-gate local $_; 577*0Sstevel@tonic-gate while (<$CASEFOLDFH>) { 578*0Sstevel@tonic-gate if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) { 579*0Sstevel@tonic-gate my $code = hex($1); 580*0Sstevel@tonic-gate $CASEFOLD{$code} = { code => $1, 581*0Sstevel@tonic-gate status => $2, 582*0Sstevel@tonic-gate mapping => $3 }; 583*0Sstevel@tonic-gate } 584*0Sstevel@tonic-gate } 585*0Sstevel@tonic-gate close($CASEFOLDFH); 586*0Sstevel@tonic-gate } 587*0Sstevel@tonic-gate } 588*0Sstevel@tonic-gate} 589*0Sstevel@tonic-gate 590*0Sstevel@tonic-gatesub casefold { 591*0Sstevel@tonic-gate my $arg = shift; 592*0Sstevel@tonic-gate my $code = _getcode($arg); 593*0Sstevel@tonic-gate croak __PACKAGE__, "::casefold: unknown code '$arg'" 594*0Sstevel@tonic-gate unless defined $code; 595*0Sstevel@tonic-gate 596*0Sstevel@tonic-gate _casefold() unless %CASEFOLD; 597*0Sstevel@tonic-gate 598*0Sstevel@tonic-gate return $CASEFOLD{$code}; 599*0Sstevel@tonic-gate} 600*0Sstevel@tonic-gate 601*0Sstevel@tonic-gate=head2 casespec 602*0Sstevel@tonic-gate 603*0Sstevel@tonic-gate use Unicode::UCD 'casespec'; 604*0Sstevel@tonic-gate 605*0Sstevel@tonic-gate my $casespec = casespec("FB00"); 606*0Sstevel@tonic-gate 607*0Sstevel@tonic-gateThe casespec() returns the potentially locale-dependent case mapping 608*0Sstevel@tonic-gateof the character specified by a B<code point argument>. The mapping 609*0Sstevel@tonic-gatemay change the length of the string (which the basic Unicode case 610*0Sstevel@tonic-gatemappings as returned by charinfo() never do). 611*0Sstevel@tonic-gate 612*0Sstevel@tonic-gateIf there is a case folding for that character, a reference to a hash 613*0Sstevel@tonic-gatewith the following fields is returned: 614*0Sstevel@tonic-gate 615*0Sstevel@tonic-gate key 616*0Sstevel@tonic-gate 617*0Sstevel@tonic-gate code code point with at least four hexdigits 618*0Sstevel@tonic-gate lower lowercase 619*0Sstevel@tonic-gate title titlecase 620*0Sstevel@tonic-gate upper uppercase 621*0Sstevel@tonic-gate condition condition list (may be undef) 622*0Sstevel@tonic-gate 623*0Sstevel@tonic-gateThe C<condition> is optional. Where present, it consists of one or 624*0Sstevel@tonic-gatemore I<locales> or I<contexts>, separated by spaces (other than as 625*0Sstevel@tonic-gateused to separate elements, spaces are to be ignored). A condition 626*0Sstevel@tonic-gatelist overrides the normal behavior if all of the listed conditions are 627*0Sstevel@tonic-gatetrue. Case distinctions in the condition list are not significant. 628*0Sstevel@tonic-gateConditions preceded by "NON_" represent the negation of the condition. 629*0Sstevel@tonic-gate 630*0Sstevel@tonic-gateNote that when there are multiple case folding definitions for a 631*0Sstevel@tonic-gatesingle code point because of different locales, the value returned by 632*0Sstevel@tonic-gatecasespec() is a hash reference which has the locales as the keys and 633*0Sstevel@tonic-gatehash references as described above as the values. 634*0Sstevel@tonic-gate 635*0Sstevel@tonic-gateA I<locale> is defined as a 2-letter ISO 3166 country code, possibly 636*0Sstevel@tonic-gatefollowed by a "_" and a 2-letter ISO language code (possibly followed 637*0Sstevel@tonic-gateby a "_" and a variant code). You can find the lists of those codes, 638*0Sstevel@tonic-gatesee L<Locale::Country> and L<Locale::Language>. 639*0Sstevel@tonic-gate 640*0Sstevel@tonic-gateA I<context> is one of the following choices: 641*0Sstevel@tonic-gate 642*0Sstevel@tonic-gate FINAL The letter is not followed by a letter of 643*0Sstevel@tonic-gate general category L (e.g. Ll, Lt, Lu, Lm, or Lo) 644*0Sstevel@tonic-gate MODERN The mapping is only used for modern text 645*0Sstevel@tonic-gate AFTER_i The last base character was "i" (U+0069) 646*0Sstevel@tonic-gate 647*0Sstevel@tonic-gateFor more information about case mappings see 648*0Sstevel@tonic-gatehttp://www.unicode.org/unicode/reports/tr21/ 649*0Sstevel@tonic-gate 650*0Sstevel@tonic-gate=cut 651*0Sstevel@tonic-gate 652*0Sstevel@tonic-gatemy %CASESPEC; 653*0Sstevel@tonic-gate 654*0Sstevel@tonic-gatesub _casespec { 655*0Sstevel@tonic-gate unless (%CASESPEC) { 656*0Sstevel@tonic-gate if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { 657*0Sstevel@tonic-gate local $_; 658*0Sstevel@tonic-gate while (<$CASESPECFH>) { 659*0Sstevel@tonic-gate if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { 660*0Sstevel@tonic-gate my ($hexcode, $lower, $title, $upper, $condition) = 661*0Sstevel@tonic-gate ($1, $2, $3, $4, $5); 662*0Sstevel@tonic-gate my $code = hex($hexcode); 663*0Sstevel@tonic-gate if (exists $CASESPEC{$code}) { 664*0Sstevel@tonic-gate if (exists $CASESPEC{$code}->{code}) { 665*0Sstevel@tonic-gate my ($oldlower, 666*0Sstevel@tonic-gate $oldtitle, 667*0Sstevel@tonic-gate $oldupper, 668*0Sstevel@tonic-gate $oldcondition) = 669*0Sstevel@tonic-gate @{$CASESPEC{$code}}{qw(lower 670*0Sstevel@tonic-gate title 671*0Sstevel@tonic-gate upper 672*0Sstevel@tonic-gate condition)}; 673*0Sstevel@tonic-gate if (defined $oldcondition) { 674*0Sstevel@tonic-gate my ($oldlocale) = 675*0Sstevel@tonic-gate ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/); 676*0Sstevel@tonic-gate delete $CASESPEC{$code}; 677*0Sstevel@tonic-gate $CASESPEC{$code}->{$oldlocale} = 678*0Sstevel@tonic-gate { code => $hexcode, 679*0Sstevel@tonic-gate lower => $oldlower, 680*0Sstevel@tonic-gate title => $oldtitle, 681*0Sstevel@tonic-gate upper => $oldupper, 682*0Sstevel@tonic-gate condition => $oldcondition }; 683*0Sstevel@tonic-gate } 684*0Sstevel@tonic-gate } 685*0Sstevel@tonic-gate my ($locale) = 686*0Sstevel@tonic-gate ($condition =~ /^([a-z][a-z](?:_\S+)?)/); 687*0Sstevel@tonic-gate $CASESPEC{$code}->{$locale} = 688*0Sstevel@tonic-gate { code => $hexcode, 689*0Sstevel@tonic-gate lower => $lower, 690*0Sstevel@tonic-gate title => $title, 691*0Sstevel@tonic-gate upper => $upper, 692*0Sstevel@tonic-gate condition => $condition }; 693*0Sstevel@tonic-gate } else { 694*0Sstevel@tonic-gate $CASESPEC{$code} = 695*0Sstevel@tonic-gate { code => $hexcode, 696*0Sstevel@tonic-gate lower => $lower, 697*0Sstevel@tonic-gate title => $title, 698*0Sstevel@tonic-gate upper => $upper, 699*0Sstevel@tonic-gate condition => $condition }; 700*0Sstevel@tonic-gate } 701*0Sstevel@tonic-gate } 702*0Sstevel@tonic-gate } 703*0Sstevel@tonic-gate close($CASESPECFH); 704*0Sstevel@tonic-gate } 705*0Sstevel@tonic-gate } 706*0Sstevel@tonic-gate} 707*0Sstevel@tonic-gate 708*0Sstevel@tonic-gatesub casespec { 709*0Sstevel@tonic-gate my $arg = shift; 710*0Sstevel@tonic-gate my $code = _getcode($arg); 711*0Sstevel@tonic-gate croak __PACKAGE__, "::casespec: unknown code '$arg'" 712*0Sstevel@tonic-gate unless defined $code; 713*0Sstevel@tonic-gate 714*0Sstevel@tonic-gate _casespec() unless %CASESPEC; 715*0Sstevel@tonic-gate 716*0Sstevel@tonic-gate return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; 717*0Sstevel@tonic-gate} 718*0Sstevel@tonic-gate 719*0Sstevel@tonic-gate=head2 Unicode::UCD::UnicodeVersion 720*0Sstevel@tonic-gate 721*0Sstevel@tonic-gateUnicode::UCD::UnicodeVersion() returns the version of the Unicode 722*0Sstevel@tonic-gateCharacter Database, in other words, the version of the Unicode 723*0Sstevel@tonic-gatestandard the database implements. The version is a string 724*0Sstevel@tonic-gateof numbers delimited by dots (C<'.'>). 725*0Sstevel@tonic-gate 726*0Sstevel@tonic-gate=cut 727*0Sstevel@tonic-gate 728*0Sstevel@tonic-gatemy $UNICODEVERSION; 729*0Sstevel@tonic-gate 730*0Sstevel@tonic-gatesub UnicodeVersion { 731*0Sstevel@tonic-gate unless (defined $UNICODEVERSION) { 732*0Sstevel@tonic-gate openunicode(\$VERSIONFH, "version"); 733*0Sstevel@tonic-gate chomp($UNICODEVERSION = <$VERSIONFH>); 734*0Sstevel@tonic-gate close($VERSIONFH); 735*0Sstevel@tonic-gate croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" 736*0Sstevel@tonic-gate unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; 737*0Sstevel@tonic-gate } 738*0Sstevel@tonic-gate return $UNICODEVERSION; 739*0Sstevel@tonic-gate} 740*0Sstevel@tonic-gate 741*0Sstevel@tonic-gate=head2 Implementation Note 742*0Sstevel@tonic-gate 743*0Sstevel@tonic-gateThe first use of charinfo() opens a read-only filehandle to the Unicode 744*0Sstevel@tonic-gateCharacter Database (the database is included in the Perl distribution). 745*0Sstevel@tonic-gateThe filehandle is then kept open for further queries. In other words, 746*0Sstevel@tonic-gateif you are wondering where one of your filehandles went, that's where. 747*0Sstevel@tonic-gate 748*0Sstevel@tonic-gate=head1 BUGS 749*0Sstevel@tonic-gate 750*0Sstevel@tonic-gateDoes not yet support EBCDIC platforms. 751*0Sstevel@tonic-gate 752*0Sstevel@tonic-gate=head1 AUTHOR 753*0Sstevel@tonic-gate 754*0Sstevel@tonic-gateJarkko Hietaniemi 755*0Sstevel@tonic-gate 756*0Sstevel@tonic-gate=cut 757*0Sstevel@tonic-gate 758*0Sstevel@tonic-gate1; 759