lib/Unicode/UCD.pm

*0Sstevel@tonic-gatepackage Unicode::UCD;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateuse strict;
*0Sstevel@tonic-gateuse warnings;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateour $VERSION = '0.22';
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateuse Storable qw(dclone);
*0Sstevel@tonic-gate
*0Sstevel@tonic-gaterequire Exporter;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateour @ISA = qw(Exporter);
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateour @EXPORT_OK = qw(charinfo
*0Sstevel@tonic-gate		    charblock charscript
*0Sstevel@tonic-gate		    charblocks charscripts
*0Sstevel@tonic-gate		    charinrange
*0Sstevel@tonic-gate		    compexcl
*0Sstevel@tonic-gate		    casefold casespec);
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateuse Carp;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 NAME
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateUnicode::UCD - Unicode character database
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 SYNOPSIS
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'charinfo';
*0Sstevel@tonic-gate    my $charinfo   = charinfo($codepoint);
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'charblock';
*0Sstevel@tonic-gate    my $charblock  = charblock($codepoint);
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'charscript';
*0Sstevel@tonic-gate    my $charscript = charscript($codepoint);
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'charblocks';
*0Sstevel@tonic-gate    my $charblocks = charblocks();
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'charscripts';
    my $charscripts = charscripts();
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD qw(charscript charinrange);
*0Sstevel@tonic-gate    my $range = charscript($script);
*0Sstevel@tonic-gate    print "looks like $script\n" if charinrange($range, $codepoint);
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'compexcl';
*0Sstevel@tonic-gate    my $compexcl = compexcl($codepoint);
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    my $unicode_version = Unicode::UCD::UnicodeVersion();
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 DESCRIPTION
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe Unicode::UCD module offers a simple interface to the Unicode
*0Sstevel@tonic-gateCharacter Database.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut
*0Sstevel@tonic-gate
# Cached file handles for the Unicode data files.  Each one starts out
# undefined; the loaders in this file hand a reference to the relevant
# scalar to openunicode(), which fills it in on first use.
my (
    $UNICODEFH,
    $BLOCKSFH,
    $SCRIPTSFH,
    $VERSIONFH,
    $COMPEXCLFH,
    $CASEFOLDFH,
    $CASESPECFH,
);
*0Sstevel@tonic-gate
# Locate and open a Unicode data file below a "unicore" directory in @INC.
#
#   $rfh   reference to the scalar that caches the opened file handle
#   @path  path components of the wanted file, relative to "unicore"
#
# When $$rfh is already defined nothing is opened and undef is returned.
# Otherwise every directory in @INC is tried in order; on success the handle
# is stored in $$rfh and the full path of the opened file is returned.
# Croaks when the file is not found in any @INC directory; in that case
# $$rfh is left untouched so that a later call searches again.
sub openunicode {
    my ($rfh, @path) = @_;
    my $f;
    unless (defined $$rfh) {
        for my $d (@INC) {
            use File::Spec;
            # @INC may hold code-ref hooks; those are not directories.
            next if ref $d;
            $f = File::Spec->catfile($d, "unicore", @path);
            # Three-argument open so that the path can never be taken for
            # an open mode, and a lexical handle so that the caller's cache
            # slot is only written once the open has actually succeeded.
            if (open(my $fh, '<', $f)) {
                $$rfh = $fh;
                last;
            }
            undef $f;
        }
        croak __PACKAGE__, ": failed to find ",
              File::Spec->catfile(@path), " in @INC"
            unless defined $f;
    }
    return $f;
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 charinfo
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'charinfo';
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    my $charinfo = charinfo(0x41);
*0Sstevel@tonic-gate
*0Sstevel@tonic-gatecharinfo() returns a reference to a hash that has the following fields
*0Sstevel@tonic-gateas defined by the Unicode standard:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    key
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    code             code point with at least four hexdigits
*0Sstevel@tonic-gate    name             name of the character IN UPPER CASE
*0Sstevel@tonic-gate    category         general category of the character
*0Sstevel@tonic-gate    combining        classes used in the Canonical Ordering Algorithm
*0Sstevel@tonic-gate    bidi             bidirectional category
*0Sstevel@tonic-gate    decomposition    character decomposition mapping
*0Sstevel@tonic-gate    decimal          if decimal digit this is the integer numeric value
*0Sstevel@tonic-gate    digit            if digit this is the numeric value
    numeric          if numeric this is the integer or rational numeric value
*0Sstevel@tonic-gate    mirrored         if mirrored in bidirectional text
*0Sstevel@tonic-gate    unicode10        Unicode 1.0 name if existed and different
*0Sstevel@tonic-gate    comment          ISO 10646 comment field
*0Sstevel@tonic-gate    upper            uppercase equivalent mapping
*0Sstevel@tonic-gate    lower            lowercase equivalent mapping
*0Sstevel@tonic-gate    title            titlecase equivalent mapping
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    block            block the character belongs to (used in \p{In...})
*0Sstevel@tonic-gate    script           script the character belongs to
*0Sstevel@tonic-gate
If no match is found, C<undef> is returned (an empty list in list context).
*0Sstevel@tonic-gate
The C<block> property is the same as returned by charblock().  It is
*0Sstevel@tonic-gatenot defined in the Unicode Character Database proper (Chapter 4 of the
*0Sstevel@tonic-gateUnicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
*0Sstevel@tonic-gate(Chapter 14 of TUS3).  Similarly for the C<script> property.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateNote that you cannot do (de)composition and casing based solely on the
*0Sstevel@tonic-gateabove C<decomposition> and C<lower>, C<upper>, C<title>, properties,
*0Sstevel@tonic-gateyou will need also the compexcl(), casefold(), and casespec() functions.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut
*0Sstevel@tonic-gate
# NB: This function is duplicated in charnames.pm
#
# Turn a "code point argument" into a number.
#
#   $arg  a decimal number without leading zero, or hexadecimal digits
#         optionally prefixed with "U+", "u+", "0x" or "0X"
#
# Returns the decimal argument unchanged, or the numeric value of the
# hexadecimal digits.  Returns nothing (undef in scalar context) when the
# argument is undefined or has neither form; callers use that to tell code
# points from block/script names.
#
# Only ASCII digits are accepted: \d and [[:xdigit:]] also match digits of
# other scripts (and fullwidth forms) in Unicode strings, which would then
# silently evaluate to code point 0.
sub _getcode {
    my $arg = shift;

    return unless defined $arg;

    if ($arg =~ /^[1-9][0-9]*$/) {
        return $arg;
    } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([0-9A-Fa-f]+)$/) {
        return hex($1);
    }

    return;
}
*0Sstevel@tonic-gate
# Lingua::KO::Hangul::Util is an optional helper: it does not ship with the
# standard distribution, so try to load it and remember whether that worked.
# When it is available its default exports are pulled into this package.

my $hasHangulUtil = do {
    eval { require Lingua::KO::Hangul::Util };
    ! $@;
};
Lingua::KO::Hangul::Util->import() if $hasHangulUtil;
*0Sstevel@tonic-gate
# internal: called from charinfo
#
# Format the decomposition of a Hangul syllable as space-separated code
# points of at least four upper-case hex digits each.  Returns nothing when
# Lingua::KO::Hangul::Util is not available or when decomposeHangul() does
# not yield exactly two or three code points.
sub hangul_decomp {
    return unless $hasHangulUtil;

    my @jamo  = decomposeHangul(shift);
    my $count = @jamo;
    return sprintf("%04X %04X",      @jamo) if $count == 2;
    return sprintf("%04X %04X %04X", @jamo) if $count == 3;
    return;
}
*0Sstevel@tonic-gate
# internal: called from charinfo
#
# Fallback name for a Hangul syllable: a fixed prefix followed by the code
# point in at least four upper-case hex digits.
sub hangul_charname {
    my $code = shift;
    my $name = sprintf("HANGUL SYLLABLE-%04X", $code);
    return $name;
}
*0Sstevel@tonic-gate
# internal: called from charinfo
#
# Name of a CJK unified ideograph: a fixed prefix followed by the code point
# in at least four upper-case hex digits.
sub han_charname {
    my $code = shift;
    my $name = sprintf("CJK UNIFIED IDEOGRAPH-%04X", $code);
    return $name;
}
*0Sstevel@tonic-gate
# Code point ranges that charinfo() resolves through the range's first code
# point.  Each row is
#   [ first, last, coderef to name, coderef to decompose ]
# where an undef coderef makes charinfo() report an empty string for that
# field.
my @CharinfoRanges = do {
    my $han_name    = \&han_charname;
    # Prefer the name routine of Lingua::KO::Hangul::Util when it loaded.
    my $hangul_name = $hasHangulUtil ? \&getHangulName : \&hangul_charname;
    my $hangul_dec  = \&hangul_decomp;

    (
        [ 0x3400,   0x4DB5,   $han_name,    undef       ], # CJK Ideographs Extension A
        [ 0x4E00,   0x9FA5,   $han_name,    undef       ], # CJK Ideographs
        [ 0xAC00,   0xD7A3,   $hangul_name, $hangul_dec ], # Hangul Syllables
        [ 0xD800,   0xDB7F,   undef,        undef       ], # Non-Private Use High Surrogates
        [ 0xDB80,   0xDBFF,   undef,        undef       ], # Private Use High Surrogates
        [ 0xDC00,   0xDFFF,   undef,        undef       ], # Low Surrogates
        [ 0xE000,   0xF8FF,   undef,        undef       ], # The Private Use Area
        [ 0x20000,  0x2A6D6,  $han_name,    undef       ], # CJK Ideographs Extension B
        [ 0xF0000,  0xFFFFD,  undef,        undef       ], # Plane 15 Private Use Area
        [ 0x100000, 0x10FFFD, undef,        undef       ], # Plane 16 Private Use Area
    );
};
*0Sstevel@tonic-gate
# Look up one code point in UnicodeData.txt.
#
#   $arg  a code point argument as understood by _getcode()
#
# Returns a reference to a hash with the UnicodeData.txt fields plus
# "block" and "script", or nothing when no matching line is found.  Croaks
# when the argument cannot be interpreted as a code point.
#
# Code points inside one of the @CharinfoRanges are looked up through the
# first code point of their range; "code", "name" and "decomposition" are
# then replaced by values computed for the requested code point.
sub charinfo {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
        unless defined $code;

    # Key used for the file search, zero-padded to six hex digits.
    my $hexk = sprintf("%06X", $code);

    my ($rcode, $rname, $rdec);
    for my $range (@CharinfoRanges) {
        my ($first, $last, $namer, $decomposer) = @$range;
        next if $code < $first || $code > $last;

        ($rcode = $hexk) =~ s/^0+//;
        $rcode = sprintf("%04X", hex($rcode));
        $rname = $namer      ? $namer->($code)      : '';
        $rdec  = $decomposer ? $decomposer->($code) : '';
        $hexk  = sprintf("%06X", $first); # replace by the first
        last;
    }

    openunicode(\$UNICODEFH, "UnicodeData.txt");
    return unless defined $UNICODEFH;

    use Search::Dict 1.02;
    # The file stores codes with a varying number of hex digits; normalise
    # them to six digits so that the comparison against $hexk lines up.
    my $xfrm = sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) };
    return unless look($UNICODEFH, "$hexk;", { xfrm => $xfrm }) >= 0;

    my $line = <$UNICODEFH>;
    return unless defined $line;
    chomp $line;

    my @fields = qw(
                     code name category
                     combining bidi decomposition
                     decimal digit numeric
                     mirrored unicode10 comment
                     upper lower title
                    );
    my %prop;
    @prop{@fields} = split(/;/, $line, -1);

    # look() only positions the handle at the first line not sorting before
    # the key, so check that the line really is the wanted one.
    $hexk =~ s/^0+//;
    $hexk =  sprintf("%04X", hex($hexk));
    return unless $prop{code} eq $hexk;

    $prop{block}  = charblock($code);
    $prop{script} = charscript($code);
    if (defined $rname) {
        @prop{qw(code name decomposition)} = ($rcode, $rname, $rdec);
    }
    return \%prop;
}
*0Sstevel@tonic-gate
# Binary search in a [[lo,hi,prop],[...],...] table.
#
#   $table     reference to the table, rows ordered by their first element
#   $lo, $hi   first and last row index to consider
#   $code      value to look for
#
# Returns the third element of the row whose [lo, hi] interval contains
# $code, or nothing when no row in the given index window does.
sub _search {
    my ($table, $lo, $hi, $code) = @_;

    while ($lo <= $hi) {
        my $mid = int(($lo + $hi) / 2);

        if ($table->[$mid]->[0] > $code) {
            # The row starts after $code: continue in the lower half.
            $hi = $mid - 1;
        }
        elsif ($table->[$mid]->[0] == $code || $table->[$mid]->[1] >= $code) {
            return $table->[$mid]->[2];
        }
        else {
            # The row ends before $code: continue in the upper half.
            $lo = $mid + 1;
        }
    }

    return;
}
*0Sstevel@tonic-gate
# Test whether a code point lies in a range.
#
#   $range  reference to a list of [start, end, name] rows
#   $arg    a code point argument as understood by _getcode()
#
# Returns the third element of the row containing the code point, or
# nothing when no row does.  Croaks when $arg is not a code point.
sub charinrange {
    my ($range, $arg) = @_;

    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::charinrange: unknown code '$arg'";

    return _search($range, 0, $#{$range}, $code);
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 charblock
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'charblock';
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    my $charblock = charblock(0x41);
*0Sstevel@tonic-gate    my $charblock = charblock(1234);
*0Sstevel@tonic-gate    my $charblock = charblock("0x263a");
*0Sstevel@tonic-gate    my $charblock = charblock("U+263a");
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    my $range     = charblock('Armenian');
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateWith a B<code point argument> charblock() returns the I<block> the character
*0Sstevel@tonic-gatebelongs to, e.g.  C<Basic Latin>.  Note that not all the character
*0Sstevel@tonic-gatepositions within all blocks are defined.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateSee also L</Blocks versus Scripts>.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateIf supplied with an argument that can't be a code point, charblock() tries
*0Sstevel@tonic-gateto do the opposite and interpret the argument as a character block. The
*0Sstevel@tonic-gatereturn value is a I<range>: an anonymous list of lists that contain
*0Sstevel@tonic-gateI<start-of-range>, I<end-of-range> code point pairs. You can test whether a
*0Sstevel@tonic-gatecode point is in a range using the L</charinrange> function. If the
argument is not a known character block, C<undef> is returned.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut
*0Sstevel@tonic-gate
my @BLOCKS;    # every [lo, hi, name] row, in file order
my %BLOCKS;    # block name => list of the rows carrying that name

# Fill @BLOCKS and %BLOCKS from Blocks.txt.  Does nothing when @BLOCKS is
# already populated or when openunicode() reports nothing to read.
sub _charblocks {
    return if @BLOCKS;
    return unless openunicode(\$BLOCKSFH, "Blocks.txt");

    local $_;
    while (<$BLOCKSFH>) {
        next unless /^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/;
        my ($lo, $hi, $name) = (hex($1), hex($2), $3);

        # The same row is shared by the flat table and the per-name index.
        my $subrange = [ $lo, $hi, $name ];
        push @BLOCKS, $subrange;
        push @{ $BLOCKS{$name} }, $subrange;
    }
    close($BLOCKSFH);
}
*0Sstevel@tonic-gate
# Map a code point to its block name, or a block name to its range.
#
#   $arg  a code point argument, or a block name
#
# For a code point the name of the containing block is returned (nothing
# when no block contains it).  For any other argument a deep copy of the
# rows stored under that block name is returned, or nothing when the name
# is unknown.
sub charblock {
    my $arg = shift;

    _charblocks() unless @BLOCKS;

    my $code = _getcode($arg);
    return _search(\@BLOCKS, 0, $#BLOCKS, $code) if defined $code;

    return unless exists $BLOCKS{$arg};
    # Copy, so that callers cannot modify the cached table.
    return dclone($BLOCKS{$arg});
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 charscript
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'charscript';
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    my $charscript = charscript(0x41);
*0Sstevel@tonic-gate    my $charscript = charscript(1234);
*0Sstevel@tonic-gate    my $charscript = charscript("U+263a");
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    my $range      = charscript('Thai');
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateWith a B<code point argument> charscript() returns the I<script> the
*0Sstevel@tonic-gatecharacter belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateSee also L</Blocks versus Scripts>.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateIf supplied with an argument that can't be a code point, charscript() tries
*0Sstevel@tonic-gateto do the opposite and interpret the argument as a character script. The
*0Sstevel@tonic-gatereturn value is a I<range>: an anonymous list of lists that contain
*0Sstevel@tonic-gateI<start-of-range>, I<end-of-range> code point pairs. You can test whether a
*0Sstevel@tonic-gatecode point is in a range using the L</charinrange> function. If the
argument is not a known character script, C<undef> is returned.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut
*0Sstevel@tonic-gate
my @SCRIPTS;    # every [lo, hi, script] row, sorted by lo after loading
my %SCRIPTS;    # script name => list of the rows carrying that name

# Fill @SCRIPTS and %SCRIPTS from Scripts.txt.  Does nothing when @SCRIPTS
# is already populated or when openunicode() reports nothing to read.
sub _charscripts {
    return if @SCRIPTS;
    return unless openunicode(\$SCRIPTSFH, "Scripts.txt");

    local $_;
    while (<$SCRIPTSFH>) {
        next unless /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/;
        my ($first, $last, $raw) = ($1, $2, $3);

        # A line names either a range or a single code point.
        my $lo = hex($first);
        my $hi = $last ? hex($last) : hex($first);

        # Normalise the name: all lower case, then first letter upper case.
        my $script = lc($raw);
        $script =~ s/\b(\w)/uc($1)/ge;

        my $subrange = [ $lo, $hi, $script ];
        push @SCRIPTS, $subrange;
        push @{ $SCRIPTS{$script} }, $subrange;
    }
    close($SCRIPTSFH);

    # _search() needs the flat table ordered by range start.
    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
}
*0Sstevel@tonic-gate
# Map a code point to its script name, or a script name to its range.
#
#   $arg  a code point argument, or a script name
#
# For a code point the name of the containing script is returned (nothing
# when no script range contains it).  For any other argument a deep copy of
# the rows stored under that script name is returned, or nothing when the
# name is unknown.
sub charscript {
    my $arg = shift;

    _charscripts() unless @SCRIPTS;

    my $code = _getcode($arg);
    return _search(\@SCRIPTS, 0, $#SCRIPTS, $code) if defined $code;

    return unless exists $SCRIPTS{$arg};
    # Copy, so that callers cannot modify the cached table.
    return dclone($SCRIPTS{$arg});
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 charblocks
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'charblocks';
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    my $charblocks = charblocks();
*0Sstevel@tonic-gate
*0Sstevel@tonic-gatecharblocks() returns a reference to a hash with the known block names
*0Sstevel@tonic-gateas the keys, and the code point ranges (see L</charblock>) as the values.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateSee also L</Blocks versus Scripts>.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut
*0Sstevel@tonic-gate
# Return a reference to a deep copy of the block table: block names as
# keys, lists of [lo, hi, name] rows as values.  The table is loaded on
# first use.
sub charblocks {
    %BLOCKS or _charblocks();
    return dclone(\%BLOCKS);
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 charscripts
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'charscripts';
*0Sstevel@tonic-gate
    my $charscripts = charscripts();

charscripts() returns a reference to a hash with the known script names
as the keys, and the code point ranges (see L</charscript>) as the values.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateSee also L</Blocks versus Scripts>.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut
*0Sstevel@tonic-gate
# Return a reference to a deep copy of the script table: script names as
# keys, lists of [lo, hi, script] rows as values.  The table is loaded on
# first use.
sub charscripts {
    %SCRIPTS or _charscripts();
    return dclone(\%SCRIPTS);
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 Blocks versus Scripts
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe difference between a block and a script is that scripts are closer
*0Sstevel@tonic-gateto the linguistic notion of a set of characters required to present
*0Sstevel@tonic-gatelanguages, while block is more of an artifact of the Unicode character
*0Sstevel@tonic-gatenumbering and separation into blocks of (mostly) 256 characters.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateFor example the Latin B<script> is spread over several B<blocks>, such
*0Sstevel@tonic-gateas C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
*0Sstevel@tonic-gateC<Latin Extended-B>.  On the other hand, the Latin script does not
*0Sstevel@tonic-gatecontain all the characters of the C<Basic Latin> block (also known as
*0Sstevel@tonic-gatethe ASCII): it includes only the letters, and not, for example, the digits
*0Sstevel@tonic-gateor the punctuation.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateFor blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateFor scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 Matching Scripts and Blocks
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateScripts are matched with the regular-expression construct
*0Sstevel@tonic-gateC<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
*0Sstevel@tonic-gatewhile C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
*0Sstevel@tonic-gateany of the 256 code points in the Tibetan block).
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 Code Point Arguments
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateA I<code point argument> is either a decimal or a hexadecimal scalar
*0Sstevel@tonic-gatedesignating a Unicode character, or C<U+> followed by hexadecimals
*0Sstevel@tonic-gatedesignating a Unicode character.  In other words, if you want a code
*0Sstevel@tonic-gatepoint to be interpreted as a hexadecimal number, you must prefix it
*0Sstevel@tonic-gatewith either C<0x> or C<U+>, because a string like e.g. C<123> will
*0Sstevel@tonic-gatebe interpreted as a decimal code point.  Also note that Unicode is
*0Sstevel@tonic-gateB<not> limited to 16 bits (the number of Unicode characters is
*0Sstevel@tonic-gateopen-ended, in theory unlimited): you may have more than 4 hexdigits.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 charinrange
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateIn addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
*0Sstevel@tonic-gatecan also test whether a code point is in the I<range> as returned by
*0Sstevel@tonic-gateL</charblock> and L</charscript> or as the values of the hash returned
*0Sstevel@tonic-gateby L</charblocks> and L</charscripts> by using charinrange():
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD qw(charscript charinrange);
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    $range = charscript('Hiragana');
*0Sstevel@tonic-gate    print "looks like hiragana\n" if charinrange($range, $codepoint);
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 compexcl
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'compexcl';
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    my $compexcl = compexcl("09dc");
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe compexcl() returns the composition exclusion (that is, if the
*0Sstevel@tonic-gatecharacter should not be produced during a precomposition) of the
*0Sstevel@tonic-gatecharacter specified by a B<code point argument>.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateIf there is a composition exclusion for the character, true is
*0Sstevel@tonic-gatereturned.  Otherwise, false is returned.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut
*0Sstevel@tonic-gate
my %COMPEXCL;    # code point => undef; only the presence of a key matters

# Fill %COMPEXCL from CompositionExclusions.txt.  Does nothing when the
# hash is already populated or when openunicode() reports nothing to read.
sub _compexcl {
    return if %COMPEXCL;
    return unless openunicode(\$COMPEXCLFH, "CompositionExclusions.txt");

    local $_;
    while (<$COMPEXCLFH>) {
        # Data lines start with a hex code point followed by a "#" comment.
        next unless /^([0-9A-F]+)\s+\#\s+/;
        $COMPEXCL{ hex($1) } = undef;
    }
    close($COMPEXCLFH);
}
*0Sstevel@tonic-gate
# Tell whether a code point is listed in CompositionExclusions.txt.
#
#   $arg  a code point argument as understood by _getcode()
#
# Returns true when the code point is listed, false otherwise.  Croaks when
# $arg is not a code point.
sub compexcl {
    my $arg  = shift;

    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::compexcl: unknown code '$arg'";

    %COMPEXCL or _compexcl();

    return exists $COMPEXCL{$code};
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 casefold
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'casefold';
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    my $casefold = casefold("00DF");
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe casefold() returns the locale-independent case folding of the
*0Sstevel@tonic-gatecharacter specified by a B<code point argument>.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateIf there is a case folding for that character, a reference to a hash
*0Sstevel@tonic-gatewith the following fields is returned:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    key
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    code             code point with at least four hexdigits
*0Sstevel@tonic-gate    status           "C", "F", "S", or "I"
*0Sstevel@tonic-gate    mapping          one or more codes separated by spaces
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe meaning of the I<status> is as follows:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate   C                 common case folding, common mappings shared
*0Sstevel@tonic-gate                     by both simple and full mappings
*0Sstevel@tonic-gate   F                 full case folding, mappings that cause strings
*0Sstevel@tonic-gate                     to grow in length. Multiple characters are separated
*0Sstevel@tonic-gate                     by spaces
*0Sstevel@tonic-gate   S                 simple case folding, mappings to single characters
*0Sstevel@tonic-gate                     where different from F
*0Sstevel@tonic-gate   I                 special case for dotted uppercase I and
*0Sstevel@tonic-gate                     dotless lowercase i
*0Sstevel@tonic-gate                     - If this mapping is included, the result is
*0Sstevel@tonic-gate                       case-insensitive, but dotless and dotted I's
*0Sstevel@tonic-gate                       are not distinguished
*0Sstevel@tonic-gate                     - If this mapping is excluded, the result is not
*0Sstevel@tonic-gate                       fully case-insensitive, but dotless and dotted
*0Sstevel@tonic-gate                       I's are distinguished
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateIf there is no case folding for that character, C<undef> is returned.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateFor more information about case mappings see
*0Sstevel@tonic-gatehttp://www.unicode.org/unicode/reports/tr21/
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut
*0Sstevel@tonic-gate
my %CASEFOLD;    # code point => { code, status, mapping }

# Fill %CASEFOLD from CaseFolding.txt.  Does nothing when the hash is
# already populated or when openunicode() reports nothing to read.  When a
# code point occurs on several lines the last one read wins.
sub _casefold {
    return if %CASEFOLD;
    return unless openunicode(\$CASEFOLDFH, "CaseFolding.txt");

    local $_;
    while (<$CASEFOLDFH>) {
        next unless /^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/;
        my ($hexcode, $status, $mapping) = ($1, $2, $3);

        $CASEFOLD{ hex($hexcode) } = {
            code    => $hexcode,
            status  => $status,
            mapping => $mapping,
        };
    }
    close($CASEFOLDFH);
}
*0Sstevel@tonic-gate
# Return the case folding recorded for a code point.
#
#   $arg  a code point argument as understood by _getcode()
#
# Returns the hash reference stored for the code point (keys "code",
# "status" and "mapping"), or undef when there is none.  Croaks when $arg
# is not a code point.
sub casefold {
    my $arg  = shift;

    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::casefold: unknown code '$arg'";

    %CASEFOLD or _casefold();

    return $CASEFOLD{$code};
}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 casespec
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    use Unicode::UCD 'casespec';
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    my $casespec = casespec("FB00");
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe casespec() returns the potentially locale-dependent case mapping
*0Sstevel@tonic-gateof the character specified by a B<code point argument>.  The mapping
*0Sstevel@tonic-gatemay change the length of the string (which the basic Unicode case
*0Sstevel@tonic-gatemappings as returned by charinfo() never do).
*0Sstevel@tonic-gate
If there is a special case mapping for that character, a reference to a hash
*0Sstevel@tonic-gatewith the following fields is returned:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    key
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    code             code point with at least four hexdigits
*0Sstevel@tonic-gate    lower            lowercase
*0Sstevel@tonic-gate    title            titlecase
*0Sstevel@tonic-gate    upper            uppercase
*0Sstevel@tonic-gate    condition        condition list (may be undef)
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe C<condition> is optional.  Where present, it consists of one or
*0Sstevel@tonic-gatemore I<locales> or I<contexts>, separated by spaces (other than as
*0Sstevel@tonic-gateused to separate elements, spaces are to be ignored).  A condition
*0Sstevel@tonic-gatelist overrides the normal behavior if all of the listed conditions are
*0Sstevel@tonic-gatetrue.  Case distinctions in the condition list are not significant.
*0Sstevel@tonic-gateConditions preceded by "NON_" represent the negation of the condition.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateNote that when there are multiple case folding definitions for a
*0Sstevel@tonic-gatesingle code point because of different locales, the value returned by
*0Sstevel@tonic-gatecasespec() is a hash reference which has the locales as the keys and
*0Sstevel@tonic-gatehash references as described above as the values.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateA I<locale> is defined as a 2-letter ISO 3166 country code, possibly
*0Sstevel@tonic-gatefollowed by a "_" and a 2-letter ISO language code (possibly followed
by a "_" and a variant code).  For the lists of those codes,
see L<Locale::Country> and L<Locale::Language>.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateA I<context> is one of the following choices:
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    FINAL            The letter is not followed by a letter of
*0Sstevel@tonic-gate                     general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
*0Sstevel@tonic-gate    MODERN           The mapping is only used for modern text
*0Sstevel@tonic-gate    AFTER_i          The last base character was "i" (U+0069)
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateFor more information about case mappings see
*0Sstevel@tonic-gatehttp://www.unicode.org/unicode/reports/tr21/
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut
*0Sstevel@tonic-gate
*0Sstevel@tonic-gatemy %CASESPEC;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate# Fill %CASESPEC from SpecialCasing.txt; a no-op once the hash holds data.
*0Sstevel@tonic-gate#
*0Sstevel@tonic-gate# The first line seen for a code point is stored as one record, a hash
*0Sstevel@tonic-gate# with the keys code, lower, title, upper and condition.  A later line
*0Sstevel@tonic-gate# for the same code point is stored beneath that code point under the
*0Sstevel@tonic-gate# locale that starts its condition; an existing single record that has
*0Sstevel@tonic-gate# a condition is first moved beneath its own locale in the same way.
*0Sstevel@tonic-gatesub _casespec {
*0Sstevel@tonic-gate    unless (%CASESPEC) {
*0Sstevel@tonic-gate        if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
*0Sstevel@tonic-gate            local $_;
*0Sstevel@tonic-gate            while (<$CASESPECFH>) {
*0Sstevel@tonic-gate                # Only data lines are of interest; skip everything else.
*0Sstevel@tonic-gate                next unless /^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate                my ($hexcode, $lower, $title, $upper, $condition) =
*0Sstevel@tonic-gate                    ($1, $2, $3, $4, $5);
*0Sstevel@tonic-gate                my $code = hex($hexcode);
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate                if (!exists $CASESPEC{$code}) {
*0Sstevel@tonic-gate                    # First line for this code point: keep a single record.
*0Sstevel@tonic-gate                    $CASESPEC{$code} = {
*0Sstevel@tonic-gate                        code      => $hexcode,
*0Sstevel@tonic-gate                        lower     => $lower,
*0Sstevel@tonic-gate                        title     => $title,
*0Sstevel@tonic-gate                        upper     => $upper,
*0Sstevel@tonic-gate                        condition => $condition,
*0Sstevel@tonic-gate                    };
*0Sstevel@tonic-gate                    next;
*0Sstevel@tonic-gate                }
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate                # A "code" key means the stored value is still a single
*0Sstevel@tonic-gate                # record rather than a hash keyed by locale.
*0Sstevel@tonic-gate                my $known = $CASESPEC{$code};
*0Sstevel@tonic-gate                if (exists $known->{code} && defined $known->{condition}) {
*0Sstevel@tonic-gate                    my ($known_locale) =
*0Sstevel@tonic-gate                        ($known->{condition} =~ /^([a-z][a-z](?:_\S+)?)/);
*0Sstevel@tonic-gate                    my %moved = (
*0Sstevel@tonic-gate                        code      => $hexcode,
*0Sstevel@tonic-gate                        lower     => $known->{lower},
*0Sstevel@tonic-gate                        title     => $known->{title},
*0Sstevel@tonic-gate                        upper     => $known->{upper},
*0Sstevel@tonic-gate                        condition => $known->{condition},
*0Sstevel@tonic-gate                    );
*0Sstevel@tonic-gate                    # Drop the single record and start a fresh hash that
*0Sstevel@tonic-gate                    # holds it under its locale.
*0Sstevel@tonic-gate                    delete $CASESPEC{$code};
*0Sstevel@tonic-gate                    $CASESPEC{$code}->{$known_locale} = \%moved;
*0Sstevel@tonic-gate                }
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate                # File the current line under the locale of its condition.
*0Sstevel@tonic-gate                my ($locale) =
*0Sstevel@tonic-gate                    ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
*0Sstevel@tonic-gate                $CASESPEC{$code}->{$locale} = {
*0Sstevel@tonic-gate                    code      => $hexcode,
*0Sstevel@tonic-gate                    lower     => $lower,
*0Sstevel@tonic-gate                    title     => $title,
*0Sstevel@tonic-gate                    upper     => $upper,
*0Sstevel@tonic-gate                    condition => $condition,
*0Sstevel@tonic-gate                };
*0Sstevel@tonic-gate            }
*0Sstevel@tonic-gate            close($CASESPECFH);
*0Sstevel@tonic-gate        }
*0Sstevel@tonic-gate    }
*0Sstevel@tonic-gate}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate# Look up the special casing data for one code point.  The argument is
*0Sstevel@tonic-gate# handed to _getcode(); croaks when that yields no defined code.  A
*0Sstevel@tonic-gate# reference found in %CASESPEC is returned as a deep copy, so callers
*0Sstevel@tonic-gate# cannot modify the cached table; anything else is returned as is.
*0Sstevel@tonic-gatesub casespec {
*0Sstevel@tonic-gate    my ($arg) = @_;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    my $code = _getcode($arg);
*0Sstevel@tonic-gate    unless (defined $code) {
*0Sstevel@tonic-gate        croak __PACKAGE__, "::casespec: unknown code '$arg'";
*0Sstevel@tonic-gate    }
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    # Load the table on first use.
*0Sstevel@tonic-gate    _casespec() unless %CASESPEC;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate    my $found = $CASESPEC{$code};
*0Sstevel@tonic-gate    return $found unless ref $found;
*0Sstevel@tonic-gate    return dclone($found);
*0Sstevel@tonic-gate}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 Unicode::UCD::UnicodeVersion
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateUnicode::UCD::UnicodeVersion() returns the version of the Unicode
*0Sstevel@tonic-gateCharacter Database, in other words, the version of the Unicode
*0Sstevel@tonic-gatestandard the database implements.  The version is a string
*0Sstevel@tonic-gateof numbers delimited by dots (C<'.'>).
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut
*0Sstevel@tonic-gate
*0Sstevel@tonic-gatemy $UNICODEVERSION;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate# Return the Unicode version string, reading the first line of the
*0Sstevel@tonic-gate# "version" file on the first call and caching it in $UNICODEVERSION.
*0Sstevel@tonic-gate# Croaks when that line is not a dot-separated list of numbers.
*0Sstevel@tonic-gatesub UnicodeVersion {
*0Sstevel@tonic-gate    unless (defined $UNICODEVERSION) {
*0Sstevel@tonic-gate        openunicode(\$VERSIONFH, "version");
*0Sstevel@tonic-gate        my $version = <$VERSIONFH>;
*0Sstevel@tonic-gate        close($VERSIONFH);
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate        # Nothing to read yields undef; treat it as an empty string so it
*0Sstevel@tonic-gate        # is rejected below without uninitialized-value warnings.
*0Sstevel@tonic-gate        $version = '' unless defined $version;
*0Sstevel@tonic-gate        chomp $version;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate        croak __PACKAGE__, "::VERSION: strange version '$version'"
*0Sstevel@tonic-gate            unless $version =~ /^\d+(?:\.\d+)+$/;
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate        # Cache only a validated value: had the cache been set before the
*0Sstevel@tonic-gate        # check, a caller trapping the croak with eval would get the bad
*0Sstevel@tonic-gate        # string back from every later call.
*0Sstevel@tonic-gate        $UNICODEVERSION = $version;
*0Sstevel@tonic-gate    }
*0Sstevel@tonic-gate    return $UNICODEVERSION;
*0Sstevel@tonic-gate}
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head2 Implementation Note
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateThe first use of charinfo() opens a read-only filehandle to the Unicode
*0Sstevel@tonic-gateCharacter Database (the database is included in the Perl distribution).
*0Sstevel@tonic-gateThe filehandle is then kept open for further queries.  In other words,
*0Sstevel@tonic-gateif you are wondering where one of your filehandles went, that's where.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 BUGS
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateDoes not yet support EBCDIC platforms.
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=head1 AUTHOR
*0Sstevel@tonic-gate
*0Sstevel@tonic-gateJarkko Hietaniemi
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate=cut
*0Sstevel@tonic-gate
*0Sstevel@tonic-gate1;