perlunicook.pod - OpenGrok cross reference for /openbsd-src/gnu/usr.bin/perl/pod/perlunicook.pod

Lines Matching +full:fine +full:- +full:tuning
6 perlunicook - cookbookish examples of handling Unicode in Perl
26  use utf8;      # so literals and identifiers can be in UTF-8
28  use open      qw(:std :encoding(UTF-8)); # undeclared streams in UTF-8
38 =head2 ℞ 1: Generic Unicode-savvy filter
51 =head2 ℞ 2: Fine-tuning Unicode warnings
56  no warnings "nonchar";      # the 66 forbidden non-characters
57  no warnings "surrogate";    # UTF-16/CESU-8 nonsense
62 Without the all-critical C<use utf8> declaration, putting UTF‑8 in your
72  my @鯉        = qw( koi8-f koi8-u koi8-r );
101 In an interpolated literal, whether a double-quoted string or a
111  # even non-BMP ranges in regex work fine
112  /[\x{1D434}-\x{1D467}]/
127 by that name for use in interpolated literals (double-quoted
141 Anything else is a Perl-specific convenience abbreviation.  Specify one or
142 more scripts by names if you want short names that are script-specific.
164 Notice the C<%vx> vector-print functionality in C<printf>.
174 characters, or even to give unnamed private-use characters useful names.
187 C<CJK UNIFIED IDEOGRAPH-6771> and C<CJK UNIFIED IDEOGRAPH-4EAC>,
192  # cpan -i Unicode::Unihan
195  my $unhan = Unicode::Unihan->new;
197      printf "CJK $str in %-12s is ", $lang;
198      say $unhan->$lang($str);
212  # cpan -i Lingua::JA::Romanize::Japanese
214  my $k2r = Lingua::JA::Romanize::Japanese->new;
216  say "Japanese for $str is ", $k2r->chars($str);
231   my $bytes = encode("MIME-Header-ISO_2022_JP", $chars, 1);
239      $ perl -CA ...
244     @ARGV = map { decode('UTF-8', $_, 1) } @ARGV;
248     # cpan -i Encode::Locale
257 Use a command-line option, an environment variable, or else
260      $ perl -CS ...
264      use open qw(:std :encoding(UTF-8));
266      binmode(STDIN,  ":encoding(UTF-8)");
272     # cpan -i Encode::Locale
277     binmode STDIN,  ":encoding(console_in)"  if -t STDIN;
278     binmode STDOUT, ":encoding(console_out)" if -t STDOUT;
279     binmode STDERR, ":encoding(console_out)" if -t STDERR;
283 Files opened without an encoding argument will be in UTF-8:
285      $ perl -CD ...
289      use open qw(:encoding(UTF-8));
293      $ perl -CSDA ...
297      use open qw(:std :encoding(UTF-8));
299      @ARGV = map { decode('UTF-8', $_, 1) } @ARGV;
304 to deal with encoded text, not by calling low-level
308      open(my $in_file, "< :encoding(UTF-16)", "wintext");
311      binmode($in_file, ":encoding(UTF-16)");
324 the incantation C<":raw :encoding(UTF-16LE) :crlf"> includes implicit
338 =head2 ℞ 21: Unicode case-insensitive comparisons
347  # sort case-insensitively
356 A Unicode linebreak matches the two-character CRLF
370  my $cat = charinfo(0x3A3)->{category};  # "Lu"
372 =head2 ℞ 24: Disabling Unicode-awareness in builtin charclasses
385 Or use specific un-Unicode properties, like C<\p{ahex}>
407 Define at compile-time your own custom character
410  # using private-use characters
428 same text to be searched. Note that this is about much more than just pre-
438 =head2 ℞ 28: Convert non-ASCII Unicode numerics
441 ASCII digits only, but Perl’s implicit string-to-number
459 Programmer-visible “characters” are codepoints matched by C</./s>,
460 but user-visible “characters” are graphemes matched by C</\X/>.
473  # cpan -i Unicode::GCString
475  my $gcs = Unicode::GCString->new($str);
476  my $first_five = $gcs->substr(0, 5);
487  # OR: cpan -i Unicode::GCString
489  $str = reverse Unicode::GCString->new($str);
500   # OR: cpan -i Unicode::GCString
502  my $gcs = Unicode::GCString->new($str);
503  my $count = $gcs->length;
505 =head2 ℞ 34: Unicode column-width for printing
519      my $gcs = Unicode::GCString->new($str);
520      my $cols = $gcs->columns;
521      my $pad = " " x (10 - $cols);
539  my $col = Unicode::Collate->new();
540  my @list = $col->sort(@old_list);
543 for a convenient command-line interface to this module.
545 =head2 ℞ 36: Case- I<and> accent-insensitive Unicode sort
551  my $col = Unicode::Collate->new(level => 1);
552  my @list = $col->sort(@old_list);
558  # either use v5.12, OR: cpan -i Unicode::Collate::Locale
560  my $col = Unicode::Collate::Locale->new(locale => "de__phonebook");
561  my @list = $col->sort(@old_list);
563 The I<ucsort> program mentioned above accepts a C<--locale> parameter.
570      $b->{AGE}   <=>  $a->{AGE}
572      $a->{NAME}  cmp  $b->{NAME}
577  my $coll = Unicode::Collate->new();
579      $rec->{NAME_key} = $coll->getSortKey( $rec->{NAME} );
582      $b->{AGE}       <=>  $a->{AGE}
584      $a->{NAME_key}  cmp  $b->{NAME_key}
587 =head2 ℞ 39: Case- I<and> accent-insensitive comparisons
593  my $es = Unicode::Collate->new(
599  $es->eq("García",  "GARCIA" );
600  $es->eq("Márquez", "MARQUEZ");
602 =head2 ℞ 40: Case- I<and> accent-insensitive locale comparisons
606  my $de = Unicode::Collate::Locale->new(
611  $de->eq("tschüß", "TSCHUESS");  # notice ü => UE, ß => SS
617  # cpan -i Unicode::LineBreak
622  my $fmt = Unicode::LineBreak->new;
623  print $fmt->break($para), "\n";
638     my $enc_key   = encode("UTF-8", $uni_key, 1);
639     my $enc_value = encode("UTF-8", $uni_value, 1);
645     my $enc_key   = encode("UTF-8", $uni_key, 1);
647     my $uni_value = decode("UTF-8", $enc_value, 1);
659     $dbobj->Filter_Value("utf8");  # this is the magic bit
673 Here’s a full program showing how to make use of locale-sensitive
701  # umenu - demo sorting and printing of Unicode food
708  use open      qw(:std :encoding(UTF-8)); # undeclared streams in UTF-8
731      "シュークリーム"    => 1.85, # cream-filled pastry like eclair
747  my $coll  = Unicode::Collate::Locale->new(locale => "ja");
749  for my $item ($coll->sort(keys %price)) {
755      return $str . ($padchar x ($width - colwidth($str)));
759      return Unicode::GCString->new($str)->columns;
791 I<uniquote> instead of I<cat -v> or I<hexdump>,
799 It also supplies these programs, all of which are general filters that do Unicode-y things:
845 Christiansen <et al.>, 2012-02-13 by O’Reilly Media.  The code itself is
853 v1.0.0 – first public release, 2012-02-27