xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/lib/Encode/Unicode/UTF7.pm (revision 9f11ffb7133c203312a01e4b986886bc88c7d74b)
1#
2# $Id: UTF7.pm,v 2.10 2017/06/10 17:23:50 dankogai Exp $
3#
4package Encode::Unicode::UTF7;
5use strict;
6use warnings;
7use parent qw(Encode::Encoding);
8__PACKAGE__->Define('UTF-7');
9our $VERSION = do { my @r = ( q$Revision: 2.10 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
10use MIME::Base64;
11use Encode qw(find_encoding);
12
13#
14# Algorithms taken from Unicode::String by Gisle Aas
15#
16
17our $OPTIONAL_DIRECT_CHARS = 1;
18my $specials = quotemeta "\'(),-./:?";
19$OPTIONAL_DIRECT_CHARS
20  and $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}";
21
22# \s will not work because it matches U+3000 DEOGRAPHIC SPACE
23# We use qr/[\n\r\t\ ] instead
24my $re_asis    = qr/(?:[\n\r\t\ A-Za-z0-9$specials])/;
25my $re_encoded = qr/(?:[^\n\r\t\ A-Za-z0-9$specials])/;
26my $e_utf16    = find_encoding("UTF-16BE");
27
28sub needs_lines { 1 }
29
30sub encode($$;$) {
31    my ( $obj, $str, $chk ) = @_;
32    return undef unless defined $str;
33    my $len = length($str);
34    pos($str) = 0;
35    my $bytes = substr($str, 0, 0); # to propagate taintedness
36    while ( pos($str) < $len ) {
37        if ( $str =~ /\G($re_asis+)/ogc ) {
38	    my $octets = $1;
39	    utf8::downgrade($octets);
40	    $bytes .= $octets;
41        }
42        elsif ( $str =~ /\G($re_encoded+)/ogsc ) {
43            if ( $1 eq "+" ) {
44                $bytes .= "+-";
45            }
46            else {
47                my $s = $1;
48                my $base64 = encode_base64( $e_utf16->encode($s), '' );
49                $base64 =~ s/=+$//;
50                $bytes .= "+$base64-";
51            }
52        }
53        else {
54            die "This should not happen! (pos=" . pos($str) . ")";
55        }
56    }
57    $_[1] = '' if $chk;
58    return $bytes;
59}
60
61sub decode($$;$) {
62    use re 'taint';
63    my ( $obj, $bytes, $chk ) = @_;
64    return undef unless defined $bytes;
65    my $len = length($bytes);
66    my $str = substr($bytes, 0, 0); # to propagate taintedness;
67    pos($bytes) = 0;
68    no warnings 'uninitialized';
69    while ( pos($bytes) < $len ) {
70        if ( $bytes =~ /\G([^+]+)/ogc ) {
71            $str .= $1;
72        }
73        elsif ( $bytes =~ /\G\+-/ogc ) {
74            $str .= "+";
75        }
76        elsif ( $bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc ) {
77            my $base64 = $1;
78            my $pad    = length($base64) % 4;
79            $base64 .= "=" x ( 4 - $pad ) if $pad;
80            $str .= $e_utf16->decode( decode_base64($base64) );
81        }
82        elsif ( $bytes =~ /\G\+/ogc ) {
83            $^W and warn "Bad UTF7 data escape";
84            $str .= "+";
85        }
86        else {
87            die "This should not happen " . pos($bytes);
88        }
89    }
90    $_[1] = '' if $chk;
91    return $str;
92}
931;
94__END__
95
96=head1 NAME
97
98Encode::Unicode::UTF7 -- UTF-7 encoding
99
100=head1 SYNOPSIS
101
102    use Encode qw/encode decode/;
103    $utf7 = encode("UTF-7", $utf8);
104    $utf8 = decode("UTF-7", $ucs2);
105
106=head1 ABSTRACT
107
108This module implements UTF-7 encoding documented in RFC 2152.  UTF-7,
109as its name suggests, is a 7-bit re-encoded version of UTF-16BE.  It
110is designed to be MTA-safe and expected to be a standard way to
111exchange Unicoded mails via mails.  But with the advent of UTF-8 and
1128-bit compliant MTAs, UTF-7 is hardly ever used.
113
114UTF-7 was not supported by Encode until version 1.95 because of that.
115But Unicode::String, a module by Gisle Aas which adds Unicode supports
116to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added
117so Encode can supersede Unicode::String 100%.
118
119=head1 In Practice
120
121When you want to encode Unicode for mails and web pages, however, do
122not use UTF-7 unless you are sure your recipients and readers can
123handle it.  Very few MUAs and WWW Browsers support these days (only
124Mozilla seems to support one).  For general cases, use UTF-8 for
125message body and MIME-Header for header instead.
126
127=head1 SEE ALSO
128
129L<Encode>, L<Encode::Unicode>, L<Unicode::String>
130
131RFC 2781 L<http://www.ietf.org/rfc/rfc2152.txt>
132
133=cut
134