xref: /openbsd-src/gnu/usr.bin/perl/t/re/script_run.t (revision 56d68f1e19ff848c889ecfa71d3a06340ff64892)
15759b3d2Safresh1BEGIN {
25759b3d2Safresh1    chdir 't' if -d 't';
35759b3d2Safresh1    require './test.pl';
45759b3d2Safresh1    set_up_inc('../lib');
55759b3d2Safresh1}
65759b3d2Safresh1
75759b3d2Safresh1use strict;
85759b3d2Safresh1use warnings;
95759b3d2Safresh1
105759b3d2Safresh1$|=1;
115759b3d2Safresh1
125759b3d2Safresh1# The Script_Extension property has only recently become reasonably stable, so
135759b3d2Safresh1# later Unicode releases may change things.   Some of these tests were
145759b3d2Safresh1# designed to provide more code covereage in regexec.c, so changes in it or
155759b3d2Safresh1# later Standards could cause them to not test what they originally were aimed
165759b3d2Safresh1# to do.
175759b3d2Safresh1
185759b3d2Safresh1# Since there's so few tests currently, we can afford to try each syntax on
195759b3d2Safresh1# all of them
205759b3d2Safresh1foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') {
215759b3d2Safresh1    my $script_run;
225759b3d2Safresh1    eval '$script_run = qr/ ^ (*$type: .* ) $ /x;';
235759b3d2Safresh1
245759b3d2Safresh1    unlike("\N{CYRILLIC SMALL LETTER ER}\N{CYRILLIC SMALL LETTER A}\N{CYRILLIC SMALL LETTER U}}\N{CYRILLIC SMALL LETTER ER}\N{CYRILLIC SMALL LETTER A}l", $script_run, "Cyrillic 'paypal' with a Latin 'l' is not a script run");
255759b3d2Safresh1    unlike("A\N{GREEK CAPITAL LETTER GAMMA}", $script_run, "Latin followed by Greek isn't a script run");
265759b3d2Safresh1
275759b3d2Safresh1    like("\N{CYRILLIC THOUSANDS SIGN}\N{COMBINING CYRILLIC TITLO}", $script_run, "Cyrillic followed by Permic-Arabic is Arabic");
285759b3d2Safresh1    like("\N{OLD PERMIC LETTER AN}\N{COMBINING CYRILLIC TITLO}", $script_run, "Permic followed by Permic-Arabic is Permic");
295759b3d2Safresh1    unlike("\N{GLAGOLITIC CAPITAL LETTER AZU}\N{COMBINING CYRILLIC TITLO}", $script_run, "Glagolithic followed by Permic-Arabic isn't a script run");
305759b3d2Safresh1
315759b3d2Safresh1    like("\N{CYRILLIC THOUSANDS SIGN}\N{COMBINING CYRILLIC PALATALIZATION}", $script_run, "Cyrillic followed by Glagolithic-Arabic is Arabic");
325759b3d2Safresh1    like("\N{GLAGOLITIC CAPITAL LETTER AZU}\N{COMBINING CYRILLIC PALATALIZATION}", $script_run, "Glagolithic followed by Glagolithic-Arabic is Glagolithic");
335759b3d2Safresh1    unlike("\N{OLD PERMIC LETTER AN}\N{COMBINING CYRILLIC PALATALIZATION}", $script_run, "Permic followed by Glagolithic-Arabic isn't a script run");
345759b3d2Safresh1
355759b3d2Safresh1    like("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}\N{ARABIC-INDIC DIGIT TWO}\N{ARABIC-INDIC DIGIT THREE}\N{ARABIC COMMA}\N{ARABIC-INDIC DIGIT FOUR}\N{THAANA LETTER HAA}", $script_run, "Arabic-Thaana chars followed by Thaana is Thaana");
365759b3d2Safresh1    unlike("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}A", $script_run, "Arabic-Thaana chars followed by Latin isn't a script run");
375759b3d2Safresh1    like("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}\N{ARABIC-INDIC DIGIT TWO}\N{ARABIC-INDIC DIGIT THREE}\N{ARABIC COMMA}\N{ARABIC-INDIC DIGIT FOUR}\N{ARABIC NUMBER SIGN}", $script_run, "Arabic-Thaana chars followed by Arabic is Arabic");
385759b3d2Safresh1    unlike("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}\N{ARABIC-INDIC DIGIT TWO}\N{ARABIC-INDIC DIGIT THREE}\N{EXTENDED ARABIC-INDIC DIGIT NINE}", $script_run, "Arabic-Thaana digits followed by an Arabic digit from a different sequence isn't a script run");
395759b3d2Safresh1    like("\N{ARABIC-INDIC DIGIT ZERO}\N{ARABIC-INDIC DIGIT ONE}\N{ARABIC-INDIC DIGIT TWO}\N{ARABIC-INDIC DIGIT THREE}\N{THAANA LETTER HAA}", $script_run, "Arabic-Thaana digits followed by a Thaana leter is a script run");
405759b3d2Safresh1
415759b3d2Safresh1    # The next tests are at a hard-coded boundary in regexec.c at the time of this
425759b3d2Safresh1    # writing (U+02B9/02BA).
435759b3d2Safresh1    like("abc\N{MODIFIER LETTER SMALL Y}", $script_run, "All Latin is a script run");
445759b3d2Safresh1    like("abc\N{MODIFIER LETTER PRIME}", $script_run, "Latin then Common is a script run");
455759b3d2Safresh1    like(":a", $script_run, "Common then Latin is a script run");
465759b3d2Safresh1    like("-\N{SINHALA LETTER RAYANNA}", $script_run, "Common then Sinhala (which has its own 0) is a script run");
475759b3d2Safresh1
485759b3d2Safresh1    like("\N{HEBREW LETTER ALEF}\N{HEBREW LETTER TAV}\N{MODIFIER LETTER PRIME}", $script_run, "Hebrew then Common is a script run");
495759b3d2Safresh1    unlike("\N{HEBREW LETTER ALEF}\N{HEBREW LETTER TAV}\N{MODIFIER LETTER SMALL Y}", $script_run, "Hebrew then Latin isn't a script run");
505759b3d2Safresh1    like("9876543210\N{DESERET SMALL LETTER WU}", $script_run, "0-9 are the digits for Deseret");
515759b3d2Safresh1    like("\N{DESERET SMALL LETTER WU}9876543210", $script_run, "Also when they aren't in the initial position");
52e603c72fSafresh1    like("\N{DESERET SMALL LETTER WU}\N{FULLWIDTH DIGIT FIVE}", $script_run, "Fullwidth digits may be digits for Deseret");
53e603c72fSafresh1    like("\N{FULLWIDTH DIGIT SIX}\N{DESERET SMALL LETTER LONG I}", $script_run, "... likewise if the digits come first");
545759b3d2Safresh1
555759b3d2Safresh1    like("1234567890\N{ARABIC LETTER ALEF}", $script_run, "[0-9] work for Arabic");
565759b3d2Safresh1    unlike("1234567890\N{ARABIC LETTER ALEF}\N{ARABIC-INDIC DIGIT FOUR}\N{ARABIC-INDIC DIGIT FIVE}", $script_run, "... but not in combination with real ARABIC digits");
575759b3d2Safresh1    unlike("\N{ARABIC LETTER ALEF}\N{ARABIC-INDIC DIGIT SIX}\N{ARABIC-INDIC DIGIT SEVEN}1", $script_run, "... nor when the ARABIC digits come before them");
585759b3d2Safresh1
595759b3d2Safresh1    # This exercises the case where the script zero but not the script is
605759b3d2Safresh1    # ambiguous until a non-ambiguous digit is found.
615759b3d2Safresh1    like("\N{ARABIC LETTER ALEF}\N{EXTENDED ARABIC-INDIC DIGIT EIGHT}", $script_run, "ARABIC with a Shia digit is a script run");
625759b3d2Safresh1
635759b3d2Safresh1    like("\N{U+03A2}", $script_run, "A single unassigned code point is a script run");
645759b3d2Safresh1    unlike("\N{U+03A2}\N{U+03A2}", $script_run, "But not more than one");
655759b3d2Safresh1    unlike("A\N{U+03A2}", $script_run, "... and not in combination with an assigned one");
665759b3d2Safresh1    unlike("\N{U+03A2}A", $script_run, "... in either order");
675759b3d2Safresh1    unlike("\N{U+03A2}0", $script_run, "... nor with a digit following");
685759b3d2Safresh1
695759b3d2Safresh1    like("A\N{COMBINING GRAVE ACCENT}", $script_run, "An inherited script matches others");
705759b3d2Safresh1    like("\N{COMBINING GRAVE ACCENT}A", $script_run, "... even if first in the sequence");
715759b3d2Safresh1
725759b3d2Safresh1    like("\N{COMBINING TILDE}\N{COMBINING GRAVE ACCENT}", $script_run, "A script containing only inherited characters matches");
735759b3d2Safresh1
745759b3d2Safresh1    like("\N{DEVANAGARI DOUBLE DANDA}\N{DEVANAGARI DANDA}\N{DEVANAGARI STRESS SIGN UDATTA}\N{DEVANAGARI STRESS SIGN ANUDATTA}\N{NORTH INDIC FRACTION ONE QUARTER}\N{NORTH INDIC QUANTITY MARK}", $script_run, "A bunch of narrowing down of multiple possible scripts");
755759b3d2Safresh1
765759b3d2Safresh1    unlike("\N{JAVANESE PANGRANGKEP}\N{GEORGIAN PARAGRAPH SEPARATOR}", $script_run, "Two code points each in multiple scripts that don't intersect aren't a script run");
775759b3d2Safresh1    like("\N{DEVANAGARI SIGN CANDRABINDU VIRAMA}\N{VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA}", $script_run, "Two code points each in multiple scripts that 't intersect singly are a script run");
785759b3d2Safresh1
795759b3d2Safresh1    like("", $script_run, "An empty string is a script run");
805759b3d2Safresh1
815759b3d2Safresh1    use utf8;
825759b3d2Safresh1
835759b3d2Safresh1    # From UTS 39
845759b3d2Safresh1    like("写真だけの結婚式", $script_run, "Mixed Hiragana and Han");
855759b3d2Safresh1
865759b3d2Safresh1    unlike "\N{THAI DIGIT FIVE}1", $script_run, "Thai digit followed by '1'";
875759b3d2Safresh1    unlike "1\N{THAI DIGIT FIVE}", $script_run, "'1' followed by Thai digit ";
885759b3d2Safresh1    unlike "\N{BENGALI DIGIT ZERO}\N{CHAKMA DIGIT SEVEN}", $script_run,
895759b3d2Safresh1           "Two digits in same extended script but from different sets of 10";
905759b3d2Safresh1}
915759b3d2Safresh1
925759b3d2Safresh1    # Until fixed, this was skipping the '['
93b46d8ef2Safresh1    unlike("abc]c", qr/^ (*sr:a(*sr:[bc]*)c) $/x,
94b46d8ef2Safresh1           "Doesn't skip parts of exact matches");
955759b3d2Safresh1
965759b3d2Safresh1    like("abc", qr/(*asr:a[bc]*c)/, "Outer asr works on a run");
97b46d8ef2Safresh1    unlike("abc", qr/(*asr:a(*asr:[bc]*)c)/,
98b46d8ef2Safresh1           "Nested asr works to exclude some things");
99b46d8ef2Safresh1
100b46d8ef2Safresh1    like("\x{0980}12\x{0993}", qr/^(*sr:.{4})/,
101b46d8ef2Safresh1         "Script with own zero works with ASCII digits"); # perl #133547
102b46d8ef2Safresh1    like("\x{3041}12\x{3041}", qr/^(*sr:.{4})/,
103b46d8ef2Safresh1         "Script without own zero works with ASCII digits");
1045759b3d2Safresh1
105e603c72fSafresh1    like("A\x{ff10}\x{ff19}B", qr/^(*sr:.{4})/,
106e603c72fSafresh1         "Non-ASCII Common digits work with Latin"); # perl #133547
107e603c72fSafresh1    like("A\x{ff10}BC", qr/^(*sr:.{4})/,
108e603c72fSafresh1         "Non-ASCII Common digits work with Latin"); # perl #133547
109e603c72fSafresh1    like("A\x{1d7ce}\x{1d7cf}B", qr/^(*sr:.{4})/,
110e603c72fSafresh1         "Non-ASCII Common digits work with Latin"); # perl #133547
111e603c72fSafresh1    like("A\x{1d7ce}BC", qr/^(*sr:.{4})/,
112e603c72fSafresh1         "Non-ASCII Common digits work with Latin"); # perl #133547
113e603c72fSafresh1    like("\x{1d7ce}\x{1d7cf}AB", qr/^(*sr:.{4})/,
114e603c72fSafresh1         "Non-ASCII Common digits work with Latin"); # perl #133547
115e603c72fSafresh1    like("α\x{1d7ce}βγ", qr/^(*sr:.{4})/,
116e603c72fSafresh1         "Non-ASCII Common digits work with Greek"); # perl #133547
117e603c72fSafresh1    like("\x{1d7ce}αβγ", qr/^(*sr:.{4})/,
118e603c72fSafresh1         "Non-ASCII Common digits work with Greek"); # perl #133547
119e603c72fSafresh1
120*56d68f1eSafresh1    fresh_perl_is('print scalar "0" =~ m!(((*sr:()|)0)(*sr:)0|)!;',
121b46d8ef2Safresh1                  1, {}, '[perl #133997]');
122b46d8ef2Safresh1
1235759b3d2Safresh1done_testing();
124