xref: /openbsd-src/gnu/usr.bin/perl/lib/unicore/README.perl (revision 99fd087599a8791921855f21bd7e36130f39aadc)
1# The goal is for perl to compile and reasonably run any version of Unicode.
2# Working reasonably well doesn't mean that the test suite will run without
3# showing errors.  A few of the very-Unicode specific test files have been
4# modified to account for different versions, but most have not.  For example,
5# some tests use characters that aren't encoded in all Unicode versions; others
6# have hard-coded the General Categories for a code point that were correct at
7# the time the test was written.  Perl itself will not compile under Unicode
8# releases prior to 3.0 without a simple change to Unicode::Normalize.
9# mktables contains instructions for this.
10
11# The *.txt files were copied from
12
13# 	ftp://www.unicode.org/Public/UNIDATA
14
15# (which always points to the latest version) with subdirectories 'extracted' and
16# 'auxiliary'.  Older versions are located under Public with an appropriate name.
17# They are also available via http at www.unicode.org/versions/
18#
19
20# The Unihan files were not included due to space considerations.  Also NOT
21# included were any *.html files.  It is possible to add the Unihan files and
22# have some properties from them automatically compiled.  By editing mktables
23# (see instructions near its beginning) you can add other Unihan properties.
24
25# The file named 'version' should exist and be a single line with the Unicode
26# version, like:
27#
28# 5.2.0
29#
30# (without the initial '# ')
31
32# To be 8.3 filesystem friendly, the names of some of the input files have been
33# changed from the values that are in the Unicode DB.  Not all of the Test
34# files are currently used, and hence may not be present, so some of the mv's
35# below can fail.  The .html Test files are not touched.
36
37mv PropertyValueAliases.txt PropValueAliases.txt
38mv NamedSequencesProv.txt NamedSqProv.txt
39mv NormalizationTest.txt NormTest.txt    # a Test file: may be absent, in which case this mv fails
40mv DerivedAge.txt DAge.txt
41mv DerivedCoreProperties.txt DCoreProperties.txt
42mv DerivedNormalizationProps.txt DNormalizationProps.txt
43
44# Some early releases don't have the 'extracted' directory; for those, create
45# it and move these files from the current directory into it.
46mkdir extracted 2>/dev/null  # error output discarded; presumably because the directory usually already exists
47mv DerivedBidiClass.txt DerivedBinaryProperties.txt extracted 2>/dev/null
48mv DerivedCombiningClass.txt DerivedDecompositionType.txt extracted 2>/dev/null
49mv DerivedEastAsianWidth.txt DerivedGeneralCategory.txt extracted 2>/dev/null
50mv DerivedJoiningGroup.txt DerivedJoiningType.txt extracted 2>/dev/null
51mv DerivedLineBreak.txt DerivedNumericType.txt DerivedNumericValues.txt extracted 2>/dev/null
52
53mv extracted/DerivedBidiClass.txt extracted/DBidiClass.txt
54mv extracted/DerivedBinaryProperties.txt extracted/DBinaryProperties.txt
55mv extracted/DerivedCombiningClass.txt extracted/DCombiningClass.txt
56mv extracted/DerivedDecompositionType.txt extracted/DDecompositionType.txt
57mv extracted/DerivedEastAsianWidth.txt extracted/DEastAsianWidth.txt
58mv extracted/DerivedGeneralCategory.txt extracted/DGeneralCategory.txt
59mv extracted/DerivedJoiningGroup.txt extracted/DJoinGroup.txt
60mv extracted/DerivedJoiningType.txt extracted/DJoinType.txt
61mv extracted/DerivedLineBreak.txt extracted/DLineBreak.txt
62mv extracted/DerivedNumericType.txt extracted/DNumType.txt
63mv extracted/DerivedNumericValues.txt extracted/DNumValues.txt
64mv extracted/DerivedName.txt extracted/DName.txt
65rmdir extracted 2>/dev/null     # Will fail if non-empty; but if it is empty, this
66                                # was an early release that didn't have it.
67
68mv auxiliary/GraphemeBreakTest.txt auxiliary/GCBTest.txt  # Test files: any that are absent make their mv fail
69mv auxiliary/LineBreakTest.txt auxiliary/LBTest.txt
70mv auxiliary/SentenceBreakTest.txt auxiliary/SBTest.txt
71mv auxiliary/WordBreakTest.txt auxiliary/WBTest.txt
72
73# If you have the Unihan database (5.2 and above), you should also do the
74# following:
75
76mv Unihan_DictionaryIndices.txt UnihanIndicesDictionary.txt  # Unihan renames apply only if those files were downloaded
77mv Unihan_DictionaryLikeData.txt UnihanDataDictionaryLike.txt
78mv Unihan_IRGSources.txt UnihanIRGSources.txt
79mv Unihan_NumericValues.txt UnihanNumericValues.txt
80mv Unihan_OtherMappings.txt UnihanOtherMappings.txt
81mv Unihan_RadicalStrokeCounts.txt UnihanRadicalStrokeCounts.txt
82mv Unihan_Readings.txt UnihanReadings.txt
83mv Unihan_Variants.txt UnihanVariants.txt
84
85# If you download everything, the names of files that are not used by mktables
86# are not changed by the above, and hence may not work correctly as-is on 8.3
87# filesystems.
88
89# mktables is used to generate the tables used by the rest of Perl.  It will
90# warn you about any *.txt and *.html files in the directory substructure that
91# it doesn't know about.  You should remove any so-identified, or edit mktables
92# to add them to its lists to process.  You can run
93#
94#    mktables -globlist
95#
96# to have it try to process these tables generically.
97
98# COMPILING ON OLDER UNICODE VERSIONS
99#
100# To compile perl for use with an older Unicode release, delete everything in
101# the lib/unicore directory except mktables and Makefile.  Then download the
102# Unicode-supplied files for the desired version to that directory  (A url for
103# these is given earlier in this file).  Then create the 'version' file with a
104# single line, like '6.1.0'.  Do a 'make test' from the project level.  You
105# will get some porting errors about needing to regen.  Regenerate what it tells
106# you is needed, and 'make test' again.  If you compile an old enough version,
107# you will also have to download a few files from later Unicode versions,
108# following the instructions that will be given if warranted.  It should
109# compile in any release without warnings, except for some casing conflicts in
110# Unicode 2.1.8, and extraneous files of the form qr/diff.*\.txt/ in very early
111# releases.  If you add Unihan.txt, one line of it is in error in at least one release.
112#
113# Other glitches are noted in mktables under 'UNICODE VERSIONS NOTES'
114
115# FOR PUMPKINS
116#
117# The files are inter-related.  If you take the latest UnicodeData.txt, for
118# example, but leave the older versions of other files, there can be subtle
119# problems.  So get everything available from Unicode, and delete those which
120# aren't needed.
121#
122# When moving to a new version of Unicode, you need to update 'version' by hand
123#
124#	p4 edit version
125# 	...
126#
127# You should look in the Unicode release notes (which are probably towards the
128# bottom of http://www.unicode.org/reports/tr44/) to see if any properties have
129# newly been moved to be Obsolete, Deprecated, or Stabilized.  The full names
130# for these should be added to the respective lists near the beginning of
131# mktables, using an 'if' to add them for just this Unicode version going
132# forward, so that mktables can continue to be used for earlier Unicode
133# versions.
134#
135# When putting out a new Perl release, consider whether any of the Deprecated
136# properties should be moved to Suppressed.
137#
138# perlrecharclass.pod has a list of all the characters that are white space,
139# which needs to be updated if there are changes.  A quick way to check if
140# there have been changes would be to see if the number of such characters
141# listed in perluniprops.pod (generated by running mktables) for the property
142# \p{White_Space} is no longer 25.  Further investigation would then be
143# necessary to classify the new characters as horizontal or vertical.
144#
145# The code in regexec.c for the \X match construct is intimately tied to the
146# regular expression in UAX #29 (http://www.unicode.org/reports/tr29/).  You
147# should see if it has changed, and if so, regexec.c should be modified.  The
148# current one is
149# ( CRLF
150# | Prepend* ( RI-sequence | Hangul-Syllable | !Control )
151#   ( Grapheme_Extend | SpacingMark )*
152# | . )
153#
154# mktables has many checks to warn you if there are unexpected or novel things
155# that it doesn't know how to handle.
156#
157# Module::CoreList should be changed to include the new release
158#
159# Also, you should regen l1_char_class_tab.h, by
160#
161# perl regen/mk_PL_charclass.pl
162#
163# and, regen charclass_invlists.h by
164#
165# perl regen/mk_invlists.pl
166#
167# Finally:
168#
169# 	p4 submit
170#
171# --
172# jhi@iki.fi; updated by nick@ccl4.org, public@khwilliamson.com
173