lib/unicore/README.perl

   1 # Perl should compile and reasonably run any version of Unicode.  That doesn't
   2 # mean that the test suite will run without showing errors.  A few of the
   3 # very-Unicode specific test files have been modified to account for different
   4 # versions, but most have not.  For example, some tests use characters that
   5 # aren't encoded in all Unicode versions; others have hard-coded the General
   6 # Categories that were correct at the time the test was written.  Perl itself
   7 # will not compile under Unicode releases prior to 3.0 without a simple change to
   8 # Unicode::Normalize.  mktables contains instructions for this, as well as other
   9 # hints for using older Unicode versions.
  10
  11 # The *.txt files were copied from
  12
  13 #       ftp://www.unicode.org/Public/UNIDATA
  14
  15 # (which always points to the latest version) with subdirectories 'extracted' and
  16 # 'auxiliary'.  Older versions are located under Public with an appropriate name.
  17
  18 # The Unihan files were not included due to space considerations.  Also NOT
  19 # included were any *.html files.  It is possible to add the Unihan files, and
  20 # edit mktables (see instructions near its beginning) to look at them.
  21
  22 # The file named 'version' should exist and be a single line with the Unicode
  23 # version, like:
  24 # 5.2.0
  25
  26 # To be 8.3 filesystem friendly, the names of some of the input files have been
  27 # changed from the values that are in the Unicode DB.  Not all of the Test
  28 # files are currently used, and hence some may not be present, in which case
  29 # the corresponding mv's below will fail.  The .html Test files are not touched.
  30
  31 mv PropertyValueAliases.txt PropValueAliases.txt
  32 mv NamedSequencesProv.txt NamedSqProv.txt
  33 mv NormalizationTest.txt NormTest.txt
  34 mv DerivedAge.txt DAge.txt
  35 mv DerivedCoreProperties.txt DCoreProperties.txt
  36 mv DerivedNormalizationProps.txt DNormalizationProps.txt
  37
  38 # Some early releases don't have the 'extracted' directory; for those, create
  39 # it and move these files into it.
  40 mkdir extracted 2>/dev/null
  41 mv DerivedBidiClass.txt DerivedBinaryProperties.txt extracted 2>/dev/null
  42 mv DerivedCombiningClass.txt DerivedDecompositionType.txt extracted 2>/dev/null
  43 mv DerivedEastAsianWidth.txt DerivedGeneralCategory.txt extracted 2>/dev/null
  44 mv DerivedJoiningGroup.txt DerivedJoiningType.txt extracted 2>/dev/null
  45 mv DerivedLineBreak.txt DerivedNumericType.txt DerivedNumericValues.txt extracted 2>/dev/null
  46
  47 mv extracted/DerivedBidiClass.txt extracted/DBidiClass.txt
  48 mv extracted/DerivedBinaryProperties.txt extracted/DBinaryProperties.txt
  49 mv extracted/DerivedCombiningClass.txt extracted/DCombiningClass.txt
  50 mv extracted/DerivedDecompositionType.txt extracted/DDecompositionType.txt
  51 mv extracted/DerivedEastAsianWidth.txt extracted/DEastAsianWidth.txt
  52 mv extracted/DerivedGeneralCategory.txt extracted/DGeneralCategory.txt
  53 mv extracted/DerivedJoiningGroup.txt extracted/DJoinGroup.txt
  54 mv extracted/DerivedJoiningType.txt extracted/DJoinType.txt
  55 mv extracted/DerivedLineBreak.txt extracted/DLineBreak.txt
  56 mv extracted/DerivedNumericType.txt extracted/DNumType.txt
  57 mv extracted/DerivedNumericValues.txt extracted/DNumValues.txt
  58
  59 mv auxiliary/GraphemeBreakTest.txt auxiliary/GCBTest.txt
  60 mv auxiliary/LineBreakTest.txt auxiliary/LBTest.txt
  61 mv auxiliary/SentenceBreakTest.txt auxiliary/SBTest.txt
  62 mv auxiliary/WordBreakTest.txt auxiliary/WBTest.txt
  63
  64 # If you have the Unihan database (5.2 and above), you should also do the
  65 # following:
  66
  67 mv Unihan_DictionaryIndices.txt UnihanIndicesDictionary.txt
  68 mv Unihan_DictionaryLikeData.txt UnihanDataDictionaryLike.txt
  69 mv Unihan_IRGSources.txt UnihanIRGSources.txt
  70 mv Unihan_NumericValues.txt UnihanNumericValues.txt
  71 mv Unihan_OtherMappings.txt UnihanOtherMappings.txt
  72 mv Unihan_RadicalStrokeCounts.txt UnihanRadicalStrokeCounts.txt
  73 mv Unihan_Readings.txt UnihanReadings.txt
  74 mv Unihan_Variants.txt UnihanVariants.txt
  75
  76 # If you download everything, the names of files that are not used by mktables
  77 # are not changed by the above, and hence may not work correctly as-is on 8.3
  78 # filesystems.
  79
  80 # mktables is used to generate the tables used by the rest of Perl.  It will
  81 # warn you about any *.txt files in the directory substructure that it doesn't
  82 # know about.  You should remove any so-identified, or edit mktables to add
  83 # them to its lists to process.  You can run
  84 #
  85 #    mktables -globlist
  86 #
  87 # to have it try to process these tables generically.
  88 #
  89 # FOR PUMPKINS
  90 #
  91 # The files are inter-related.  If you take the latest UnicodeData.txt, for
  92 # example, but leave the older versions of other files, there can be subtle
  93 # problems.  So get everything available from Unicode, and delete those which
  94 # aren't needed.
  95 #
  96 # When moving to a new version of Unicode, you need to update 'version' by hand:
  97 #
  98 #       p4 edit version
  99 #       ...
 100 #
 101 # You should look in the Unicode release notes (which are probably towards the
 102 # bottom of http://www.unicode.org/reports/tr44/) to see if any properties have
 103 # newly been moved to be Obsolete, Deprecated, or Stabilized.  The full names
 104 # for these should be added to the respective lists near the beginning of
 105 # mktables, using an 'if' to add them for just this Unicode version going
 106 # forward, so that mktables can continue to be used for earlier Unicode
 107 # versions.
 108 #
 109 # When putting out a new Perl release, consider whether any of the Deprecated
 110 # properties should be moved to Suppressed.
 111 #
 112 # perlrecharclass.pod has a list of all the characters that are white space,
 113 # which needs to be updated if there are changes.  A quick way to check if
 114 # there have been changes would be to see if the number of such characters
 115 # listed in perluniprops.pod (generated by running mktables) for the property
 116 # \p{White_Space} is no longer 25.  Further investigation would then be
 117 # necessary to classify the new characters as horizontal or vertical.
 118 #
 119 # The code in regexec.c for the \X match construct is intimately tied to the
 120 # regular expression in UAX #29 (http://www.unicode.org/reports/tr29/).  You
 121 # should see if it has changed, and if so, regexec.c should be modified.  The
 122 # current one is
 123 # ( CRLF
 124 # | Prepend* ( RI-sequence | Hangul-Syllable | !Control )
 125 #   ( Grapheme_Extend | SpacingMark )*
 126 # | . )
 127 #
 128 # mktables has many checks to warn you if there are unexpected or novel things
 129 # that it doesn't know how to handle.
 130 #
 131 # Module::CoreList should be changed to include the new release.
 132 #
 133 # Also, you should regenerate l1_char_class_tab.h by running
 134 #
 135 # perl regen/mk_L_charclass.pl
 136 #
 137 # and regenerate charclass_invlists.h by running
 138 #
 139 # perl regen/mk_invlists.pl
 140 #
 141 # Finally:
 142 #
 143 #       p4 submit
 144 #
 145 # --
 146 # jhi@iki.fi; updated by nick@ccl4.org, public@khwilliamson.com