Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | * Copyright (C) 2004-2013, International Business Machines |
michael@0 | 2 | * Corporation and others. All Rights Reserved. |
michael@0 | 3 | * |
michael@0 | 4 | * file name: changes.txt |
michael@0 | 5 | * encoding: US-ASCII |
michael@0 | 6 | * tab size: 8 (not used) |
michael@0 | 7 | * indentation:4 |
michael@0 | 8 | * |
michael@0 | 9 | * created on: 2004may06 |
michael@0 | 10 | * created by: Markus W. Scherer |
michael@0 | 11 | * |
michael@0 | 12 | * change log for Unicode updates |
michael@0 | 13 | |
michael@0 | 14 | ---------------------------------------------------------------------------- *** |
michael@0 | 15 | |
michael@0 | 16 | Unicode 6.3 update |
michael@0 | 17 | |
michael@0 | 18 | http://www.unicode.org/review/pri249/ -- beta review |
michael@0 | 19 | http://www.unicode.org/reports/uax-proposed-updates.html |
michael@0 | 20 | http://www.unicode.org/versions/beta-6.3.0.html#notable_issues |
michael@0 | 21 | http://www.unicode.org/reports/tr44/tr44-11.html |
michael@0 | 22 | |
michael@0 | 23 | *** ICU Trac |
michael@0 | 24 | |
michael@0 | 25 | - ticket 10128: update ICU to Unicode 6.3 beta |
michael@0 | 26 | - ticket 10168: update ICU to Unicode 6.3 final |
michael@0 | 27 | - C++ branches/markus/uni63 at r33552 from trunk at r33551 |
michael@0 | 28 | - Java branches/markus/uni63 at r33550 from trunk at r33553 |
michael@0 | 29 | |
michael@0 | 30 | - ticket 10142: implement Unicode 6.3 bidi algorithm additions |
michael@0 | 31 | |
michael@0 | 32 | *** Unicode version numbers |
michael@0 | 33 | - makedata.mak |
michael@0 | 34 | - uchar.h |
michael@0 | 35 | (configure.in & configure have been modified to extract the version from uchar.h) |
michael@0 | 36 | - com.ibm.icu.util.VersionInfo |
michael@0 | 37 | - com.ibm.icu.dev.test.lang.UCharacterTest.VERSION_ |
michael@0 | 38 | |
michael@0 | 39 | - Run ICU4C "configure" _after_ updating the Unicode version number in uchar.h |
michael@0 | 40 | so that the makefiles see the new version number. |
michael@0 | 41 | |
michael@0 | 42 | *** data files & enums & parser code |
michael@0 | 43 | |
michael@0 | 44 | * file preparation |
michael@0 | 45 | |
michael@0 | 46 | - download UCD, UCA & IDNA files |
michael@0 | 47 | - make sure that the Unicode data folder passed into preparseucd.py |
michael@0 | 48 | includes a copy of the latest IdnaMappingTable.txt (can be in some subfolder) |
michael@0 | 49 | - modify preparseucd.py: |
michael@0 | 50 | parse new file BidiBrackets.txt |
michael@0 | 51 | with new properties bpb=Bidi_Paired_Bracket and bpt=Bidi_Paired_Bracket_Type |
michael@0 | 52 | - ~/svn.icutools/trunk/src/unicode$ py/preparseucd.py ~/unidata/uni63/20130425 ~/svn.icu/uni63/src ~/svn.icutools/trunk/src |
michael@0 | 53 | - This writes files (especially ppucd.txt) to the ICU4C unidata and testdata subfolders. |
michael@0 | 54 | - Check test file diffs for previously commented-out, known-failing data lines; |
michael@0 | 55 | probably need to keep those commented out. |
michael@0 | 56 | |
michael@0 | 57 | * PropertyAliases.txt changes |
michael@0 | 58 | - 1 new Enumerated Property |
michael@0 | 59 | bpt ; Bidi_Paired_Bracket_Type |
michael@0 | 60 | -> uchar.h & UProperty.java & UCharacter.BidiPairedBracketType |
michael@0 | 61 | -> ubidi_props.h & .c & UBiDiProps.java |
michael@0 | 62 | -> remember to write the max value at UBIDI_MAX_VALUES_INDEX |
michael@0 | 63 | -> uprops.cpp |
michael@0 | 64 | -> change ubidi.icu format version from 2.0 to 2.1 |
michael@0 | 65 | - 1 new Miscellaneous Property |
michael@0 | 66 | bpb ; Bidi_Paired_Bracket |
michael@0 | 67 | -> uchar.h & UProperty.java |
michael@0 | 68 | -> ppucd.h & .cpp |
michael@0 | 69 | |
michael@0 | 70 | * PropertyValueAliases.txt changes |
michael@0 | 71 | - 3 Bidi_Paired_Bracket_Type (bpt) values: |
michael@0 | 72 | bpt; c ; Close |
michael@0 | 73 | bpt; n ; None |
michael@0 | 74 | bpt; o ; Open |
michael@0 | 75 | -> uchar.h & UCharacter.BidiPairedBracketType |
michael@0 | 76 | -> ubidi_props.h & .c & UBiDiProps.java |
michael@0 | 77 | -> change ubidi.icu format version from 2.0 to 2.1 |
michael@0 | 78 | - 4 new Bidi_Class (bc) values: |
michael@0 | 79 | bc ; FSI ; First_Strong_Isolate |
michael@0 | 80 | bc ; LRI ; Left_To_Right_Isolate |
michael@0 | 81 | bc ; RLI ; Right_To_Left_Isolate |
michael@0 | 82 | bc ; PDI ; Pop_Directional_Isolate |
michael@0 | 83 | -> uchar.h & UCharacterEnums.ECharacterDirection |
michael@0 | 84 | -> until the bidi code gets updated, |
michael@0 | 85 | Roozbeh suggests mapping the new bc values to ON (Other_Neutral) |
michael@0 | 86 | - 3 new Word_Break (WB) values: |
michael@0 | 87 | WB ; HL ; Hebrew_Letter |
michael@0 | 88 | WB ; SQ ; Single_Quote |
michael@0 | 89 | WB ; DQ ; Double_Quote |
michael@0 | 90 | -> uchar.h & UCharacter.WordBreak |
michael@0 | 91 | -> first time Word_Break numeric constants exceed 4 bits (now 17 values) |
michael@0 | 92 | - 2 new script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html |
michael@0 | 93 | (added 2012-10-16) |
michael@0 | 94 | Aghb 239 Caucasian Albanian |
michael@0 | 95 | Mahj 314 Mahajani |
michael@0 | 96 | -> uscript.h |
michael@0 | 97 | -> com.ibm.icu.lang.UScript |
michael@0 | 98 | find USCRIPT_([^ ]+) *= ([0-9]+),(.+) |
michael@0 | 99 | replace public static final int \1 = \2;\3 |
michael@0 | 100 | -> preparseucd.py _scripts_only_in_iso15924 |
michael@0 | 101 | -> add to expectedLong and expectedShort names in cintltst/cucdapi.c/TestUScriptCodeAPI() |
michael@0 | 102 | and in com.ibm.icu.dev.test.lang.TestUScript.java |
michael@0 | 103 | -> update Script metadata: SCRIPT_PROPS[] in uscript_props.cpp & UScript.ScriptMetadata |
michael@0 | 104 | (not strictly necessary for NOT_ENCODED scripts) |
michael@0 | 105 | |
michael@0 | 106 | * generate normalization data files |
michael@0 | 107 | - ~/svn.icu/uni63/dbg$ export LD_LIBRARY_PATH=~/svn.icu/uni63/dbg/lib |
michael@0 | 108 | - ~/svn.icu/uni63/dbg$ SRC_DATA_IN=~/svn.icu/uni63/src/source/data/in |
michael@0 | 109 | - ~/svn.icu/uni63/dbg$ UNIDATA=~/svn.icu/uni63/src/source/data/unidata |
michael@0 | 110 | - ~/svn.icu/uni63/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfc.nrm -s $UNIDATA/norm2 nfc.txt |
michael@0 | 111 | - ~/svn.icu/uni63/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfkc.nrm -s $UNIDATA/norm2 nfc.txt nfkc.txt |
michael@0 | 112 | - ~/svn.icu/uni63/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfkc_cf.nrm -s $UNIDATA/norm2 nfc.txt nfkc.txt nfkc_cf.txt |
michael@0 | 113 | - ~/svn.icu/uni63/dbg$ bin/gennorm2 -o $SRC_DATA_IN/uts46.nrm -s $UNIDATA/norm2 nfc.txt uts46.txt |
michael@0 | 114 | |
michael@0 | 115 | * build ICU (make install) |
michael@0 | 116 | so that the tools build can pick up the new definitions from the installed header files. |
michael@0 | 117 | |
michael@0 | 118 | ~/svn.icu/uni63/dbg$ echo;echo;make -j5 install > out.txt 2>&1 ; tail -n 20 out.txt |
michael@0 | 119 | |
michael@0 | 120 | * build Unicode tools using CMake+make |
michael@0 | 121 | |
michael@0 | 122 | ~/svn.icutools/trunk/src/unicode/c/icudefs.txt: |
michael@0 | 123 | |
michael@0 | 124 | # Location (--prefix) of where ICU was installed. |
michael@0 | 125 | set(ICU_INST_DIR /home/mscherer/svn.icu/uni63/inst) |
michael@0 | 126 | # Location of the ICU source tree. |
michael@0 | 127 | set(ICU_SRC_DIR /home/mscherer/svn.icu/uni63/src) |
michael@0 | 128 | |
michael@0 | 129 | ~/svn.icutools/trunk/dbg/unicode/c$ cmake ../../../src/unicode/c |
michael@0 | 130 | ~/svn.icutools/trunk/dbg/unicode/c$ make |
michael@0 | 131 | |
michael@0 | 132 | * generate core properties data files |
michael@0 | 133 | - ~/svn.icutools/trunk/dbg/unicode/c$ genprops/genprops ~/svn.icu/uni63/src |
michael@0 | 134 | - ~/svn.icutools/trunk/dbg/unicode/c$ genuca/genuca -i ~/svn.icu/uni63/dbg/data/out/build/icudt52l ~/svn.icu/uni63/src |
michael@0 | 135 | - rebuild ICU (make install) & tools |
michael@0 | 136 | - run genuca again (see step above) so that it picks up the new case mappings and nfc.nrm |
michael@0 | 137 | - rebuild ICU (make install) & tools |
michael@0 | 138 | |
michael@0 | 139 | * update uts46test.cpp and UTS46Test.java if there are new characters that are equivalent to |
michael@0 | 140 | sequences with non-LDH ASCII (that is, their decompositions contain '=' or similar) |
michael@0 | 141 | - grep IdnaMappingTable.txt or uts46.txt for "disallowed_STD3_valid" on non-ASCII characters |
michael@0 | 142 | - Unicode 6.0..6.3: U+2260, U+226E, U+226F |
michael@0 | 143 | - nothing new in 6.3, no test file to update |
michael@0 | 144 | |
michael@0 | 145 | * update Java data files |
michael@0 | 146 | - refresh just the UCD-related files, just to be safe |
michael@0 | 147 | - see (ICU4C)/source/data/icu4j-readme.txt |
michael@0 | 148 | - mkdir /tmp/icu4j |
michael@0 | 149 | - ~/svn.icu/uni63/dbg$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install |
michael@0 | 150 | output: |
michael@0 | 151 | ... |
michael@0 | 152 | Unicode .icu files built to ./out/build/icudt52l |
michael@0 | 153 | mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt52b |
michael@0 | 154 | mkdir -p ./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt52b |
michael@0 | 155 | echo pnames.icu ubidi.icu ucase.icu uprops.icu > ./out/icu4j/add.txt |
michael@0 | 156 | LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt52l.dat ./out/icu4j/icudt52b.dat -a ./out/icu4j/add.txt -s ./out/build/icudt52l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt52b |
michael@0 | 157 | mv ./out/icu4j/"com/ibm/icu/impl/data/icudt52b/zoneinfo64.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt52b/metaZones.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt52b/timezoneTypes.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt52b/windowsZones.res" "./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt52b" |
michael@0 | 158 | jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt52b/ |
michael@0 | 159 | mkdir -p /tmp/icu4j/main/shared/data |
michael@0 | 160 | cp ./out/icu4j/icudata.jar /tmp/icu4j/main/shared/data |
michael@0 | 161 | jar cf ./out/icu4j/icutzdata.jar -C ./out/icu4j/tzdata com/ibm/icu/impl/data/icudt52b/ |
michael@0 | 162 | mkdir -p /tmp/icu4j/main/shared/data |
michael@0 | 163 | cp ./out/icu4j/icutzdata.jar /tmp/icu4j/main/shared/data |
michael@0 | 164 | make[1]: Leaving directory `/home/mscherer/svn.icu/uni63/dbg/data' |
michael@0 | 165 | - copy the big-endian Unicode data files to another location, |
michael@0 | 166 | separate from the other data files |
michael@0 | 167 | mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt52b/coll |
michael@0 | 168 | mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt52b/brkitr |
michael@0 | 169 | ~/svn.icu/uni63/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt52b/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt52b |
michael@0 | 170 | ~/svn.icu/uni63/dbg/data/out/icu4j$ rm /tmp/icu4j/com/ibm/icu/impl/data/icudt52b/cnvalias.icu |
michael@0 | 171 | ~/svn.icu/uni63/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt52b/*.nrm /tmp/icu4j/com/ibm/icu/impl/data/icudt52b |
michael@0 | 172 | ~/svn.icu/uni63/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt52b/coll/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt52b/coll |
michael@0 | 173 | ~/svn.icu/uni63/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt52b/brkitr/* /tmp/icu4j/com/ibm/icu/impl/data/icudt52b/brkitr |
michael@0 | 174 | - refresh ICU4J |
michael@0 | 175 | ~/svn.icu/uni63/dbg/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt52b |
michael@0 | 176 | |
michael@0 | 177 | * refresh Java test .txt files |
michael@0 | 178 | - copy new .txt files into ICU4J's main/tests/core/src/com/ibm/icu/dev/data/unicode |
michael@0 | 179 | |
michael@0 | 180 | * UCA -- mostly skipped for ICU 52 / Unicode 6.3, except update coll/* files |
michael@0 | 181 | |
michael@0 | 182 | - get output from Mark's tools; look in http://www.unicode.org/Public/UCA/<beta version>/ |
michael@0 | 183 | - CLDR root files for ICU are in CollationAuxiliary.zip; unpack that |
michael@0 | 184 | - update source/data/unidata/FractionalUCA.txt with FractionalUCA_SHORT.txt |
michael@0 | 185 | - update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt |
michael@0 | 186 | (note removing the underscore before "Rules") |
michael@0 | 187 | - update (ICU4C)/source/test/testdata/CollationTest_*.txt |
michael@0 | 188 | and (ICU4J)/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_*.txt |
michael@0 | 189 | with output from Mark's Unicode tools (..._CLDR_..._SHORT.txt) |
michael@0 | 190 | - check test file diffs for previously commented-out, known-failing data lines; |
michael@0 | 191 | probably need to keep those commented out |
michael@0 | 192 | - check FractionalUCA.txt for manual changes of lead bytes from IMPLICIT to Hani |
michael@0 | 193 | - run genuca, see command line above |
michael@0 | 194 | - rebuild ICU4C |
michael@0 | 195 | - refresh ICU4J collation data: |
michael@0 | 196 | (subset of instructions above for properties data refresh, except copies all coll/*) |
michael@0 | 197 | ~/svn.icu/uni63/dbg$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install |
michael@0 | 198 | ~/svn.icu/uni63/dbg$ mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt52b/coll |
michael@0 | 199 | ~/svn.icu/uni63/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt52b/coll/* /tmp/icu4j/com/ibm/icu/impl/data/icudt52b/coll |
michael@0 | 200 | ~/svn.icu/uni63/dbg/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt52b |
michael@0 | 201 | - run all tests with the *_SHORT.txt or the full files (the full ones have comments, useful for debugging) |
michael@0 | 202 | - note on intltest: if collate/UCAConformanceTest fails, then |
michael@0 | 203 | utility/MultithreadTest/TestCollators will fail as well; |
michael@0 | 204 | fix the conformance test before looking into the multi-thread test |
michael@0 | 205 | |
michael@0 | 206 | * test ICU, fix test code where necessary |
michael@0 | 207 | |
michael@0 | 208 | * When refreshing all of ICU4J data from ICU4C |
michael@0 | 209 | - ~/svn.icu/uni63/dbg$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install |
michael@0 | 210 | - cp /tmp/icu4j/main/shared/data/icudata.jar ~/svn.icu4j/trunk/src/main/shared/data |
michael@0 | 211 | or |
michael@0 | 212 | - ~/svn.icu/uni63/dbg$ make ICU4J_ROOT=~/svn.icu4j/trunk/src icu4j-data-install |
michael@0 | 213 | |
michael@0 | 214 | *** LayoutEngine script information |
michael@0 | 215 | - skipped for Unicode 6.3: no new scripts |
michael@0 | 216 | |
michael@0 | 217 | *** merge the Unicode update branches back onto the trunk |
michael@0 | 218 | - do not merge the icudata.jar and testdata.jar, |
michael@0 | 219 | instead rebuild them from merged & tested ICU4C |
michael@0 | 220 | |
michael@0 | 221 | ---------------------------------------------------------------------------- *** |
michael@0 | 222 | |
michael@0 | 223 | Unicode 6.2 update |
michael@0 | 224 | |
michael@0 | 225 | http://www.unicode.org/review/pri230/ |
michael@0 | 226 | http://www.unicode.org/versions/beta-6.2.0.html |
michael@0 | 227 | http://www.unicode.org/reports/tr44/tr44-9.html#Unicode_6.2.0 |
michael@0 | 228 | http://www.unicode.org/review/pri227/ Changes to Script Extensions Property Values |
michael@0 | 229 | http://www.unicode.org/review/pri228/ Changing some common characters from Punctuation to Symbol |
michael@0 | 230 | http://www.unicode.org/review/pri229/ Linebreaking Changes for Pictographic Symbols |
michael@0 | 231 | http://www.unicode.org/reports/tr46/tr46-8.html IDNA |
michael@0 | 232 | http://unicode.org/Public/idna/6.2.0/ |
michael@0 | 233 | |
michael@0 | 234 | *** ICU Trac |
michael@0 | 235 | |
michael@0 | 236 | - ticket 9515: Unicode 6.2: final ICU update |
michael@0 | 237 | |
michael@0 | 238 | - ticket 9514: UCA 6.2: fix UCARules.txt |
michael@0 | 239 | |
michael@0 | 240 | - ticket 9437: update ICU to Unicode 6.2 |
michael@0 | 241 | - C++ branches/markus/uni62 at r32050 from trunk at r32041 |
michael@0 | 242 | - Java branches/markus/uni62 at r32068 from trunk at r32066 |
michael@0 | 243 | |
michael@0 | 244 | *** Unicode version numbers |
michael@0 | 245 | - makedata.mak |
michael@0 | 246 | - uchar.h |
michael@0 | 247 | (configure.in & configure have been modified to extract the version from uchar.h) |
michael@0 | 248 | - com.ibm.icu.util.VersionInfo |
michael@0 | 249 | - com.ibm.icu.dev.test.lang.UCharacterTest.VERSION_ |
michael@0 | 250 | |
michael@0 | 251 | *** data files & enums & parser code |
michael@0 | 252 | |
michael@0 | 253 | * file preparation |
michael@0 | 254 | |
michael@0 | 255 | - download UCD, UCA & IDNA files |
michael@0 | 256 | - make sure that the Unicode data folder passed into preparseucd.py |
michael@0 | 257 | includes a copy of the latest IdnaMappingTable.txt (can be in some subfolder) |
michael@0 | 258 | - modify preparseucd.py: NamesList.txt is now in UTF-8 |
michael@0 | 259 | - ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni62/20120816 ~/svn.icu/uni62/src ~/svn.icu/tools/trunk/src |
michael@0 | 260 | - This writes files (especially ppucd.txt) to the ICU4C unidata and testdata subfolders. |
michael@0 | 261 | - Check test file diffs for previously commented-out, known-failing data lines; |
michael@0 | 262 | probably need to keep those commented out. |
michael@0 | 263 | |
michael@0 | 264 | * PropertyValueAliases.txt changes |
michael@0 | 265 | - 1 new Line_Break (lb) value: |
michael@0 | 266 | lb ; RI ; Regional_Indicator |
michael@0 | 267 | -> uchar.h & UCharacter.LineBreak |
michael@0 | 268 | - 1 new Word_Break (WB) value: |
michael@0 | 269 | WB ; RI ; Regional_Indicator |
michael@0 | 270 | -> uchar.h & UCharacter.WordBreak |
michael@0 | 271 | - 1 new Grapheme_Cluster_Break (GCB) value: |
michael@0 | 272 | GCB; RI ; Regional_Indicator |
michael@0 | 273 | -> uchar.h & UCharacter.GraphemeClusterBreak |
michael@0 | 274 | |
michael@0 | 275 | * 3 new numeric values |
michael@0 | 276 | The new value -1 (really meant to be NaN, but that would have required |
michael@0 | 277 | new UnicodeData.txt syntax) can already be represented as a "fraction" of -1/1, |
michael@0 | 278 | but encodeNumericValue() in corepropsbuilder.cpp had to be fixed. |
michael@0 | 279 | cp;12456;na=CUNEIFORM NUMERIC SIGN NIGIDAMIN;nv=-1 |
michael@0 | 280 | cp;12457;na=CUNEIFORM NUMERIC SIGN NIGIDAESH;nv=-1 |
michael@0 | 281 | The two new values 216000 and 432000 require an addition to the encoding of numeric values. |
michael@0 | 282 | cp;12432;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH;nv=216000 |
michael@0 | 283 | cp;12433;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN;nv=432000 |
michael@0 | 284 | -> uprops.h, uchar.c & UCharacterProperty.java |
michael@0 | 285 | -> cucdtst.c & UCharacterTest.java |
michael@0 | 286 | |
michael@0 | 287 | * generate normalization data files |
michael@0 | 288 | - ~/svn.icu/uni62/dbg$ export LD_LIBRARY_PATH=~/svn.icu/uni62/dbg/lib |
michael@0 | 289 | - ~/svn.icu/uni62/dbg$ SRC_DATA_IN=~/svn.icu/uni62/src/source/data/in |
michael@0 | 290 | - ~/svn.icu/uni62/dbg$ UNIDATA=~/svn.icu/uni62/src/source/data/unidata |
michael@0 | 291 | - ~/svn.icu/uni62/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfc.nrm -s $UNIDATA/norm2 nfc.txt |
michael@0 | 292 | - ~/svn.icu/uni62/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfkc.nrm -s $UNIDATA/norm2 nfc.txt nfkc.txt |
michael@0 | 293 | - ~/svn.icu/uni62/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfkc_cf.nrm -s $UNIDATA/norm2 nfc.txt nfkc.txt nfkc_cf.txt |
michael@0 | 294 | - ~/svn.icu/uni62/dbg$ bin/gennorm2 -o $SRC_DATA_IN/uts46.nrm -s $UNIDATA/norm2 nfc.txt uts46.txt |
michael@0 | 295 | |
michael@0 | 296 | * build ICU (make install) |
michael@0 | 297 | so that the tools build can pick up the new definitions from the installed header files. |
michael@0 | 298 | * build Unicode tools using CMake+make |
michael@0 | 299 | |
michael@0 | 300 | * generate core properties data files |
michael@0 | 301 | - ~/svn.icu/tools/trunk/dbg/unicode$ c/genprops/genprops ~/svn.icu/uni62/src |
michael@0 | 302 | - in initial bootstrapping, change the UCA version |
michael@0 | 303 | in source/data/unidata/FractionalUCA.txt to match the new Unicode version |
michael@0 | 304 | - ~/svn.icu/tools/trunk/dbg/unicode$ c/genuca/genuca -i ~/svn.icu/uni62/dbg/data/out/build/icudt50l ~/svn.icu/uni62/src |
michael@0 | 305 | - rebuild ICU (make install) & tools |
michael@0 | 306 | + if genrb fails to build coll/root.res with a U_INVALID_FORMAT_ERROR, |
michael@0 | 307 | check if the UCA version in FractionalUCA.txt matches the new Unicode version |
michael@0 | 308 | (see step above) |
michael@0 | 309 | - run genuca again (see step above) so that it picks up the new case mappings and nfc.nrm |
michael@0 | 310 | - rebuild ICU (make install) & tools |
michael@0 | 311 | |
michael@0 | 312 | * update uts46test.cpp and UTS46Test.java if there are new characters that are equivalent to |
michael@0 | 313 | sequences with non-LDH ASCII (that is, their decompositions contain '=' or similar) |
michael@0 | 314 | - grep IdnaMappingTable.txt or uts46.txt for "disallowed_STD3_valid" on non-ASCII characters |
michael@0 | 315 | - Unicode 6.0..6.2: U+2260, U+226E, U+226F |
michael@0 | 316 | - nothing new in 6.2, no test file to update |
michael@0 | 317 | |
michael@0 | 318 | * update Java data files |
michael@0 | 319 | - refresh just the UCD-related files, just to be safe |
michael@0 | 320 | - see (ICU4C)/source/data/icu4j-readme.txt |
michael@0 | 321 | - mkdir /tmp/icu4j |
michael@0 | 322 | - ~/svn.icu/uni62/dbg$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install |
michael@0 | 323 | output: |
michael@0 | 324 | ... |
michael@0 | 325 | Unicode .icu files built to ./out/build/icudt50l |
michael@0 | 326 | mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt50b |
michael@0 | 327 | mkdir -p ./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt50b |
michael@0 | 328 | echo pnames.icu ubidi.icu ucase.icu uprops.icu > ./out/icu4j/add.txt |
michael@0 | 329 | LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt50l.dat ./out/icu4j/icudt50b.dat -a ./out/icu4j/add.txt -s ./out/build/icudt50l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt50b |
michael@0 | 330 | mv ./out/icu4j/"com/ibm/icu/impl/data/icudt50b/zoneinfo64.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt50b/metaZones.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt50b/timezoneTypes.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt50b/windowsZones.res" "./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt50b" |
michael@0 | 331 | jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt50b/ |
michael@0 | 332 | mkdir -p /tmp/icu4j/main/shared/data |
michael@0 | 333 | cp ./out/icu4j/icudata.jar /tmp/icu4j/main/shared/data |
michael@0 | 334 | jar cf ./out/icu4j/icutzdata.jar -C ./out/icu4j/tzdata com/ibm/icu/impl/data/icudt50b/ |
michael@0 | 335 | mkdir -p /tmp/icu4j/main/shared/data |
michael@0 | 336 | cp ./out/icu4j/icutzdata.jar /tmp/icu4j/main/shared/data |
michael@0 | 337 | make[1]: Leaving directory `/home/mscherer/svn.icu/uni62/dbg/data' |
michael@0 | 338 | - copy the big-endian Unicode data files to another location, |
michael@0 | 339 | separate from the other data files |
michael@0 | 340 | mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/coll |
michael@0 | 341 | mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/brkitr |
michael@0 | 342 | ~/svn.icu/uni62/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt50b/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt50b |
michael@0 | 343 | ~/svn.icu/uni62/dbg/data/out/icu4j$ rm /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/cnvalias.icu |
michael@0 | 344 | ~/svn.icu/uni62/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt50b/*.nrm /tmp/icu4j/com/ibm/icu/impl/data/icudt50b |
michael@0 | 345 | ~/svn.icu/uni62/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt50b/coll/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/coll |
michael@0 | 346 | ~/svn.icu/uni62/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt50b/brkitr/* /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/brkitr |
michael@0 | 347 | - refresh ICU4J |
michael@0 | 348 | ~/svn.icu/uni62/dbg/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt50b |
michael@0 | 349 | |
michael@0 | 350 | * refresh Java test .txt files |
michael@0 | 351 | - copy new .txt files into ICU4J's main/tests/core/src/com/ibm/icu/dev/data/unicode |
michael@0 | 352 | |
michael@0 | 353 | * UCA |
michael@0 | 354 | |
michael@0 | 355 | - get output from Mark's tools; look in http://www.unicode.org/Public/UCA/<beta version>/ |
michael@0 | 356 | - CLDR root files for ICU are in CollationAuxiliary.zip; unpack that |
michael@0 | 357 | - update source/data/unidata/FractionalUCA.txt with FractionalUCA_SHORT.txt |
michael@0 | 358 | - update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt |
michael@0 | 359 | (note removing the underscore before "Rules") |
michael@0 | 360 | - update (ICU4C)/source/test/testdata/CollationTest_*.txt |
michael@0 | 361 | and (ICU4J)/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_*.txt |
michael@0 | 362 | with output from Mark's Unicode tools (..._CLDR_..._SHORT.txt) |
michael@0 | 363 | - check test file diffs for previously commented-out, known-failing data lines; |
michael@0 | 364 | probably need to keep those commented out |
michael@0 | 365 | - check FractionalUCA.txt for manual changes of lead bytes from IMPLICIT to Hani |
michael@0 | 366 | - run genuca, see command line above |
michael@0 | 367 | - rebuild ICU4C |
michael@0 | 368 | - refresh ICU4J collation data: |
michael@0 | 369 | (subset of instructions above for properties data refresh, except copies all coll/*) |
michael@0 | 370 | ~/svn.icu/uni62/dbg$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install |
michael@0 | 371 | ~/svn.icu/uni62/dbg$ mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/coll |
michael@0 | 372 | ~/svn.icu/uni62/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt50b/coll/* /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/coll |
michael@0 | 373 | ~/svn.icu/uni62/dbg/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt50b |
michael@0 | 374 | - run all tests with the *_SHORT.txt or the full files (the full ones have comments, useful for debugging) |
michael@0 | 375 | - note on intltest: if collate/UCAConformanceTest fails, then |
michael@0 | 376 | utility/MultithreadTest/TestCollators will fail as well; |
michael@0 | 377 | fix the conformance test before looking into the multi-thread test |
michael@0 | 378 | |
michael@0 | 379 | * test ICU, fix test code where necessary |
michael@0 | 380 | |
michael@0 | 381 | * When refreshing all of ICU4J data from ICU4C |
michael@0 | 382 | - ~/svn.icu/uni62/dbg$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install |
michael@0 | 383 | - cp /tmp/icu4j/main/shared/data/icudata.jar ~/svn.icu4j/trunk/src/main/shared/data |
michael@0 | 384 | or |
michael@0 | 385 | - ~/svn.icu/uni62/dbg$ make ICU4J_ROOT=~/svn.icu4j/trunk/src icu4j-data-install |
michael@0 | 386 | |
michael@0 | 387 | *** LayoutEngine script information |
michael@0 | 388 | - skipped for Unicode 6.2: no new scripts |
michael@0 | 389 | |
michael@0 | 390 | *** merge the Unicode update branches back onto the trunk |
michael@0 | 391 | - do not merge the icudata.jar and testdata.jar, |
michael@0 | 392 | instead rebuild them from merged & tested ICU4C |
michael@0 | 393 | |
michael@0 | 394 | ---------------------------------------------------------------------------- *** |
michael@0 | 395 | |
michael@0 | 396 | Future Unicode update |
michael@0 | 397 | |
michael@0 | 398 | Tools simplified since the Unicode 6.1 update. See |
michael@0 | 399 | - http://site.icu-project.org/design/props/ppucd |
michael@0 | 400 | - http://bugs.icu-project.org/trac/wiki/Markus/ReviewTicket8972 |
michael@0 | 401 | |
michael@0 | 402 | * Unicode version numbers |
michael@0 | 403 | - icutools/unicode/makedefs.sh was deleted, so one fewer place for version & path updates |
michael@0 | 404 | |
michael@0 | 405 | * file preparation |
michael@0 | 406 | - ucdcopy.py, idna2nrm.py and genpname/preparse.pl replaced by preparseucd.py: |
michael@0 | 407 | - ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src ~/svn.icu/tools/trunk/src |
michael@0 | 408 | - This writes files (especially ppucd.txt) to the ICU4C unidata and testdata subfolders. |
michael@0 | 409 | - Check test file diffs for previously commented-out, known-failing data lines; |
michael@0 | 410 | probably need to keep those commented out. |
michael@0 | 411 | |
michael@0 | 412 | * PropertyValueAliases.txt changes |
michael@0 | 413 | - Script codes that are in ISO 15924 but not in Unicode are now listed in |
michael@0 | 414 | preparseucd.py, in the _scripts_only_in_iso15924 variable. |
michael@0 | 415 | If there are new ISO codes, then add them. |
michael@0 | 416 | If Unicode adds some of them, then remove them from the .py variable. |
michael@0 | 417 | |
michael@0 | 418 | * UnicodeData.txt changes |
michael@0 | 419 | - No more manual changes for CJK ranges for algorithmic names; |
michael@0 | 420 | those are now written to ppucd.txt and genprops reads them from there. |
michael@0 | 421 | |
michael@0 | 422 | * generate core properties data files (makeprops.sh was deleted) |
michael@0 | 423 | - ~/svn.icu/tools/trunk/dbg/unicode$ c/genprops/genprops ~/svn.icu/trunk/src |
michael@0 | 424 | |
michael@0 | 425 | * no more manual updates of source/data/unidata/norm2/nfkc_cf.txt |
michael@0 | 426 | - it is now generated by preparseucd.py |
michael@0 | 427 | |
michael@0 | 428 | * no more separate idna2nrm.py run and manual copying to generate source/data/unidata/norm2/uts46.txt |
michael@0 | 429 | - it is now generated by preparseucd.py |
michael@0 | 430 | - make sure that the Unicode data folder passed into preparseucd.py |
michael@0 | 431 | includes a copy of http://www.unicode.org/Public/idna/6.1.0/IdnaMappingTable.txt |
michael@0 | 432 | (can be in some subfolder) |
michael@0 | 433 | |
michael@0 | 434 | * generate normalization data files |
michael@0 | 435 | - ~/svn.icu/trunk/dbg$ export LD_LIBRARY_PATH=~/svn.icu/trunk/dbg/lib |
michael@0 | 436 | - ~/svn.icu/trunk/dbg$ SRC_DATA_IN=~/svn.icu/trunk/src/source/data/in |
michael@0 | 437 | - ~/svn.icu/trunk/dbg$ UNIDATA=~/svn.icu/trunk/src/source/data/unidata |
michael@0 | 438 | - ~/svn.icu/trunk/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfc.nrm -s $UNIDATA/norm2 nfc.txt |
michael@0 | 439 | - ~/svn.icu/trunk/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfkc.nrm -s $UNIDATA/norm2 nfc.txt nfkc.txt |
michael@0 | 440 | - ~/svn.icu/trunk/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfkc_cf.nrm -s $UNIDATA/norm2 nfc.txt nfkc.txt nfkc_cf.txt |
michael@0 | 441 | - ~/svn.icu/trunk/dbg$ bin/gennorm2 -o $SRC_DATA_IN/uts46.nrm -s $UNIDATA/norm2 nfc.txt uts46.txt |
michael@0 | 442 | |
michael@0 | 443 | * build ICU (make install) |
michael@0 | 444 | * build Unicode tools using CMake+make |
michael@0 | 445 | |
michael@0 | 446 | * new way to call genuca (makeuca.sh was deleted) |
michael@0 | 447 | - ~/svn.icu/tools/trunk/dbg/unicode$ c/genuca/genuca -i ~/svn.icu/trunk/dbg/data/out/build/icudt49l ~/svn.icu/trunk/src |
michael@0 | 448 | |
michael@0 | 449 | ---------------------------------------------------------------------------- *** |
michael@0 | 450 | |
michael@0 | 451 | Unicode 6.1 update |
michael@0 | 452 | |
michael@0 | 453 | *** ICU Trac |
michael@0 | 454 | |
michael@0 | 455 | - ticket 8995 final update to Unicode 6.1 |
michael@0 | 456 | - ticket 8994 regenerate source/layout/CanonData.cpp |
michael@0 | 457 | |
michael@0 | 458 | - ticket 8961 support Unicode "Age" value *names* |
michael@0 | 459 | - ticket 8963 support multiple character name aliases & types |
michael@0 | 460 | |
michael@0 | 461 | - ticket 8827 "update ICU to Unicode 6.1" |
michael@0 | 462 | - C++ branches/markus/uni61 at r30864 from trunk at r30843 |
michael@0 | 463 | - Java branches/markus/uni61 at r30865 from trunk at r30863 |
michael@0 | 464 | |
michael@0 | 465 | *** Unicode version numbers |
michael@0 | 466 | - makedata.mak |
michael@0 | 467 | - uchar.h |
michael@0 | 468 | (configure.in & configure: have been modified to extract the version from uchar.h) |
michael@0 | 469 | - com.ibm.icu.util.VersionInfo |
michael@0 | 470 | - icutools/unicode/makedefs.sh |
michael@0 | 471 | + also review & update other definitions in that file, |
michael@0 | 472 | e.g. the ICU version in this path: BLD_DATA_FILES=$ICU_BLD/data/out/build/icudt49l |
michael@0 | 473 | |
michael@0 | 474 | *** data files & enums & parser code |
michael@0 | 475 | |
michael@0 | 476 | * file preparation |
michael@0 | 477 | |
michael@0 | 478 | ~/svn.icu/tools/trunk/src/unicode/c/genprops/misc$ ./ucdcopy.py ~/uni61/20111205/ucd ~/uni61/processed |
michael@0 | 479 | - This prepares both unidata and testdata files in respective output subfolders. |
michael@0 | 480 | - Check test file diffs for previously commented-out, known-failing data lines; |
michael@0 | 481 | probably need to keep those commented out. |
michael@0 | 482 | |
michael@0 | 483 | * PropertyValueAliases.txt changes |
michael@0 | 484 | - 11 new block names: |
michael@0 | 485 | Arabic_Extended_A |
michael@0 | 486 | Arabic_Mathematical_Alphabetic_Symbols |
michael@0 | 487 | Chakma |
michael@0 | 488 | Meetei_Mayek_Extensions |
michael@0 | 489 | Meroitic_Cursive |
michael@0 | 490 | Meroitic_Hieroglyphs |
michael@0 | 491 | Miao |
michael@0 | 492 | Sharada |
michael@0 | 493 | Sora_Sompeng |
michael@0 | 494 | Sundanese_Supplement |
michael@0 | 495 | Takri |
michael@0 | 496 | -> add to uchar.h |
michael@0 | 497 | -> add to UCharacter.UnicodeBlock IDs |
michael@0 | 498 | Eclipse find UBLOCK_([^ ]+) = ([0-9]+), (/.+) |
michael@0 | 499 | replace public static final int \1_ID = \2; \3 |
michael@0 | 500 | -> add to UCharacter.UnicodeBlock objects |
michael@0 | 501 | Eclipse find UBLOCK_([^ ]+) = [0-9]+, (/.+) |
michael@0 | 502 | replace public static final UnicodeBlock \1 = new UnicodeBlock("\1", \1_ID); \2 |
michael@0 | 503 | - 1 new Joining_Group (jg) value: |
michael@0 | 504 | Rohingya_Yeh |
michael@0 | 505 | -> uchar.h & UCharacter.JoiningGroup |
michael@0 | 506 | - 2 new Line_Break (lb) values: |
michael@0 | 507 | CJ=Conditional_Japanese_Starter |
michael@0 | 508 | HL=Hebrew_Letter |
michael@0 | 509 | -> uchar.h & UCharacter.LineBreak |
michael@0 | 510 | - 7 new scripts: |
michael@0 | 511 | sc ; Cakm ; Chakma |
michael@0 | 512 | sc ; Merc ; Meroitic_Cursive |
michael@0 | 513 | sc ; Mero ; Meroitic_Hieroglyphs |
michael@0 | 514 | sc ; Plrd ; Miao |
michael@0 | 515 | sc ; Shrd ; Sharada |
michael@0 | 516 | sc ; Sora ; Sora_Sompeng |
michael@0 | 517 | sc ; Takr ; Takri |
michael@0 | 518 | -> remove these from SyntheticPropertyValueAliases.txt |
michael@0 | 519 | -> fix expectedLong names in cucdapi.c/TestUScriptCodeAPI() |
michael@0 | 520 | and in com.ibm.icu.dev.test.lang.TestUScript.java |
michael@0 | 521 | - 2 new script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html |
michael@0 | 522 | (added 2011-06-21) |
michael@0 | 523 | Khoj 322 Khojki |
michael@0 | 524 | Tirh 326 Tirhuta |
michael@0 | 525 | and another one added 2011-12-09 |
michael@0 | 526 | Hluw 080 Anatolian Hieroglyphs (Luwian Hieroglyphs, Hittite Hieroglyphs) |
michael@0 | 527 | -> uscript.h |
michael@0 | 528 | -> com.ibm.icu.lang.UScript |
michael@0 | 529 | find USCRIPT_([^ ]+) *= ([0-9]+),(.+) |
michael@0 | 530 | replace public static final int \1 = \2;\3 |
michael@0 | 531 | -> SyntheticPropertyValueAliases.txt |
michael@0 | 532 | -> add to expectedLong and expectedShort names in cintltst/cucdapi.c/TestUScriptCodeAPI() |
michael@0 | 533 | and in com.ibm.icu.dev.test.lang.TestUScript.java |
michael@0 | 534 | |
michael@0 | 535 | * UnicodeData.txt changes |
michael@0 | 536 | - the last Unihan code point changes from U+9FCB to U+9FCC |
michael@0 | 537 | search for both 9FCB (end) and 9FCC (limit) (regex 9FC[BC], case-insensitive) |
michael@0 | 538 | + do change gennames.c |
michael@0 | 539 | + do change swapCJK() in ucol.cpp & ImplicitCEGenerator.java |
michael@0 | 540 | |
michael@0 | 541 | * DerivedBidiClass.txt changes |
michael@0 | 542 | - 2 new default-AL blocks: |
michael@0 | 543 | # Arabic Extended-A: U+08A0 - U+08FF (was default-R) |
michael@0 | 544 | # Arabic Mathematical Alphabetic Symbols: |
michael@0 | 545 | # U+1EE00 - U+1EEFF (was default-R) |
michael@0 | 546 | - 2 new default-R blocks: |
michael@0 | 547 | # Meroitic Hieroglyphs: |
michael@0 | 548 | # U+10980 - U+1099F |
michael@0 | 549 | # Meroitic Cursive: U+109A0 - U+109FF |
michael@0 | 550 | -> should be picked up by the explicit data in the file |
michael@0 | 551 | |
michael@0 | 552 | * NameAliases.txt changes |
michael@0 | 553 | - from |
michael@0 | 554 | # Each line has two fields |
michael@0 | 555 | # First field: Code point |
michael@0 | 556 | # Second field: Alias |
michael@0 | 557 | - to |
michael@0 | 558 | # Each line has three fields, as described here: |
michael@0 | 559 | # |
michael@0 | 560 | # First field: Code point |
michael@0 | 561 | # Second field: Alias |
michael@0 | 562 | # Third field: Type |
michael@0 | 563 | - Also, the file format previously allowed multiple aliases per code point, but only now |
michael@0 | 564 | does the file actually provide them, even several of the same type. For example, |
michael@0 | 565 | FEFF;BYTE ORDER MARK;alternate |
michael@0 | 566 | FEFF;BOM;abbreviation |
michael@0 | 567 | FEFF;ZWNBSP;abbreviation |
michael@0 | 568 | - This breaks our gennames parser, unames.icu data structure, and API. |
michael@0 | 569 | Fix gennames to only pick up "correction" aliases. |
michael@0 | 570 | New ticket #8963 for further changes. |
michael@0 | 571 | |
michael@0 | 572 | * run genpname/preparse.pl (on Linux) |
michael@0 | 573 | + cd ~/svn.icu/tools/trunk/src/unicode/c/genpname |
michael@0 | 574 | + make sure that data.h is writable |
michael@0 | 575 | + perl preparse.pl ~/svn.icu/trunk/src > out.txt |
michael@0 | 576 | + preparse.pl shows no errors, out.txt Info and Warning lines look ok |
michael@0 | 577 | |
michael@0 | 578 | * build ICU (make install) |
michael@0 | 579 | so that the tools build can pick up the new definitions from the installed header files. |
michael@0 | 580 | * build Unicode tools (at least genpname) using CMake+make |
michael@0 | 581 | |
michael@0 | 582 | * run genpname |
michael@0 | 583 | (builds both pnames.icu and propname_data.h) |
michael@0 | 584 | - ~/svn.icu/tools/trunk/bld/unicode/c$ genpname/genpname -v -d ~/svn.icu/trunk/src/source/data/in |
michael@0 | 585 | - ~/svn.icu/tools/trunk/bld/unicode/c$ genpname/genpname -v -d ~/svn.icu/trunk/src/source/common --csource |
michael@0 | 586 | |
michael@0 | 587 | * build ICU (make install) |
michael@0 | 588 | * build Unicode tools using CMake+make |
michael@0 | 589 | |
michael@0 | 590 | * update source/data/unidata/norm2/nfkc_cf.txt |
michael@0 | 591 | - follow the instructions in nfkc_cf.txt for updating it from DerivedNormalizationProps.txt |
michael@0 | 592 | |
michael@0 | 593 | * update source/data/unidata/norm2/uts46.txt |
michael@0 | 594 | - download http://www.unicode.org/Public/idna/6.1.0/IdnaMappingTable.txt |
michael@0 | 595 | to ~/svn.icu/tools/trunk/src/unicode/py |
michael@0 | 596 | - adjust idna2nrm.py to remove "; NV8": For UTS #46, we do not care about "not valid in IDNA2008". |
michael@0 | 597 | - ~/svn.icu/tools/trunk/src/unicode/py$ ./idna2nrm.py |
michael@0 | 598 | - ~/svn.icu/tools/trunk/src/unicode/py$ cp uts46.txt ~/svn.icu/trunk/src/source/data/unidata/norm2 |
michael@0 | 599 | |
michael@0 | 600 | * update uts46test.cpp and UTS46Test.java if there are new characters that are equivalent to |
michael@0 | 601 | sequences with non-LDH ASCII (that is, their decompositions contain '=' or similar) |
michael@0 | 602 | - grep IdnaMappingTable.txt or uts46.txt for "disallowed_STD3_valid" on non-ASCII characters |
michael@0 | 603 | - Unicode 6.0..6.1: U+2260, U+226E, U+226F |
michael@0 | 604 | - nothing new in 6.1, no test file to update |
michael@0 | 605 | |
michael@0 | 606 | * generate core properties data files |
michael@0 | 607 | - in initial bootstrapping, change the UCA version |
michael@0 | 608 | in source/data/unidata/FractionalUCA.txt to match the new Unicode version |
michael@0 | 609 | - ~/svn.icu/tools/trunk/src/unicode$ ./makeprops.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld |
michael@0 | 610 | - rebuild ICU & tools |
michael@0 | 611 | + if genrb fails to build coll/root.res with a U_INVALID_FORMAT_ERROR, |
michael@0 | 612 | check if the UCA version in FractionalUCA.txt matches the new Unicode version |
michael@0 | 613 | (see step above) |
michael@0 | 614 | - run makeuca.sh so that genuca picks up the new case mappings and nfc.nrm: |
michael@0 | 615 | ~/svn.icu/tools/trunk/src/unicode$ ./makeuca.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld |
michael@0 | 616 | - rebuild ICU & tools |
michael@0 | 617 | |
michael@0 | 618 | * update Java data files |
michael@0 | 619 | - refresh just the UCD-related files, just to be safe |
michael@0 | 620 | - see (ICU4C)/source/data/icu4j-readme.txt |
michael@0 | 621 | - mkdir /tmp/icu4j |
michael@0 | 622 | - ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install |
michael@0 | 623 | output: |
michael@0 | 624 | ... |
michael@0 | 625 | Unicode .icu files built to ./out/build/icudt49l |
michael@0 | 626 | mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt49b |
michael@0 | 627 | mkdir -p ./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt49b |
michael@0 | 628 | echo pnames.icu ubidi.icu ucase.icu uprops.icu > ./out/icu4j/add.txt |
michael@0 | 629 | LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt49l.dat ./out/icu4j/icudt49b.dat -a ./out/icu4j/add.txt -s ./out/build/icudt49l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt49b |
michael@0 | 630 | mv ./out/icu4j/"com/ibm/icu/impl/data/icudt49b/zoneinfo64.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt49b/metaZones.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt49b/timezoneTypes.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt49b/windowsZones.res" "./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt49b" |
michael@0 | 631 | jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt49b/ |
michael@0 | 632 | mkdir -p /tmp/icu4j/main/shared/data |
michael@0 | 633 | cp ./out/icu4j/icudata.jar /tmp/icu4j/main/shared/data |
michael@0 | 634 | jar cf ./out/icu4j/icutzdata.jar -C ./out/icu4j/tzdata com/ibm/icu/impl/data/icudt49b/ |
michael@0 | 635 | mkdir -p /tmp/icu4j/main/shared/data |
michael@0 | 636 | cp ./out/icu4j/icutzdata.jar /tmp/icu4j/main/shared/data |
michael@0 | 637 | make[1]: Leaving directory `/home/mscherer/svn.icu/trunk/bld/data' |
michael@0 | 638 | - copy the big-endian Unicode data files to another location, |
michael@0 | 639 | separate from the other data files |
michael@0 | 640 | mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt49b/coll |
michael@0 | 641 | mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt49b/brkitr |
michael@0 | 642 | ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt49b/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt49b |
michael@0 | 643 | ~/svn.icu/trunk/bld/data/out/icu4j$ rm /tmp/icu4j/com/ibm/icu/impl/data/icudt49b/cnvalias.icu |
michael@0 | 644 | ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt49b/*.nrm /tmp/icu4j/com/ibm/icu/impl/data/icudt49b |
michael@0 | 645 | ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt49b/coll/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt49b/coll |
michael@0 | 646 | ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt49b/brkitr/* /tmp/icu4j/com/ibm/icu/impl/data/icudt49b/brkitr |
michael@0 | 647 | - refresh ICU4J |
michael@0 | 648 | ~/svn.icu/trunk/bld/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt49b |
michael@0 | 649 | |
michael@0 | 650 | * refresh Java test .txt files |
michael@0 | 651 | - copy new .txt files into ICU4J's main/tests/core/src/com/ibm/icu/dev/data/unicode |
michael@0 | 652 | |
michael@0 | 653 | * test ICU so far, fix test code where necessary |
michael@0 | 654 | - temporarily ignore collation issues that look like UCA/UCD mismatches, |
michael@0 | 655 | until UCA data is updated |
michael@0 | 656 | |
michael@0 | 657 | * UCA |
michael@0 | 658 | |
michael@0 | 659 | - get output from Mark's tools; look in |
michael@0 | 660 | http://www.unicode.org/Public/UCA/6.1.0/CollationAuxiliary-<dev. version>.txt |
michael@0 | 661 | - update source/data/unidata/FractionalUCA.txt with FractionalUCA_SHORT.txt |
michael@0 | 662 | - update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt |
michael@0 | 663 | (note that the target file name drops the underscore before "Rules") |
michael@0 | 664 | - update (ICU)/source/test/testdata/CollationTest_*.txt |
michael@0 | 665 | and (ICU4J)/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_*.txt |
michael@0 | 666 | with output from Mark's Unicode tools (..._CLDR_..._SHORT.txt) |
michael@0 | 667 | - check test file diffs for previously commented-out, known-failing data lines; |
michael@0 | 668 | probably need to keep those commented out |
michael@0 | 669 | - check FractionalUCA.txt for manual changes of lead bytes from IMPLICIT to Hani |
michael@0 | 670 | - run makeuca.sh: |
michael@0 | 671 | ~/svn.icu/tools/trunk/src/unicode$ ./makeuca.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld |
michael@0 | 672 | - rebuild ICU4C |
michael@0 | 673 | - refresh ICU4J collation data: |
michael@0 | 674 | (subset of instructions above for properties data refresh, except copies all coll/*) |
michael@0 | 675 | ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install |
michael@0 | 676 | ~/svn.icu/trunk/bld$ mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt49b/coll |
michael@0 | 677 | ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt49b/coll/* /tmp/icu4j/com/ibm/icu/impl/data/icudt49b/coll |
michael@0 | 678 | ~/svn.icu/trunk/bld/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt49b |
michael@0 | 679 | - run all tests with the *_SHORT.txt or the full files (the full ones have comments, useful for debugging) |
michael@0 | 680 | - note on intltest: if collate/UCAConformanceTest fails, then |
michael@0 | 681 | utility/MultithreadTest/TestCollators will fail as well; |
michael@0 | 682 | fix the conformance test before looking into the multi-thread test |
michael@0 | 683 | |
michael@0 | 684 | * When refreshing all of ICU4J data from ICU4C |
michael@0 | 685 | - ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install |
michael@0 | 686 | - cp /tmp/icu4j/main/shared/data/icudata.jar ~/svn.icu4j/trunk/src/main/shared/data |
michael@0 | 687 | or |
michael@0 | 688 | - ~/svn.icu/trunk/bld$ make ICU4J_ROOT=~/svn.icu4j/trunk/src icu4j-data-install |
michael@0 | 689 | |
michael@0 | 690 | *** LayoutEngine script information |
michael@0 | 691 | |
michael@0 | 692 | (For details see the Unicode 5.2 change log below.) |
michael@0 | 693 | |
michael@0 | 694 | * Run icu4j-tools: com.ibm.icu.dev.tool.layout.ScriptNameBuilder. |
michael@0 | 695 | This generates LEScripts.h, LELanguages.h, ScriptAndLanguageTags.h and ScriptAndLanguageTags.cpp |
michael@0 | 696 | in the working directory. |
michael@0 | 697 | (It also generates ScriptRunData.cpp, which is no longer needed.) |
michael@0 | 698 | |
michael@0 | 699 | The generated files have a current copyright date and "@draft" statement. |
michael@0 | 700 | |
michael@0 | 701 | - diff current <icu>/source/layout files vs. generated ones |
michael@0 | 702 | ~/svn.icu4j/trunk/src$ kdiff3 ~/svn.icu/trunk/src/source/layout tools/misc/src/com/ibm/icu/dev/tool/layout |
michael@0 | 703 | review and manually merge desired changes; |
michael@0 | 704 | fix gratuitous changes, incorrect @draft and missing aliases; |
michael@0 | 705 | Unicode-derived script codes should be "born stable" like constants in uchar.h, uscript.h etc. |
michael@0 | 706 | - if you just copy the above files, then |
michael@0 | 707 | fix mixed line endings, review the diffs as above and restore changes to API tags etc.; |
michael@0 | 708 | manually re-add the "Indic script xyz v.2" tags in ScriptAndLanguageTags.h |
michael@0 | 709 | |
michael@0 | 710 | *** merge the Unicode update branches back onto the trunk |
michael@0 | 711 | - do not merge the icudata.jar and testdata.jar, |
michael@0 | 712 | instead rebuild them from merged & tested ICU4C |
michael@0 | 713 | |
michael@0 | 714 | ---------------------------------------------------------------------------- *** |
michael@0 | 715 | |
michael@0 | 716 | ICU 4.8 (no Unicode update, just new script codes) |
michael@0 | 717 | |
michael@0 | 718 | * 9 new script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html |
michael@0 | 719 | (added 2010-12-21) |
michael@0 | 720 | Afak 439 Afaka |
michael@0 | 721 | Jurc 510 Jurchen |
michael@0 | 722 | Mroo 199 Mro, Mru |
michael@0 | 723 | Nshu 499 Nüshu |
michael@0 | 724 | Shrd 319 Sharada, Śāradā |
michael@0 | 725 | Sora 398 Sora Sompeng |
michael@0 | 726 | Takr 321 Takri, Ṭākrī, Ṭāṅkrī |
michael@0 | 727 | Tang 520 Tangut |
michael@0 | 728 | Wole 480 Woleai |
michael@0 | 729 | -> uscript.h |
michael@0 | 730 | -> com.ibm.icu.lang.UScript |
michael@0 | 731 | find USCRIPT_([^ ]+) *= ([0-9]+),(.+) |
michael@0 | 732 | replace public static final int \1 = \2;\3 |
michael@0 | 733 | -> genpname/SyntheticPropertyValueAliases.txt |
michael@0 | 734 | -> add to expectedLong and expectedShort names in cintltst/cucdapi.c/TestUScriptCodeAPI() |
michael@0 | 735 | and in com.ibm.icu.dev.test.lang.TestUScript.java |
michael@0 | 736 | |
michael@0 | 737 | * run genpname/preparse.pl (on Linux) |
michael@0 | 738 | + cd ~/svn.icu/tools/trunk/src/unicode/c/genpname |
michael@0 | 739 | + make sure that data.h is writable |
michael@0 | 740 | + perl preparse.pl ~/svn.icu/trunk/src > out.txt |
michael@0 | 741 | + preparse.pl shows no errors, out.txt Info and Warning lines look ok |
michael@0 | 742 | |
michael@0 | 743 | * rebuild Unicode tools (at least genpname) using make |
michael@0 | 744 | - You might first need to "make install" ICU so that the tools build can pick |
michael@0 | 745 | up the new definitions from the installed header files. |
michael@0 | 746 | |
michael@0 | 747 | * run genpname |
michael@0 | 748 | (builds both pnames.icu and propname_data.h) |
michael@0 | 749 | - ~/svn.icu/tools/trunk/bld/unicode/c$ genpname/genpname -v -d ~/svn.icu/trunk/src/source/data/in |
michael@0 | 750 | - ~/svn.icu/tools/trunk/bld/unicode/c$ genpname/genpname -v -d ~/svn.icu/trunk/src/source/common --csource |
michael@0 | 751 | - rebuild ICU & tools |
michael@0 | 752 | |
michael@0 | 753 | * run genprops |
michael@0 | 754 | - ~/svn.icu/tools/trunk/bld/unicode/c$ genprops/genprops -d ~/svn.icu/trunk/src/source/data/in -s ~/svn.icu/trunk/src/source/data/unidata -i ~/svn.icu/trunk/dbg/data/out/build/icudt48l -u 6.0 |
michael@0 | 755 | - ~/svn.icu/tools/trunk/bld/unicode/c$ genprops/genprops -d ~/svn.icu/trunk/src/source/common --csource -s ~/svn.icu/trunk/src/source/data/unidata -i ~/svn.icu/trunk/dbg/data/out/build/icudt48l -u 6.0 |
michael@0 | 756 | - rebuild ICU & tools |
michael@0 | 757 | |
michael@0 | 758 | * update Java data files |
michael@0 | 759 | - refresh just the UCD-related files, just to be safe |
michael@0 | 760 | - see (ICU4C)/source/data/icu4j-readme.txt |
michael@0 | 761 | - mkdir /tmp/icu4j |
michael@0 | 762 | - ~/svn.icu/trunk/dbg$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install |
michael@0 | 763 | - copy the big-endian Unicode data files to another location, |
michael@0 | 764 | separate from the other data files |
michael@0 | 765 | mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt48b |
michael@0 | 766 | ~/svn.icu/trunk/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt48b/pnames.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt48b |
michael@0 | 767 | ~/svn.icu/trunk/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt48b/uprops.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt48b |
michael@0 | 768 | - refresh ICU4J |
michael@0 | 769 | ~/svn.icu/trunk/dbg/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt48b |
michael@0 | 770 | |
michael@0 | 771 | * should have updated the layout engine script codes but forgot |
michael@0 | 772 | |
michael@0 | 773 | ---------------------------------------------------------------------------- *** |
michael@0 | 774 | |
michael@0 | 775 | Unicode 6.0 update |
michael@0 | 776 | |
michael@0 | 777 | *** related ICU Trac tickets |
michael@0 | 778 | |
michael@0 | 779 | 7264 Unicode 6.0 Update |
michael@0 | 780 | |
michael@0 | 781 | *** Unicode version numbers |
michael@0 | 782 | - makedata.mak |
michael@0 | 783 | - uchar.h |
michael@0 | 784 | (configure.in & configure: have been modified to extract the version from uchar.h) |
michael@0 | 785 | - com.ibm.icu.util.VersionInfo |
michael@0 | 786 | |
michael@0 | 787 | *** data files & enums & parser code |
michael@0 | 788 | |
michael@0 | 789 | * file preparation |
michael@0 | 790 | |
michael@0 | 791 | ~/svn.icu/tools/trunk/src/unicode/c/genprops/misc$ ./ucdcopy.py ~/uni60/20100720/ucd ~/uni60/processed |
michael@0 | 792 | - This now prepares both unidata and testdata files in respective output subfolders. |
michael@0 | 793 | |
michael@0 | 794 | * PropertyAliases.txt changes |
michael@0 | 795 | - new Script_Extensions property defined in the new ScriptExtensions.txt file |
michael@0 | 796 | but not listed in PropertyAliases.txt; reported to unicode.org; |
michael@0 | 797 | -> added to tools/trunk/src/unicode/c/genpname/SyntheticPropertyAliases.txt |
michael@0 | 798 | scx; Script_Extensions |
michael@0 | 799 | -> uchar.h with new UProperty section |
michael@0 | 800 | -> com.ibm.icu.lang.UProperty, parallel with uchar.h |
michael@0 | 801 | |
michael@0 | 802 | * PropertyValueAliases.txt changes |
michael@0 | 803 | - 12 new block names: |
michael@0 | 804 | Alchemical_Symbols |
michael@0 | 805 | Bamum_Supplement |
michael@0 | 806 | Batak |
michael@0 | 807 | Brahmi |
michael@0 | 808 | CJK_Unified_Ideographs_Extension_D |
michael@0 | 809 | Emoticons |
michael@0 | 810 | Ethiopic_Extended_A |
michael@0 | 811 | Kana_Supplement |
michael@0 | 812 | Mandaic |
michael@0 | 813 | Miscellaneous_Symbols_And_Pictographs |
michael@0 | 814 | Playing_Cards |
michael@0 | 815 | Transport_And_Map_Symbols |
michael@0 | 816 | -> add to uchar.h |
michael@0 | 817 | -> add to UCharacter.UnicodeBlock |
michael@0 | 818 | Eclipse find UBLOCK_([^ ]+) = [0-9]+, (/.+) |
michael@0 | 819 | replace public static final UnicodeBlock \1 = new UnicodeBlock("\1", \1_ID); \2 |
michael@0 | 820 | - Joining_Group (jg) values: |
michael@0 | 821 | Teh_Marbuta_Goal becomes the new canonical value for the old Hamza_On_Heh_Goal which becomes an alias |
michael@0 | 822 | -> uchar.h & UCharacter.JoiningGroup |
michael@0 | 823 | - 3 new scripts: |
michael@0 | 824 | sc ; Batk ; Batak |
michael@0 | 825 | sc ; Brah ; Brahmi |
michael@0 | 826 | sc ; Mand ; Mandaic |
michael@0 | 827 | -> remove these from SyntheticPropertyValueAliases.txt |
michael@0 | 828 | -> add alias USCRIPT_MANDAIC to USCRIPT_MANDAEAN |
michael@0 | 829 | -> fix expectedLong names in cucdapi.c/TestUScriptCodeAPI() |
michael@0 | 830 | and in com.ibm.icu.dev.test.lang.TestUScript.java |
michael@0 | 831 | - 13 new script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html |
michael@0 | 832 | (added 2009-11-11..2010-07-18) |
michael@0 | 833 | Bass 259 Bassa Vah |
michael@0 | 834 | Dupl 755 Duployan shorthand |
michael@0 | 835 | Elba 226 Elbasan |
michael@0 | 836 | Gran 343 Grantha |
michael@0 | 837 | Kpel 436 Kpelle |
michael@0 | 838 | Loma 437 Loma |
michael@0 | 839 | Mend 438 Mende |
michael@0 | 840 | Merc 101 Meroitic Cursive |
michael@0 | 841 | Narb 106 Old North Arabian |
michael@0 | 842 | Nbat 159 Nabataean |
michael@0 | 843 | Palm 126 Palmyrene |
michael@0 | 844 | Sind 318 Sindhi |
michael@0 | 845 | Wara 262 Warang Citi |
michael@0 | 846 | -> uscript.h |
michael@0 | 847 | -> com.ibm.icu.lang.UScript |
michael@0 | 848 | find USCRIPT_([^ ]+) *= ([0-9]+),(.+) |
michael@0 | 849 | replace public static final int \1 = \2;\3 |
michael@0 | 850 | -> SyntheticPropertyValueAliases.txt |
michael@0 | 851 | -> add to expectedLong and expectedShort names in cintltst/cucdapi.c/TestUScriptCodeAPI() |
michael@0 | 852 | and in com.ibm.icu.dev.test.lang.TestUScript.java |
michael@0 | 853 | - ISO 15924 name change |
michael@0 | 854 | Mero 100 Meroitic Hieroglyphs (was Meroitic) |
michael@0 | 855 | -> add new alias USCRIPT_MEROITIC_HIEROGLYPHS to USCRIPT_MEROITIC |
michael@0 | 856 | - property value alias added for Cham, was already moved out of SyntheticPropertyValueAliases.txt |
michael@0 | 857 | |
michael@0 | 858 | * UnicodeData.txt changes |
michael@0 | 859 | - new CJK block: |
michael@0 | 860 | 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;; |
michael@0 | 861 | 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;; |
michael@0 | 862 | -> add to tools/trunk/src/unicode/c/gennames/gennames.c, with new ucdVersion |
michael@0 | 863 | |
michael@0 | 864 | * build Unicode tools using CMake+make |
michael@0 | 865 | |
michael@0 | 866 | * run genpname/preparse.pl (on Linux) |
michael@0 | 867 | + cd ~/svn.icu/tools/trunk/src/unicode/c/genpname |
michael@0 | 868 | + make sure that data.h is writable |
michael@0 | 869 | + perl preparse.pl ~/svn.icu/trunk/src > out.txt |
michael@0 | 870 | + preparse.pl shows no errors, out.txt Info and Warning lines look ok |
michael@0 | 871 | |
michael@0 | 872 | * rebuild Unicode tools (at least genpname) using make |
michael@0 | 873 | - You might first need to "make install" ICU so that the tools build can pick |
michael@0 | 874 | up the new definitions from the installed header files. |
michael@0 | 875 | |
michael@0 | 876 | * run genpname |
michael@0 | 877 | - ~/svn.icu/tools/trunk/bld/unicode$ c/genpname/genpname -v -d ~/svn.icu/trunk/src/source/data/in |
michael@0 | 878 | - rebuild ICU & tools |
michael@0 | 879 | |
michael@0 | 880 | * update source/data/unidata/norm2/nfkc_cf.txt |
michael@0 | 881 | - follow the instructions in nfkc_cf.txt for updating it from DerivedNormalizationProps.txt |
michael@0 | 882 | |
michael@0 | 883 | * update source/data/unidata/norm2/uts46.txt |
michael@0 | 884 | - download http://www.unicode.org/Public/idna/6.0.0/IdnaMappingTable.txt |
michael@0 | 885 | to ~/svn.icu/tools/trunk/src/unicode/py |
michael@0 | 886 | - adjust idna2nrm.py to handle new disallowed_STD3_valid and disallowed_STD3_mapped values |
michael@0 | 887 | - ~/svn.icu/tools/trunk/src/unicode/py$ ./idna2nrm.py |
michael@0 | 888 | - ~/svn.icu/tools/trunk/src/unicode/py$ cp uts46.txt ~/svn.icu/trunk/src/source/data/unidata/norm2 |
michael@0 | 889 | |
michael@0 | 890 | * update uts46test.cpp and UTS46Test.java if there are new characters that are equivalent to |
michael@0 | 891 | sequences with non-LDH ASCII (that is, their decompositions contain '=' or similar) |
michael@0 | 892 | - grep IdnaMappingTable.txt or uts46.txt for "disallowed_STD3_valid" on non-ASCII characters |
michael@0 | 893 | - Unicode 6.0: U+2260, U+226E, U+226F |
michael@0 | 894 | |
michael@0 | 895 | * generate core properties data files |
michael@0 | 896 | - ~/svn.icu/tools/trunk/src/unicode$ ./makeprops.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld |
michael@0 | 897 | - rebuild ICU & tools |
michael@0 | 898 | - run makeuca.sh so that genuca picks up the new nfc.nrm: |
michael@0 | 899 | ~/svn.icu/tools/trunk/src/unicode$ ./makeuca.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld |
michael@0 | 900 | - rebuild ICU & tools |
michael@0 | 901 | |
michael@0 | 902 | * implement new Script_Extensions property (provisional) |
michael@0 | 903 | - parser & generator: genprops & uprops.icu |
michael@0 | 904 | - uscript.h, uprops.h, uchar.c, uniset_props.cpp and others, plus cintltst/cucdapi.c & intltest/usettest.cpp |
michael@0 | 905 | - UScript.java, UCharacterProperty.java, UnicodeSet.java, TestUScript.java, UnicodeSetTest.java |
michael@0 | 906 | |
michael@0 | 907 | * switch ubidi.icu, ucase.icu and uprops.icu from UTrie to UTrie2 |
michael@0 | 908 | - (one-time change) |
michael@0 | 909 | - genbidi/gencase/genprops tools changes |
michael@0 | 910 | - re-run makeprops.sh (see above) |
michael@0 | 911 | - UCharacterProperty.java, UCharacterTypeIterator.java, |
michael@0 | 912 | UBiDiProps.java, UCaseProps.java, and several others with minor changes; |
michael@0 | 913 | UCharacterPropertyReader.java deleted and its code folded into UCharacterProperty.java |
michael@0 | 914 | |
michael@0 | 915 | * update Java data files |
michael@0 | 916 | - refresh just the UCD-related files, just to be safe |
michael@0 | 917 | - see (ICU4C)/source/data/icu4j-readme.txt |
michael@0 | 918 | - mkdir /tmp/icu4j |
michael@0 | 919 | - ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install |
michael@0 | 920 | output: |
michael@0 | 921 | ... |
michael@0 | 922 | Unicode .icu files built to ./out/build/icudt45l |
michael@0 | 923 | mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt45b |
michael@0 | 924 | echo ubidi.icu ucase.icu uprops.icu > ./out/icu4j/add.txt |
michael@0 | 925 | LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt45l.dat ./out/icu4j/icudt45b.dat -a ./out/icu4j/add.txt -s ./out/build/icudt45l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt45b |
michael@0 | 926 | jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt45b |
michael@0 | 927 | mkdir -p /tmp/icu4j/main/shared/data |
michael@0 | 928 | cp ./out/icu4j/icudata.jar /tmp/icu4j/main/shared/data |
michael@0 | 929 | - copy the big-endian Unicode data files to another location, |
michael@0 | 930 | separate from the other data files |
michael@0 | 931 | mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/coll |
michael@0 | 932 | mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/brkitr |
michael@0 | 933 | ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt45b |
michael@0 | 934 | ~/svn.icu/trunk/bld/data/out/icu4j$ rm /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/cnvalias.icu |
michael@0 | 935 | ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/*.nrm /tmp/icu4j/com/ibm/icu/impl/data/icudt45b |
michael@0 | 936 | ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/coll/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/coll |
michael@0 | 937 | ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/brkitr/* /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/brkitr |
michael@0 | 938 | - refresh ICU4J |
michael@0 | 939 | ~/svn.icu/trunk/bld/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt45b |
michael@0 | 940 | |
michael@0 | 941 | * refresh Java test .txt files |
michael@0 | 942 | - copy new .txt files into ICU4J's main/tests/core/src/com/ibm/icu/dev/data/unicode |
michael@0 | 943 | |
michael@0 | 944 | * un-hardcode normalization skippable (NF*_Inert) test data |
michael@0 | 945 | - removes one manual step from the Unicode upgrade, and removes dependency on one of Mark's tools |
michael@0 | 946 | |
michael@0 | 947 | * copy updated break iterator test files |
michael@0 | 948 | - now handled by early ucdcopy.py and |
michael@0 | 949 | copying the uni60/processed/testdata files to ~/svn.icu/trunk/src/source/test/testdata |
michael@0 | 950 | (old instructions: |
michael@0 | 951 | copy from (Unicode 6.0)/ucd/auxiliary/*BreakTest-6....txt |
michael@0 | 952 | to ~/svn.icu/trunk/src/source/test/testdata) |
michael@0 | 953 | - they are not used in ICU4J |
michael@0 | 954 | |
michael@0 | 955 | * UCA |
michael@0 | 956 | |
michael@0 | 957 | - get output from Mark's tools; look in |
michael@0 | 958 | http://www.unicode.org/~book/incoming/mark/uca6.0.0/ |
michael@0 | 959 | http://www.macchiato.com/unicode/utc/additional-uca-files |
michael@0 | 960 | http://www.unicode.org/Public/UCA/6.0.0/ |
michael@0 | 961 | http://www.unicode.org/~mdavis/uca/ |
michael@0 | 962 | - update source/data/unidata/FractionalUCA.txt with FractionalUCA_SHORT.txt |
michael@0 | 963 | - update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt |
michael@0 | 964 | - update Han-implicit ranges for new CJK extensions: |
michael@0 | 965 | swapCJK() in ucol.cpp & ImplicitCEGenerator.java |
michael@0 | 966 | - genuca: allow bytes 02 for U+FFFE, new merge-sort character; |
michael@0 | 967 | do not add it into invuca so that tailoring primary-after an ignorable works |
michael@0 | 968 | - genuca: permit space between [variable top] bytes |
michael@0 | 969 | - ucol.cpp: treat noncharacters like unassigned rather than ignorable |
michael@0 | 970 | - run makeuca.sh: |
michael@0 | 971 | ~/svn.icu/tools/trunk/src/unicode$ ./makeuca.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld |
michael@0 | 972 | - rebuild ICU4C |
michael@0 | 973 | - refresh ICU4J collation data: |
michael@0 | 974 | (subset of instructions above for properties data refresh, except copies all coll/*) |
michael@0 | 975 | ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install |
michael@0 | 976 | mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/coll |
michael@0 | 977 | ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/coll/* /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/coll |
michael@0 | 978 | ~/svn.icu/trunk/bld/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt45b |
michael@0 | 979 | - update (ICU)/source/test/testdata/CollationTest_*.txt |
michael@0 | 980 | and (ICU4J)/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_*.txt |
michael@0 | 981 | with output from Mark's Unicode tools |
michael@0 | 982 | - run all tests with the *_SHORT.txt or the full files (the full ones have comments) |
michael@0 | 983 | - note on intltest: if collate/UCAConformanceTest fails, then |
michael@0 | 984 | utility/MultithreadTest/TestCollators will fail as well; |
michael@0 | 985 | fix the conformance test before looking into the multi-thread test |
michael@0 | 986 | |
michael@0 | 987 | * When refreshing all of ICU4J data from ICU4C |
michael@0 | 988 | - ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install |
michael@0 | 989 | - cp /tmp/icu4j/main/shared/data/icudata.jar ~/svn.icu4j/trunk/src/main/shared/data |
michael@0 | 990 | or |
michael@0 | 991 | - ~/svn.icu/trunk/bld$ make ICU4J_ROOT=~/svn.icu4j/trunk/src icu4j-data-install |
michael@0 | 992 | |
michael@0 | 993 | *** LayoutEngine script information |
michael@0 | 994 | |
michael@0 | 995 | (For details see the Unicode 5.2 change log below.) |
michael@0 | 996 | |
michael@0 | 997 | * Run ICU4J com.ibm.icu.dev.tool.layout.ScriptNameBuilder. This generates LEScripts.h, LELanguages.h, |
michael@0 | 998 | ScriptAndLanguageTags.h and ScriptAndLanguageTags.cpp in the working directory. (It also generates |
michael@0 | 999 | ScriptRunData.cpp, which is no longer needed.) |
michael@0 | 1000 | |
michael@0 | 1001 | The generated files have a current copyright date and "@draft" statement. |
michael@0 | 1002 | |
michael@0 | 1003 | * copy the above files into <icu>/source/layout, replacing the old files. |
michael@0 | 1004 | * fix mixed line endings |
michael@0 | 1005 | * review the diffs and fix incorrect @draft and missing aliases; |
michael@0 | 1006 | Unicode-derived script codes should be "born stable" like constants in uchar.h, uscript.h etc. |
michael@0 | 1007 | * manually re-add the "Indic script xyz v.2" tags in ScriptAndLanguageTags.h |
michael@0 | 1008 | |
michael@0 | 1009 | ---------------------------------------------------------------------------- *** |
michael@0 | 1010 | |
michael@0 | 1011 | Unicode 5.2 update |
michael@0 | 1012 | |
michael@0 | 1013 | *** related ICU Trac tickets |
michael@0 | 1014 | |
michael@0 | 1015 | 7084 Unicode 5.2 |
michael@0 | 1016 | |
michael@0 | 1017 | 7167 verify collation bytes |
michael@0 | 1018 | 7235 Java test NAME_ALIAS |
michael@0 | 1019 | 7236 Java DerivedCoreProperties.txt test |
michael@0 | 1020 | 7237 Java BidiTest.txt |
michael@0 | 1021 | 7238 UTrie2 in core unidata |
michael@0 | 1022 | 7239 test for tailoring gaps |
michael@0 | 1023 | 7240 Java fix CollationMiscTest |
michael@0 | 1024 | 7243 update layout engine for Unicode 5.2 |
michael@0 | 1025 | |
michael@0 | 1026 | *** Unicode version numbers |
michael@0 | 1027 | - makedata.mak |
michael@0 | 1028 | - uchar.h |
michael@0 | 1029 | - configure.in & configure |
michael@0 | 1030 | - update ucdVersion in gennames.c if an algorithmic range changes |
michael@0 | 1031 | |
michael@0 | 1032 | *** data files & enums & parser code |
michael@0 | 1033 | |
michael@0 | 1034 | * file preparation |
michael@0 | 1035 | |
michael@0 | 1036 | python source\tools\genprops\misc\ucdcopy.py "C:\Documents and Settings\mscherer\My Documents\unicode\ucd\5.2.0" C:\svn\icuproj\icu\trunk\source\data\unidata |
michael@0 | 1037 | - includes finding files regardless of version numbers, |
michael@0 | 1038 | copying them, and performing the equivalent processing of the |
michael@0 | 1039 | ucdstrip and ucdmerge tools on the desired set of files |
michael@0 | 1040 | |
michael@0 | 1041 | * notes on changes |
michael@0 | 1042 | - PropertyAliases.txt |
michael@0 | 1043 | moved from numeric to enumerated: |
michael@0 | 1044 | ccc ; Canonical_Combining_Class |
michael@0 | 1045 | new string properties: |
michael@0 | 1046 | NFKC_CF ; NFKC_Casefold |
michael@0 | 1047 | Name_Alias; Name_Alias |
michael@0 | 1048 | new binary properties: |
michael@0 | 1049 | Cased ; Cased |
michael@0 | 1050 | CI ; Case_Ignorable |
michael@0 | 1051 | CWCF ; Changes_When_Casefolded |
michael@0 | 1052 | CWCM ; Changes_When_Casemapped |
michael@0 | 1053 | CWKCF ; Changes_When_NFKC_Casefolded |
michael@0 | 1054 | CWL ; Changes_When_Lowercased |
michael@0 | 1055 | CWT ; Changes_When_Titlecased |
michael@0 | 1056 | CWU ; Changes_When_Uppercased |
michael@0 | 1057 | new CJK Unihan properties (not supported by ICU) |
michael@0 | 1058 | - PropertyValueAliases.txt |
michael@0 | 1059 | new block names |
michael@0 | 1060 | new scripts |
michael@0 | 1061 | one script code change: |
michael@0 | 1062 | sc ; Qaai ; Inherited |
michael@0 | 1063 | -> |
michael@0 | 1064 | sc ; Zinh ; Inherited ; Qaai |
michael@0 | 1065 | new Line_Break (lb) value: |
michael@0 | 1066 | lb ; CP ; Close_Parenthesis |
michael@0 | 1067 | new Joining_Group (jg) values: Farsi_Yeh, Nya |
michael@0 | 1068 | other new values: |
michael@0 | 1069 | ccc; 214; ATA ; Attached_Above |
michael@0 | 1070 | - DerivedBidiClass.txt |
michael@0 | 1071 | new default-R range: U+1E800 - U+1EFFF |
michael@0 | 1072 | - UnicodeData.txt |
michael@0 | 1073 | all of the ISO comments are gone |
michael@0 | 1074 | new CJK block end: |
michael@0 | 1075 | 9FC3;<CJK Ideograph, Last> -> 9FCB;<CJK Ideograph, Last> |
michael@0 | 1076 | new CJK block: |
michael@0 | 1077 | 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; |
michael@0 | 1078 | 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;; |
michael@0 | 1079 | |
michael@0 | 1080 | * genpname |
michael@0 | 1081 | - run preparse.pl |
michael@0 | 1082 | + cd \svn\icuproj\icu\trunk\source\tools\genpname |
michael@0 | 1083 | + make sure that data.h is writable |
michael@0 | 1084 | + perl preparse.pl \svn\icuproj\icu\trunk > out.txt |
michael@0 | 1085 | + preparse.pl complains with errors like the following: |
michael@0 | 1086 | Error: sc:Egyp already set to Egyptian_Hieroglyphs, cannot set to Egyp at preparse.pl line 1322, <GEN6> line 34. |
michael@0 | 1087 | This is because ICU 4.0 had scripts from ISO 15924 which are now |
michael@0 | 1088 | added to Unicode 5.2, and the Perl script shows a conflict between SyntheticPropertyValueAliases.txt |
michael@0 | 1089 | and PropertyValueAliases.txt. |
michael@0 | 1090 | -> Removed duplicate script entries from SyntheticPropertyValueAliases.txt: |
michael@0 | 1091 | Egyp, Java, Lana, Mtei, Orkh, Armi, Avst, Kthi, Phli, Prti, Samr, Tavt |
michael@0 | 1092 | + preparse.pl complains with errors about block names missing from uchar.h; add them |
michael@0 | 1093 | |
michael@0 | 1094 | * uchar.h & uscript.h & uprops.h & uprops.c & genprops |
michael@0 | 1095 | - new block & script values |
michael@0 | 1096 | + 26 new blocks |
michael@0 | 1097 | copy new blocks from Blocks.txt |
michael@0 | 1098 | MS VC++ 2008 regular expression: |
michael@0 | 1099 | find "^{[0-9A-F]+}\.\.{[0-9A-F]+}; {[A-Z].+}$" |
michael@0 | 1100 | replace with " UBLOCK_\3 = 172, /*[\1]*/" |
michael@0 | 1101 | + several new script values already added in ICU 4.0 for ISO 15924 coverage |
michael@0 | 1102 | (removed from SyntheticPropertyValueAliases.txt, see genpname notes above) |
michael@0 | 1103 | + 3 new script values added for ISO 15924 and Unicode 5.2 coverage |
michael@0 | 1104 | + 1 new script value added for ISO 15924 coverage (not in Unicode 5.2) |
michael@0 | 1105 | (added to SyntheticPropertyValueAliases.txt) |
michael@0 | 1106 | - new Joining Group (JG) values: Farsi_Yeh, Nya |
michael@0 | 1107 | - new Line_Break (lb) value: |
michael@0 | 1108 | lb ; CP ; Close_Parenthesis |
michael@0 | 1109 | |
michael@0 | 1110 | * hardcoded Unihan range end/limit |
michael@0 | 1111 | - Unihan range end moves from 9FC3 to 9FCB |
michael@0 | 1112 | search for both 9FC3 (end) and 9FC4 (limit) (regex 9FC[34], case-insensitive) |
michael@0 | 1113 | + do change gennames.c |
michael@0 | 1114 | |
michael@0 | 1115 | * Compare definitions of new binary properties with what we used to use |
michael@0 | 1116 | in algorithms, to see if the definitions changed. |
michael@0 | 1117 | - Verified that definitions for Cased and Case_Ignorable are unchanged. |
michael@0 | 1118 | The gencase tool now parses the newly public Case_Ignorable values |
michael@0 | 1119 | in case the definition changes in the future. |
michael@0 | 1120 | |
michael@0 | 1121 | * uchar.c & uprops.h & uprops.c & genprops |
michael@0 | 1122 | - new numeric values that didn't exist in Unicode data before: |
michael@0 | 1123 | 1/7, 1/9, 1/10, 3/10, 1/16, 3/16 |
michael@0 | 1124 | the ones with denominators >9 cannot be supported by uprops.icu formatVersion 5, |
michael@0 | 1125 | therefore redesign the encoding of numeric types and values for formatVersion 6; |
michael@0 | 1126 | design for simple numbers up to at least 144 ("one gross"), |
michael@0 | 1127 | large values up to at least 10^20, |
michael@0 | 1128 | and fractions with numerators -1..17 and denominators 1..16 |
michael@0 | 1129 | to cover current and expected future values |
michael@0 | 1130 | (e.g., more Han numeric values, Meroitic twelfths) |
michael@0 | 1131 | |
michael@0 | 1132 | * reimplement Hangul_Syllable_Type for new Jamo characters |
michael@0 | 1133 | - the old code assumed that all Jamo characters are in the 11xx block |
michael@0 | 1134 | - Unicode 5.2 fills holes there and adds new Jamo characters in |
michael@0 | 1135 | A960..A97F; Hangul Jamo Extended-A |
michael@0 | 1136 | and in |
michael@0 | 1137 | D7B0..D7FF; Hangul Jamo Extended-B |
michael@0 | 1138 | - Hangul_Syllable_Type can be trivially derived from a subset of |
michael@0 | 1139 | Grapheme_Cluster_Break values |
michael@0 | 1140 | |
michael@0 | 1141 | * build Unicode data source code for hardcoding core data |
michael@0 | 1142 | C:\svn\icuproj\icu\trunk\source\data>NMAKE /f makedata.mak ICUMAKE=\svn\icuproj\icu\trunk\source\data\ CFG=x86\release uni-core-data |
michael@0 | 1143 | |
michael@0 | 1144 | ICU data make path is \svn\icuproj\icu\trunk\source\data\ |
michael@0 | 1145 | ICU root path is \svn\icuproj\icu\trunk |
michael@0 | 1146 | Information: cannot find "ucmlocal.mk". Not building user-additional converter files. |
michael@0 | 1147 | Information: cannot find "brklocal.mk". Not building user-additional break iterator files. |
michael@0 | 1148 | Information: cannot find "reslocal.mk". Not building user-additional resource bundle files. |
michael@0 | 1149 | Information: cannot find "collocal.mk". Not building user-additional resource bundle files. |
michael@0 | 1150 | Information: cannot find "rbnflocal.mk". Not building user-additional resource bundle files. |
michael@0 | 1151 | Information: cannot find "trnslocal.mk". Not building user-additional transliterator files. |
michael@0 | 1152 | Information: cannot find "misclocal.mk". Not building user-additional miscellaenous files. |
michael@0 | 1153 | Information: cannot find "spreplocal.mk". Not building user-additional stringprep files. |
michael@0 | 1154 | Creating data file for Unicode Property Names |
michael@0 | 1155 | Creating data file for Unicode Character Properties |
michael@0 | 1156 | Creating data file for Unicode Case Mapping Properties |
michael@0 | 1157 | Creating data file for Unicode BiDi/Shaping Properties |
michael@0 | 1158 | Creating data file for Unicode Normalization |
michael@0 | 1159 | Unicode .icu files built to "\svn\icuproj\icu\trunk\source\data\out\build\icudt43l" |
michael@0 | 1160 | Unicode .c source files built to "\svn\icuproj\icu\trunk\source\data\out\tmp" |
michael@0 | 1161 | |
michael@0 | 1162 | - copy the .c source files to C:\svn\icuproj\icu\trunk\source\common |
michael@0 | 1163 | and rebuild the common library |
michael@0 | 1164 | |
michael@0 | 1165 | *** UCA |
michael@0 | 1166 | |
michael@0 | 1167 | - update FractionalUCA.txt with new canonical closure (output from Mark's Unicode tools) |
michael@0 | 1168 | - update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt from Mark's Unicode tools |
michael@0 | 1169 | - update source/test/testdata/CollationTest_*.txt with output from Mark's Unicode tools |
michael@0 | 1170 | [ Begin obsolete instructions: |
michael@0 | 1171 | Starting with UCA 5.2, we use the CollationTest_*_SHORT.txt files not the *_STUB.txt files. |
michael@0 | 1172 | - generate the source/test/testdata/CollationTest_*_STUB.txt files via source/tools/genuca/genteststub.py |
michael@0 | 1173 | on Windows: |
michael@0 | 1174 | python C:\svn\icuproj\icu\trunk\source\tools\genuca\genteststub.py CollationTest_NON_IGNORABLE_SHORT.txt CollationTest_NON_IGNORABLE_STUB.txt |
michael@0 | 1175 | python C:\svn\icuproj\icu\trunk\source\tools\genuca\genteststub.py CollationTest_SHIFTED_SHORT.txt CollationTest_SHIFTED_STUB.txt |
michael@0 | 1176 | End obsolete instructions] |
michael@0 | 1177 | - run all tests with the *_SHORT.txt or the full files (the full ones have comments) |
michael@0 | 1178 | not just the *_STUB.txt files |
michael@0 | 1179 | - note on intltest: if collate/UCAConformanceTest fails, then |
michael@0 | 1180 | utility/MultithreadTest/TestCollators will fail as well; |
michael@0 | 1181 | fix the conformance test before looking into the multi-thread test |
michael@0 | 1182 | |
michael@0 | 1183 | *** Implement Cased & Case_Ignorable properties |
michael@0 | 1184 | - via UProperty; call ucase.h functions ucase_getType() and ucase_getTypeOrIgnorable() |
michael@0 | 1185 | - Problem: These properties should be disjoint, but aren't |
michael@0 | 1186 | - UTC 2009nov decision: skip all Case_Ignorable regardless of whether they are Cased or not |
michael@0 | 1187 | - change ucase.icu to be able to store any combination of Cased and Case_Ignorable |
michael@0 | 1188 | |
michael@0 | 1189 | *** Implement Changes_When_Xyz properties |
michael@0 | 1190 | - without stored data |
michael@0 | 1191 | |
michael@0 | 1192 | *** Implement Name_Alias property |
michael@0 | 1193 | - add it as another name field in unames.icu |
michael@0 | 1194 | - make it available via u_charName() and UCharNameChoice |
michael@0 | 1195 | - consider it in u_charFromName() |
michael@0 | 1196 | |
michael@0 | 1197 | *** Break iterators |
michael@0 | 1198 | |
michael@0 | 1199 | * Update break iterator rules to new UAX versions and new property values |
michael@0 | 1200 | * Update source/test/testdata/<boundary>Test.txt files from <unicode.org ucd>/ucd/auxiliary |
michael@0 | 1201 | |
michael@0 | 1202 | *** new BidiTest file |
michael@0 | 1203 | - review format and data |
michael@0 | 1204 | - copy BidiTest.txt to source/test/testdata |
michael@0 | 1205 | - write test code using this data |
michael@0 | 1206 | - fix ICU code where it fails the conformance test |
michael@0 | 1207 | |
michael@0 | 1208 | *** Java |
michael@0 | 1209 | - generally, find and update code corresponding to C/C++ |
michael@0 | 1210 | - UCharacter.UnicodeBlock constants: |
michael@0 | 1211 | a) add an _ID integer per new block, update COUNT |
michael@0 | 1212 | b) add a class instance per new block |
michael@0 | 1213 | Visual Studio regex: |
michael@0 | 1214 | find UBLOCK_{[^ ]+} = [0-9]+, {/.+} |
michael@0 | 1215 | replace with public static final UnicodeBlock \1 = new UnicodeBlock("\1", \1_ID); \2 |
michael@0 | 1216 | - CHAR_NAME_ALIAS -> UCharacter.getNameAlias() and getCharFromNameAlias() |
michael@0 | 1217 | |
michael@0 | 1218 | - port test changes to Java |
michael@0 | 1219 | |
michael@0 | 1220 | *** LayoutEngine script information |
michael@0 | 1221 | |
michael@0 | 1222 | (For comparison, see the Unicode 5.1 update: http://bugs.icu-project.org/trac/changeset/23833) |
michael@0 | 1223 | |
michael@0 | 1224 | * Run ICU4J com.ibm.icu.dev.tool.layout.ScriptNameBuilder. This generates LEScripts.h, LELanguages.h, |
michael@0 | 1225 | ScriptAndLanguageTags.h and ScriptAndLanguageTags.cpp in the working directory. (It also generates |
michael@0 | 1226 | ScriptRunData.cpp, which is no longer needed.) |
michael@0 | 1227 | |
michael@0 | 1228 | The generated files have a current copyright date and "@draft" statement. |
michael@0 | 1229 | |
michael@0 | 1230 | -> Eric Mader wrote in email on 20090930: |
michael@0 | 1231 | "I think the tool has been modified to update @draft to @stable for |
michael@0 | 1232 | older scripts and to add @draft for new scripts. |
michael@0 | 1233 | (I worked with an intern on this last year.) |
michael@0 | 1234 | You should check the output after you run it." |
michael@0 | 1235 | |
michael@0 | 1236 | * copy the above files into <icu>/source/layout, replacing the old files. |
michael@0 | 1237 | * fix mixed line endings |
michael@0 | 1238 | * review the diffs and fix incorrect @draft and missing aliases |
michael@0 | 1239 | * manually re-add the "Indic script xyz v.2" tags in ScriptAndLanguageTags.h |
michael@0 | 1240 | |
michael@0 | 1241 | Add new default entries to the indicClassTables array in <icu>/source/layout/IndicClassTables.cpp |
michael@0 | 1242 | and the complexTable array in <icu>/source/layoutex/ParagraphLayout.cpp. (This step should be automated...) |
michael@0 | 1243 | |
michael@0 | 1244 | -> Eric Mader wrote in email on 20090930: |
michael@0 | 1245 | "This is just a matter of making sure that all the per-script tables have |
michael@0 | 1246 | entries for any new scripts that were added. |
michael@0 | 1247 | If any new Indic characters were added, then the class tables in |
michael@0 | 1248 | IndicClassTables.cpp should be updated to reflect this. |
michael@0 | 1249 | John Emmons should know how to do this if it's required." |
michael@0 | 1250 | |
michael@0 | 1251 | * rebuild the layout and layoutex libraries. |
michael@0 | 1252 | |
michael@0 | 1253 | *** Documentation |
michael@0 | 1254 | - Update User Guide |
michael@0 | 1255 | + Jamo_Short_Name, sfc->scf, binary property value aliases |
michael@0 | 1256 | |
michael@0 | 1257 | ---------------------------------------------------------------------------- *** |
michael@0 | 1258 | |
michael@0 | 1259 | Unicode 5.1 update |
michael@0 | 1260 | |
michael@0 | 1261 | *** related ICU Trac tickets |
michael@0 | 1262 | |
michael@0 | 1263 | 5696 Update to Unicode 5.1 |
michael@0 | 1264 | |
michael@0 | 1265 | *** Unicode version numbers |
michael@0 | 1266 | - makedata.mak |
michael@0 | 1267 | - uchar.h |
michael@0 | 1268 | - configure.in & configure |
michael@0 | 1269 | - update ucdVersion in gennames.c if an algorithmic range changes |
michael@0 | 1270 | |
michael@0 | 1271 | *** data files & enums & parser code |
michael@0 | 1272 | |
michael@0 | 1273 | * file preparation |
michael@0 | 1274 | - ucdstrip: |
michael@0 | 1275 | DerivedCoreProperties.txt |
michael@0 | 1276 | DerivedNormalizationProps.txt |
michael@0 | 1277 | NormalizationTest.txt |
michael@0 | 1278 | PropList.txt |
michael@0 | 1279 | Scripts.txt |
michael@0 | 1280 | GraphemeBreakProperty.txt |
michael@0 | 1281 | SentenceBreakProperty.txt |
michael@0 | 1282 | WordBreakProperty.txt |
michael@0 | 1283 | - ucdstrip and ucdmerge: |
michael@0 | 1284 | EastAsianWidth.txt |
michael@0 | 1285 | LineBreak.txt |
michael@0 | 1286 | |
michael@0 | 1287 | * my ucd2unidata.bat (needs to be updated each time with UCD and file version numbers) |
michael@0 | 1288 | copy 5.1.0\ucd\BidiMirroring.txt ..\unidata\ |
michael@0 | 1289 | copy 5.1.0\ucd\Blocks.txt ..\unidata\ |
michael@0 | 1290 | copy 5.1.0\ucd\CaseFolding.txt ..\unidata\ |
michael@0 | 1291 | copy 5.1.0\ucd\DerivedAge.txt ..\unidata\ |
michael@0 | 1292 | copy 5.1.0\ucd\extracted\DerivedBidiClass.txt ..\unidata\ |
michael@0 | 1293 | copy 5.1.0\ucd\extracted\DerivedJoiningGroup.txt ..\unidata\ |
michael@0 | 1294 | copy 5.1.0\ucd\extracted\DerivedJoiningType.txt ..\unidata\ |
michael@0 | 1295 | copy 5.1.0\ucd\extracted\DerivedNumericValues.txt ..\unidata\ |
michael@0 | 1296 | copy 5.1.0\ucd\NormalizationCorrections.txt ..\unidata\ |
michael@0 | 1297 | copy 5.1.0\ucd\PropertyAliases.txt ..\unidata\ |
michael@0 | 1298 | copy 5.1.0\ucd\PropertyValueAliases.txt ..\unidata\ |
michael@0 | 1299 | copy 5.1.0\ucd\SpecialCasing.txt ..\unidata\ |
michael@0 | 1300 | copy 5.1.0\ucd\UnicodeData.txt ..\unidata\ |
michael@0 | 1301 | |
michael@0 | 1302 | ucdstrip < 5.1.0\ucd\DerivedCoreProperties.txt > ..\unidata\DerivedCoreProperties.txt |
michael@0 | 1303 | ucdstrip < 5.1.0\ucd\DerivedNormalizationProps.txt > ..\unidata\DerivedNormalizationProps.txt |
michael@0 | 1304 | ucdstrip < 5.1.0\ucd\NormalizationTest.txt > ..\unidata\NormalizationTest.txt |
michael@0 | 1305 | ucdstrip < 5.1.0\ucd\PropList.txt > ..\unidata\PropList.txt |
michael@0 | 1306 | ucdstrip < 5.1.0\ucd\Scripts.txt > ..\unidata\Scripts.txt |
michael@0 | 1307 | ucdstrip < 5.1.0\ucd\auxiliary\GraphemeBreakProperty.txt > ..\unidata\GraphemeBreakProperty.txt |
michael@0 | 1308 | ucdstrip < 5.1.0\ucd\auxiliary\SentenceBreakProperty.txt > ..\unidata\SentenceBreakProperty.txt |
michael@0 | 1309 | ucdstrip < 5.1.0\ucd\auxiliary\WordBreakProperty.txt > ..\unidata\WordBreakProperty.txt |
michael@0 | 1310 | ucdstrip < 5.1.0\ucd\EastAsianWidth.txt | ucdmerge > ..\unidata\EastAsianWidth.txt |
michael@0 | 1311 | ucdstrip < 5.1.0\ucd\LineBreak.txt | ucdmerge > ..\unidata\LineBreak.txt |
michael@0 | 1312 | |
michael@0 | 1313 | * genpname |
michael@0 | 1314 | - run preparse.pl |
michael@0 | 1315 | + cd \svn\icuproj\icu\uni51\source\tools\genpname |
michael@0 | 1316 | + make sure that data.h is writable |
michael@0 | 1317 | + perl preparse.pl \svn\icuproj\icu\uni51 > out.txt |
michael@0 | 1318 | + preparse.pl complains with errors like the following: |
michael@0 | 1319 | Error: sc:Cari already set to Carian, cannot set to Cari at preparse.pl line 1308, <GEN6> line 30. |
michael@0 | 1320 | This is because ICU 3.8 had scripts from ISO 15924 which are now |
michael@0 | 1321 | added to Unicode 5.1, and the script shows a conflict between SyntheticPropertyValueAliases.txt |
michael@0 | 1322 | and PropertyValueAliases.txt. |
michael@0 | 1323 | -> Removed duplicate script entries from SyntheticPropertyValueAliases.txt: |
michael@0 | 1324 | Cari, Cham, Kali, Lepc, Lyci, Lydi, Olck, Rjng, Saur, Sund, Vaii |
michael@0 | 1325 | + PropertyValueAliases.txt now explicitly contains values for boolean properties: |
michael@0 | 1326 | N/Y, No/Yes, F/T, False/True |
michael@0 | 1327 | -> Added N/No and Y/Yes to preparse.pl function read_PropertyValueAliases. |
michael@0 | 1328 | It will use further values from the file if present. |
michael@0 | 1329 | |
michael@0 | 1330 | * uchar.h & uscript.h & uprops.h & uprops.c & genprops |
michael@0 | 1331 | - new block & script values |
michael@0 | 1332 | + 17 new blocks |
michael@0 | 1333 | + 11 new script values already added in ICU 3.8 for ISO 15924 coverage |
michael@0 | 1334 | (removed from SyntheticPropertyValueAliases.txt) |
michael@0 | 1335 | + 14 new script values added for ISO 15924 coverage (not in Unicode 5.1) |
michael@0 | 1336 | (added to SyntheticPropertyValueAliases.txt) |
michael@0 | 1337 | - uprops.icu (uprops.h) only provides 7 bits for script codes. |
michael@0 | 1338 | In ICU 4.0 there are USCRIPT_CODE_LIMIT=130 script codes now. |
michael@0 | 1339 | None of the codes above 127 is yet the script code for an |
michael@0 | 1340 | assigned Unicode character, so ICU 4.0 uprops.icu does not store any |
michael@0 | 1341 | script code values greater than 127. |
michael@0 | 1342 | However, it does need to store the maximum script value=USCRIPT_CODE_LIMIT-1=129 |
michael@0 | 1343 | in a parallel bit field, and that overflows now. |
michael@0 | 1344 | Also, future values >=128 would be incompatible anyway. |
michael@0 | 1345 | uprops.h is modified to move around several of the bit fields |
michael@0 | 1346 | in the properties vector words, and now uses 8 bits for the script code. |
michael@0 | 1347 | Two other bit fields also grow to accommodate future growth: |
michael@0 | 1348 | Block (current count: 172) grows from 8 to 9 bits, |
michael@0 | 1349 | and Word_Break grows from 4 to 5 bits. |
michael@0 | 1350 | - renamed property Simple_Case_Folding (sfc->scf) |
michael@0 | 1351 | + nothing to be done: handled as normal alias |
michael@0 | 1352 | - new property JSN Jamo_Short_Name |
michael@0 | 1353 | + no new API: only contributes to the Name property |
michael@0 | 1354 | - new Grapheme_Cluster_Break (GCB) value: SM=SpacingMark |
michael@0 | 1355 | - new Joining Group (JG) value: Burushashki_Yeh_Barree |
michael@0 | 1356 | - new Sentence_Break (SB) values: |
michael@0 | 1357 | SB ; CR ; CR |
michael@0 | 1358 | SB ; EX ; Extend |
michael@0 | 1359 | SB ; LF ; LF |
michael@0 | 1360 | SB ; SC ; SContinue |
michael@0 | 1361 | - new Word_Break (WB) values: |
michael@0 | 1362 | WB ; CR ; CR |
michael@0 | 1363 | WB ; Extend ; Extend |
michael@0 | 1364 | WB ; LF ; LF |
michael@0 | 1365 | WB ; MB ; MidNumLet |
michael@0 | 1366 | |
michael@0 | 1367 | * Further changes in the 2008-02-29 update: |
michael@0 | 1368 | - Default_Ignorable_Code_Point: The new file removes Cc, Cs, noncharacters from DICP |
michael@0 | 1369 | because they should not normally be invisible. |
michael@0 | 1370 | - new Joining Group (JG) value Burushashki_Yeh_Barree was renamed to Burushaski_Yeh_Barree (one 'h' removed) |
michael@0 | 1371 | - new Grapheme_Cluster_Break (GCB) value: PP=Prepend |
michael@0 | 1372 | - new Word_Break (WB) value: NL=Newline |
michael@0 | 1373 | |
michael@0 | 1374 | * hardcoded Unihan range end/limit (see Unicode 4.1 update for comparison) |
michael@0 | 1375 | - Unihan range end moves from 9FBB to 9FC3 |
michael@0 | 1376 | search for both 9FBB (end) and 9FBC (limit) (regex 9FB[BC], case-insensitive) |
michael@0 | 1377 | + do change gennames.c |
michael@0 | 1378 | |
michael@0 | 1379 | * build Unicode data source code for hardcoding core data |
michael@0 | 1380 | C:\svn\icuproj\icu\uni51\source\data>NMAKE /f makedata.mak ICUMAKE=\svn\icuproj\icu\uni51\source\data\ CFG=debug uni-core-data |
michael@0 | 1381 | |
michael@0 | 1382 | ICU data make path is \svn\icuproj\icu\uni51\source\data\ |
michael@0 | 1383 | ICU root path is \svn\icuproj\icu\uni51 |
michael@0 | 1384 | Information: cannot find "ucmlocal.mk". Not building user-additional converter files. |
michael@0 | 1385 | Information: cannot find "brklocal.mk". Not building user-additional break iterator files. |
michael@0 | 1386 | Information: cannot find "reslocal.mk". Not building user-additional resource bundle files. |
michael@0 | 1387 | Information: cannot find "collocal.mk". Not building user-additional resource bundle files. |
michael@0 | 1388 | Information: cannot find "rbnflocal.mk". Not building user-additional resource bundle files. |
michael@0 | 1389 | Information: cannot find "trnslocal.mk". Not building user-additional transliterator files. |
michael@0 | 1390 | Information: cannot find "misclocal.mk". Not building user-additional miscellaenous files. |
michael@0 | 1391 | Creating data file for Unicode Character Properties |
michael@0 | 1392 | Creating data file for Unicode Case Mapping Properties |
michael@0 | 1393 | Creating data file for Unicode BiDi/Shaping Properties |
michael@0 | 1394 | Creating data file for Unicode Normalization |
michael@0 | 1395 | Unicode .icu files built to "\svn\icuproj\icu\uni51\source\data\out\build\icudt39l" |
michael@0 | 1396 | Unicode .c source files built to "\svn\icuproj\icu\uni51\source\data\out\tmp" |
michael@0 | 1397 | |
michael@0 | 1398 | - copy the .c source files to C:\svn\icuproj\icu\uni51\source\common |
michael@0 | 1399 | and rebuild the common library |
michael@0 | 1400 | |
michael@0 | 1401 | *** Break iterators |
michael@0 | 1402 | |
michael@0 | 1403 | * Update break iterator rules to new UAX versions and new property values |
michael@0 | 1404 | |
michael@0 | 1405 | *** UCA |
michael@0 | 1406 | |
michael@0 | 1407 | * update FractionalUCA.txt and UCARules.txt with new canonical closure |
michael@0 | 1408 | |
michael@0 | 1409 | *** Test suites |
michael@0 | 1410 | - Test that APIs using Unicode property value aliases (like UnicodeSet) |
michael@0 | 1411 | support all of the boolean values N/Y, No/Yes, F/T, False/True |
michael@0 | 1412 | -> TestBinaryValues() tests in both cintltst and intltest |
michael@0 | 1413 | |
michael@0 | 1414 | *** LayoutEngine script information |
michael@0 | 1415 | * Run ICU4J com.ibm.icu.dev.tool.layout.ScriptNameBuilder. This generates LEScripts.h, LELanguages.h, |
michael@0 | 1416 | ScriptAndLanguageTags.h and ScriptAndLanguageTags.cpp in the working directory. (It also generates |
michael@0 | 1417 | ScriptRunData.cpp, which is no longer needed.) |
michael@0 | 1418 | |
michael@0 | 1419 | The generated files have a current copyright date and "@draft" statement. |
michael@0 | 1420 | |
michael@0 | 1421 | * copy the above files into <icu>/source/layout, replacing the old files. |
michael@0 | 1422 | |
michael@0 | 1423 | Add new default entries to the indicClassTables array in <icu>/source/layout/IndicClassTables.cpp |
michael@0 | 1424 | and the complexTable array in <icu>/source/layoutex/ParagraphLayout.cpp. (This step should be automated...) |
michael@0 | 1425 | |
michael@0 | 1426 | * rebuild the layout and layoutex libraries. |
michael@0 | 1427 | |
michael@0 | 1428 | *** Documentation |
michael@0 | 1429 | - Update User Guide |
michael@0 | 1430 | + Jamo_Short_Name, sfc->scf, binary property value aliases |
michael@0 | 1431 | |
michael@0 | 1432 | ---------------------------------------------------------------------------- *** |
michael@0 | 1433 | |
michael@0 | 1434 | Unicode 5.0 update |
michael@0 | 1435 | |
michael@0 | 1436 | *** related Jitterbugs |
michael@0 | 1437 | |
michael@0 | 1438 | 5084 RFE: Update to Unicode 5.0 |
michael@0 | 1439 | |
michael@0 | 1440 | *** data files & enums & parser code |
michael@0 | 1441 | |
michael@0 | 1442 | * file preparation |
michael@0 | 1443 | - ucdstrip: |
michael@0 | 1444 | DerivedCoreProperties.txt |
michael@0 | 1445 | DerivedNormalizationProps.txt |
michael@0 | 1446 | NormalizationTest.txt |
michael@0 | 1447 | PropList.txt |
michael@0 | 1448 | Scripts.txt |
michael@0 | 1449 | GraphemeBreakProperty.txt |
michael@0 | 1450 | SentenceBreakProperty.txt |
michael@0 | 1451 | WordBreakProperty.txt |
michael@0 | 1452 | - ucdstrip and ucdmerge: |
michael@0 | 1453 | EastAsianWidth.txt |
michael@0 | 1454 | LineBreak.txt |
michael@0 | 1455 | |
michael@0 | 1456 | * my ucd2unidata.bat (needs to be updated each time with UCD and file version numbers) |
michael@0 | 1457 | copy 5.0.0\ucd\BidiMirroring.txt ..\unidata\ |
michael@0 | 1458 | copy 5.0.0\ucd\Blocks.txt ..\unidata\ |
michael@0 | 1459 | copy 5.0.0\ucd\CaseFolding.txt ..\unidata\ |
michael@0 | 1460 | copy 5.0.0\ucd\DerivedAge.txt ..\unidata\ |
michael@0 | 1461 | copy 5.0.0\ucd\extracted\DerivedBidiClass.txt ..\unidata\ |
michael@0 | 1462 | copy 5.0.0\ucd\extracted\DerivedJoiningGroup.txt ..\unidata\ |
michael@0 | 1463 | copy 5.0.0\ucd\extracted\DerivedJoiningType.txt ..\unidata\ |
michael@0 | 1464 | copy 5.0.0\ucd\extracted\DerivedNumericValues.txt ..\unidata\ |
michael@0 | 1465 | copy 5.0.0\ucd\NormalizationCorrections.txt ..\unidata\ |
michael@0 | 1466 | copy 5.0.0\ucd\PropertyAliases.txt ..\unidata\ |
michael@0 | 1467 | copy 5.0.0\ucd\PropertyValueAliases.txt ..\unidata\ |
michael@0 | 1468 | copy 5.0.0\ucd\SpecialCasing.txt ..\unidata\ |
michael@0 | 1469 | copy 5.0.0\ucd\UnicodeData.txt ..\unidata\ |
michael@0 | 1470 | |
michael@0 | 1471 | ucdstrip < 5.0.0\ucd\DerivedCoreProperties.txt > ..\unidata\DerivedCoreProperties.txt |
michael@0 | 1472 | ucdstrip < 5.0.0\ucd\DerivedNormalizationProps.txt > ..\unidata\DerivedNormalizationProps.txt |
michael@0 | 1473 | ucdstrip < 5.0.0\ucd\NormalizationTest.txt > ..\unidata\NormalizationTest.txt |
michael@0 | 1474 | ucdstrip < 5.0.0\ucd\PropList.txt > ..\unidata\PropList.txt |
michael@0 | 1475 | ucdstrip < 5.0.0\ucd\Scripts.txt > ..\unidata\Scripts.txt |
michael@0 | 1476 | ucdstrip < 5.0.0\ucd\auxiliary\GraphemeBreakProperty.txt > ..\unidata\GraphemeBreakProperty.txt |
michael@0 | 1477 | ucdstrip < 5.0.0\ucd\auxiliary\SentenceBreakProperty.txt > ..\unidata\SentenceBreakProperty.txt |
michael@0 | 1478 | ucdstrip < 5.0.0\ucd\auxiliary\WordBreakProperty.txt > ..\unidata\WordBreakProperty.txt |
michael@0 | 1479 | ucdstrip < 5.0.0\ucd\EastAsianWidth.txt | ucdmerge > ..\unidata\EastAsianWidth.txt |
michael@0 | 1480 | ucdstrip < 5.0.0\ucd\LineBreak.txt | ucdmerge > ..\unidata\LineBreak.txt |
michael@0 | 1481 | |
michael@0 | 1482 | * update FractionalUCA.txt and UCARules.txt with new canonical closure |
michael@0 | 1483 | |
michael@0 | 1484 | * genpname |
michael@0 | 1485 | - run preparse.pl |
michael@0 | 1486 | + make sure that data.h is writable |
michael@0 | 1487 | + perl preparse.pl \cvs\oss\icu > out.txt |
michael@0 | 1488 | |
michael@0 | 1489 | * uchar.h & uscript.h & uprops.h & uprops.c & genprops |
michael@0 | 1490 | - new block & script values |
michael@0 | 1491 | + script values already added in ICU 3.6 because all of ISO 15924 is now covered |
michael@0 | 1492 | |
michael@0 | 1493 | * build Unicode data source code for hardcoding core data |
michael@0 | 1494 | C:\cvs\oss\icu\source\data>NMAKE /f makedata.mak ICUMAKE=\cvs\oss\icu\source\data\ CFG=debug uni-core-data |
michael@0 | 1495 | |
michael@0 | 1496 | ICU data make path is \cvs\oss\icu\source\data\ |
michael@0 | 1497 | ICU root path is \cvs\oss\icu |
michael@0 | 1498 | Information: cannot find "ucmlocal.mk". Not building user-additional converter files. |
michael@0 | 1499 | [etc.] |
michael@0 | 1500 | Creating data file for Unicode Character Properties |
michael@0 | 1501 | Creating data file for Unicode Case Mapping Properties |
michael@0 | 1502 | Creating data file for Unicode BiDi/Shaping Properties |
michael@0 | 1503 | Creating data file for Unicode Normalization |
michael@0 | 1504 | Unicode .icu files built to "\cvs\oss\icu\source\data\out\build\icudt35l" |
michael@0 | 1505 | Unicode .c source files built to "\cvs\oss\icu\source\data\out\tmp" |
michael@0 | 1506 | |
michael@0 | 1507 | - copy the .c source files to C:\cvs\oss\icu\source\common |
michael@0 | 1508 | and rebuild the common library |
michael@0 | 1509 | |
michael@0 | 1510 | *** Unicode version numbers |
michael@0 | 1511 | - makedata.mak |
michael@0 | 1512 | - uchar.h |
michael@0 | 1513 | - configure.in |
michael@0 | 1514 | |
michael@0 | 1515 | *** LayoutEngine script information |
michael@0 | 1516 | * Run ICU4J com.ibm.icu.dev.tool.layout.ScriptNameBuilder. This generates LEScripts.h, LELanguage.h, |
michael@0 | 1517 | ScriptAndLanguageTags.h and ScriptAndLanguageTags.cpp in the working directory. (It also generates |
michael@0 | 1518 | ScriptRunData.cpp, which is no longer needed.) |
michael@0 | 1519 | |
michael@0 | 1520 | The generated files have a current copyright date and "@draft" statement. |
michael@0 | 1521 | |
michael@0 | 1522 | * copy the above files into <icu>/source/layout, replacing the old files. |
michael@0 | 1523 | |
michael@0 | 1524 | Add new default entries to the indicClassTables array in <icu>/source/layout/IndicClassTables.cpp |
michael@0 | 1525 | and the complexTable array in <icu>/source/layoutex/ParagraphLayout.cpp. (This step should be automated...) |
michael@0 | 1526 | |
michael@0 | 1527 | * rebuild the layout and layoutex libraries. |
michael@0 | 1528 | |
michael@0 | 1529 | ---------------------------------------------------------------------------- *** |
michael@0 | 1530 | |
michael@0 | 1531 | Unicode 4.1 update |
michael@0 | 1532 | |
michael@0 | 1533 | *** related Jitterbugs |
michael@0 | 1534 | |
michael@0 | 1535 | 4332 RFE: Update to Unicode 4.1 |
michael@0 | 1536 | 4157 RBBI, TR29 4.1 updates |
michael@0 | 1537 | |
michael@0 | 1538 | *** data files & enums & parser code |
michael@0 | 1539 | |
michael@0 | 1540 | * file preparation |
michael@0 | 1541 | - ucdstrip: |
michael@0 | 1542 | DerivedCoreProperties.txt |
michael@0 | 1543 | DerivedNormalizationProps.txt |
michael@0 | 1544 | NormalizationTest.txt |
michael@0 | 1545 | GraphemeBreakProperty.txt |
michael@0 | 1546 | SentenceBreakProperty.txt |
michael@0 | 1547 | WordBreakProperty.txt |
michael@0 | 1548 | - ucdstrip and ucdmerge: |
michael@0 | 1549 | EastAsianWidth.txt |
michael@0 | 1550 | LineBreak.txt |
michael@0 | 1551 | |
michael@0 | 1552 | * add new files to the repository |
michael@0 | 1553 | GraphemeBreakProperty.txt |
michael@0 | 1554 | SentenceBreakProperty.txt |
michael@0 | 1555 | WordBreakProperty.txt |
michael@0 | 1556 | |
michael@0 | 1557 | * update FractionalUCA.txt and UCARules.txt with new canonical closure |
michael@0 | 1558 | |
michael@0 | 1559 | * genpname |
michael@0 | 1560 | - handle new enumerated properties in sub read_uchar |
michael@0 | 1561 | - run preparse.pl |
michael@0 | 1562 | |
michael@0 | 1563 | * uchar.h & uscript.h & uprops.h & uprops.c & genprops |
michael@0 | 1564 | - new binary properties |
michael@0 | 1565 | + Pattern_Syntax |
michael@0 | 1566 | + Pattern_White_Space |
michael@0 | 1567 | - new enumerated properties |
michael@0 | 1568 | + Grapheme_Cluster_Break |
michael@0 | 1569 | + Sentence_Break |
michael@0 | 1570 | + Word_Break |
michael@0 | 1571 | - new block & script & line break values |
michael@0 | 1572 | |
michael@0 | 1573 | * gencase |
michael@0 | 1574 | - case-ignorable changes |
michael@0 | 1575 | see http://www.unicode.org/versions/Unicode4.1.0/#CaseMods |
michael@0 | 1576 | now: (D47a) case-ignorable means Word_Break=MidLetter or General_Category is one of Mn, Me, Cf, Lm, Sk |
michael@0 | 1577 | |
michael@0 | 1578 | *** Unicode version numbers |
michael@0 | 1579 | - makedata.mak |
michael@0 | 1580 | - uchar.h |
michael@0 | 1581 | - configure.in |
michael@0 | 1582 | |
michael@0 | 1583 | *** tests |
michael@0 | 1584 | - verify that u_charMirror() round-trips |
michael@0 | 1585 | - test all new properties and some new values of old properties |
michael@0 | 1586 | |
michael@0 | 1587 | *** other code |
michael@0 | 1588 | |
michael@0 | 1589 | * hardcoded Unihan range end/limit |
michael@0 | 1590 | - Unihan range end moves from 9FA5 to 9FBB |
michael@0 | 1591 | search for both 9FA5 (end) and 9FA6 (limit) (regex 9FA[56], case-insensitive) |
michael@0 | 1592 | + do not modify BOCU/BOCSU code because that would change the encoding |
michael@0 | 1593 | and break binary compatibility! |
michael@0 | 1594 | + similarly, do not change the GB 18030 range data (ucnvmbcs.c), |
michael@0 | 1595 | NamePrepProfile.txt |
michael@0 | 1596 | + ignore trietest.c: test data is arbitrary |
michael@0 | 1597 | + ignore tstnorm.cpp: test optimization, not important |
michael@0 | 1598 | + ignore collation: 9FA[56] only appears in comments; swapCJK() uses the whole block up to 9FFF |
michael@0 | 1599 | + do change line_th.txt and word_th.txt |
michael@0 | 1600 | by replacing hardcoded ranges with the new property values |
michael@0 | 1601 | + do change gennames.c |
michael@0 | 1602 | |
michael@0 | 1603 | source\data\brkitr\line_th.txt(229): \u33E0-\u33FE \u3400-\u4DB5 \u4E00-\u9FA5 \uA000-\uA48C \uA490-\uA4C6 |
michael@0 | 1604 | source\data\brkitr\word_th.txt(23): \u33E0-\u33FE \u3400-\u4DB5 \u4E00-\u9FA5 \uA000-\uA48C \uA490-\uA4C6 |
michael@0 | 1605 | source\tools\gennames\gennames.c(971): 0x4e00, 0x9fa5, |
michael@0 | 1606 | |
michael@0 | 1607 | * case mappings |
michael@0 | 1608 | - compare new special casing context conditions with previous ones |
michael@0 | 1609 | see http://www.unicode.org/versions/Unicode4.1.0/#CaseMods |
michael@0 | 1610 | |
michael@0 | 1611 | * genpname |
michael@0 | 1612 | - consider storing only the short name if it is the same as the long name |
michael@0 | 1613 | |
michael@0 | 1614 | *** other reviews |
michael@0 | 1615 | - UAX #29 changes (grapheme/word/sentence breaks) |
michael@0 | 1616 | - UAX #14 changes (line breaks) |
michael@0 | 1617 | - Pattern_Syntax & Pattern_White_Space |
michael@0 | 1618 | |
michael@0 | 1619 | ---------------------------------------------------------------------------- *** |
michael@0 | 1620 | |
michael@0 | 1621 | Unicode 4.0.1 update |
michael@0 | 1622 | |
michael@0 | 1623 | *** related Jitterbugs |
michael@0 | 1624 | |
michael@0 | 1625 | 3170 RFE: Update to Unicode 4.0.1 |
michael@0 | 1626 | 3171 Add new Unicode 4.0.1 properties |
michael@0 | 1627 | 3520 use Unicode 4.0.1 updates for break iteration |
michael@0 | 1628 | |
michael@0 | 1629 | *** data files & enums & parser code |
michael@0 | 1630 | |
michael@0 | 1631 | * file preparation |
michael@0 | 1632 | - ucdstrip: DerivedNormalizationProps.txt, NormalizationTest.txt, DerivedCoreProperties.txt |
michael@0 | 1633 | - ucdstrip and ucdmerge: EastAsianWidth.txt, LineBreak.txt |
michael@0 | 1634 | |
michael@0 | 1635 | * file fixes |
michael@0 | 1636 | - fix UnicodeData.txt general categories of Ethiopic digits Nd->No |
michael@0 | 1637 | according to PRI #26 |
michael@0 | 1638 | http://www.unicode.org/review/resolved-pri.html#pri26 |
michael@0 | 1639 | - undone again because no corrigendum in sight; |
michael@0 | 1640 | instead modified tests to not check consistency on this for Unicode 4.0.1 |
michael@0 | 1641 | |
michael@0 | 1642 | * ucdterms.txt |
michael@0 | 1643 | - update from http://www.unicode.org/copyright.html |
michael@0 | 1644 | formatted for plain text |
michael@0 | 1645 | |
michael@0 | 1646 | * uchar.h & uprops.h & uprops.c & genprops |
michael@0 | 1647 | - add UBLOCK_CYRILLIC_SUPPLEMENT because the block is renamed |
michael@0 | 1648 | - add U_LB_INSEPARABLE due to a spelling fix |
michael@0 | 1649 | + put short name comment only on line with new constant |
michael@0 | 1650 | for genpname perl script parser |
michael@0 | 1651 | - new binary properties |
michael@0 | 1652 | + STerm |
michael@0 | 1653 | + Variation_Selector |
michael@0 | 1654 | |
michael@0 | 1655 | * genpname |
michael@0 | 1656 | - fix genpname perl script so that it doesn't choke on more than 2 names per property value |
michael@0 | 1657 | - perl script: correctly calculate the maximum number of fields per row |
michael@0 | 1658 | |
michael@0 | 1659 | * uscript.h |
michael@0 | 1660 | - new script code Hrkt=Katakana_Or_Hiragana |
michael@0 | 1661 | |
michael@0 | 1662 | * gennorm.c: track changes in DerivedNormalizationProps.txt |
michael@0 | 1663 | - "FNC" -> "FC_NFKC" |
michael@0 | 1664 | - single field "NFD_NO" -> two fields "NFD_QC; N" etc. |
michael@0 | 1665 | |
michael@0 | 1666 | * genprops/props2.c: track changes in DerivedNumericValues.txt |
michael@0 | 1667 | - changed from 3 columns to 2, dropping the numeric type |
michael@0 | 1668 | + assume that the type is always numeric for Han characters, |
michael@0 | 1669 | and that only those are added in addition to what UnicodeData.txt lists |
michael@0 | 1670 | |
michael@0 | 1671 | *** Unicode version numbers |
michael@0 | 1672 | - makedata.mak |
michael@0 | 1673 | - uchar.h |
michael@0 | 1674 | - configure.in |
michael@0 | 1675 | |
michael@0 | 1676 | *** tests |
michael@0 | 1677 | - update test of default bidi classes according to PRI #28 |
michael@0 | 1678 | /tsutil/cucdtst/TestUnicodeData |
michael@0 | 1679 | http://www.unicode.org/review/resolved-pri.html#pri28 |
michael@0 | 1680 | - bidi tests: change exemplar character for ES depending on Unicode version |
michael@0 | 1681 | - change hardcoded expected property values where they change |
michael@0 | 1682 | |
michael@0 | 1683 | *** other code |
michael@0 | 1684 | |
michael@0 | 1685 | * name matching |
michael@0 | 1686 | - read UCD.html |
michael@0 | 1687 | |
michael@0 | 1688 | * scripts |
michael@0 | 1689 | - use new Hrkt=Katakana_Or_Hiragana |
michael@0 | 1690 | |
michael@0 | 1691 | * ZWJ & ZWNJ |
michael@0 | 1692 | - are now part of combining character sequences |
michael@0 | 1693 | - break iteration used to assume that LB classes did not overlap; now they do for ZWJ & ZWNJ |