unicode: upgrade to 8.0.0

Not sure if I'm on time for 1.5; Unicode 8 just got released.

Straightforward upgrade. Only changed maketables.go to prevent it from adding
the Cherokee upper and lower case mappings. This change causes the caseOrbit
table to NOT change. Added tests to verify that the relevant functions still
produce the correct result, even for Cherokee.

Fixes #11309

Change-Id: I42850f5b3399bde125b002efc78eff96dbd86a08
Reviewed-on: https://go-review.googlesource.com/11286
Reviewed-by: Russ Cox <rsc@golang.org>
This commit is contained in:
Marcel van Lohuizen 2015-06-21 20:21:04 +02:00 committed by Russ Cox
parent 834fef80ae
commit fe15da62f7
7 changed files with 437 additions and 215 deletions

View file

@ -328,3 +328,4 @@ pkg syscall (netbsd-arm), type IfMsghdr struct, Pad_cgo_1 [4]uint8
pkg syscall (netbsd-arm-cgo), const SizeofIfData = 132
pkg syscall (netbsd-arm-cgo), type IfMsghdr struct, Pad_cgo_1 [4]uint8
pkg unicode, const Version = "6.3.0"
pkg unicode, const Version = "7.0.0"

View file

@ -948,3 +948,10 @@ pkg syscall (openbsd-amd64-cgo), type SysProcAttr struct, Pgid int
pkg text/template, method (*Template) DefinedTemplates() string
pkg text/template, method (*Template) Option(...string) *Template
pkg time, method (Time) AppendFormat([]uint8, string) []uint8
pkg unicode, const Version = "8.0.0"
pkg unicode, var Ahom *RangeTable
pkg unicode, var Anatolian_Hieroglyphs *RangeTable
pkg unicode, var Hatran *RangeTable
pkg unicode, var Multani *RangeTable
pkg unicode, var Old_Hungarian *RangeTable
pkg unicode, var SignWriting *RangeTable

View file

@ -7,7 +7,7 @@
package strconv
// (468+138+67)*2 + (326)*4 = 2650 bytes
// (470+136+73)*2 + (342)*4 = 2726 bytes
var isPrint16 = []uint16{
0x0020, 0x007e,
@ -26,8 +26,8 @@ var isPrint16 = []uint16{
0x0800, 0x082d,
0x0830, 0x085b,
0x085e, 0x085e,
0x08a0, 0x08b2,
0x08e4, 0x098c,
0x08a0, 0x08b4,
0x08e3, 0x098c,
0x098f, 0x0990,
0x0993, 0x09b2,
0x09b6, 0x09b9,
@ -51,6 +51,7 @@ var isPrint16 = []uint16{
0x0ad0, 0x0ad0,
0x0ae0, 0x0ae3,
0x0ae6, 0x0af1,
0x0af9, 0x0af9,
0x0b01, 0x0b0c,
0x0b0f, 0x0b10,
0x0b13, 0x0b39,
@ -73,7 +74,7 @@ var isPrint16 = []uint16{
0x0be6, 0x0bfa,
0x0c00, 0x0c39,
0x0c3d, 0x0c4d,
0x0c55, 0x0c59,
0x0c55, 0x0c5a,
0x0c60, 0x0c63,
0x0c66, 0x0c6f,
0x0c78, 0x0cb9,
@ -84,7 +85,7 @@ var isPrint16 = []uint16{
0x0d01, 0x0d3a,
0x0d3d, 0x0d4e,
0x0d57, 0x0d57,
0x0d60, 0x0d63,
0x0d5f, 0x0d63,
0x0d66, 0x0d75,
0x0d79, 0x0d7f,
0x0d82, 0x0d96,
@ -117,7 +118,8 @@ var isPrint16 = []uint16{
0x1318, 0x135a,
0x135d, 0x137c,
0x1380, 0x1399,
0x13a0, 0x13f4,
0x13a0, 0x13f5,
0x13f8, 0x13fd,
0x1400, 0x169c,
0x16a0, 0x16f8,
0x1700, 0x1714,
@ -167,9 +169,9 @@ var isPrint16 = []uint16{
0x2030, 0x205e,
0x2070, 0x2071,
0x2074, 0x209c,
0x20a0, 0x20bd,
0x20a0, 0x20be,
0x20d0, 0x20f0,
0x2100, 0x2189,
0x2100, 0x218b,
0x2190, 0x23fa,
0x2400, 0x2426,
0x2440, 0x244a,
@ -177,6 +179,7 @@ var isPrint16 = []uint16{
0x2b76, 0x2b95,
0x2b98, 0x2bb9,
0x2bbd, 0x2bd1,
0x2bec, 0x2bef,
0x2c00, 0x2cf3,
0x2cf9, 0x2d27,
0x2d2d, 0x2d2d,
@ -193,19 +196,19 @@ var isPrint16 = []uint16{
0x3131, 0x31ba,
0x31c0, 0x31e3,
0x31f0, 0x4db5,
0x4dc0, 0x9fcc,
0x4dc0, 0x9fd5,
0xa000, 0xa48c,
0xa490, 0xa4c6,
0xa4d0, 0xa62b,
0xa640, 0xa6f7,
0xa700, 0xa7ad,
0xa7b0, 0xa7b1,
0xa7b0, 0xa7b7,
0xa7f7, 0xa82b,
0xa830, 0xa839,
0xa840, 0xa877,
0xa880, 0xa8c4,
0xa8ce, 0xa8d9,
0xa8e0, 0xa8fb,
0xa8e0, 0xa8fd,
0xa900, 0xa953,
0xa95f, 0xa97c,
0xa980, 0xa9d9,
@ -217,9 +220,8 @@ var isPrint16 = []uint16{
0xab01, 0xab06,
0xab09, 0xab0e,
0xab11, 0xab16,
0xab20, 0xab5f,
0xab64, 0xab65,
0xabc0, 0xabed,
0xab20, 0xab65,
0xab70, 0xabed,
0xabf0, 0xabf9,
0xac00, 0xd7a3,
0xd7b0, 0xd7c6,
@ -234,8 +236,7 @@ var isPrint16 = []uint16{
0xfd92, 0xfdc7,
0xfdf0, 0xfdfd,
0xfe00, 0xfe19,
0xfe20, 0xfe2d,
0xfe30, 0xfe6b,
0xfe20, 0xfe6b,
0xfe70, 0xfefc,
0xff01, 0xffbe,
0xffc2, 0xffc7,
@ -370,8 +371,6 @@ var isNotPrint16 = []uint16{
0x318f,
0x321f,
0x32ff,
0xa69e,
0xa78f,
0xa9ce,
0xa9ff,
0xab27,
@ -418,12 +417,13 @@ var isPrint32 = []uint32{
0x01083c, 0x01083c,
0x01083f, 0x01089e,
0x0108a7, 0x0108af,
0x010900, 0x01091b,
0x0108e0, 0x0108f5,
0x0108fb, 0x01091b,
0x01091f, 0x010939,
0x01093f, 0x01093f,
0x010980, 0x0109b7,
0x0109be, 0x0109bf,
0x010a00, 0x010a06,
0x0109bc, 0x0109cf,
0x0109d2, 0x010a06,
0x010a0c, 0x010a33,
0x010a38, 0x010a3a,
0x010a3f, 0x010a47,
@ -438,6 +438,9 @@ var isPrint32 = []uint32{
0x010b99, 0x010b9c,
0x010ba9, 0x010baf,
0x010c00, 0x010c48,
0x010c80, 0x010cb2,
0x010cc0, 0x010cf2,
0x010cfa, 0x010cff,
0x010e60, 0x010e7e,
0x011000, 0x01104d,
0x011052, 0x01106f,
@ -446,19 +449,19 @@ var isPrint32 = []uint32{
0x0110f0, 0x0110f9,
0x011100, 0x011143,
0x011150, 0x011176,
0x011180, 0x0111c8,
0x0111cd, 0x0111cd,
0x0111d0, 0x0111da,
0x0111e1, 0x0111f4,
0x011180, 0x0111cd,
0x0111d0, 0x0111f4,
0x011200, 0x01123d,
0x011280, 0x0112a9,
0x0112b0, 0x0112ea,
0x0112f0, 0x0112f9,
0x011301, 0x01130c,
0x011300, 0x01130c,
0x01130f, 0x011310,
0x011313, 0x011339,
0x01133c, 0x011344,
0x011347, 0x011348,
0x01134b, 0x01134d,
0x011350, 0x011350,
0x011357, 0x011357,
0x01135d, 0x011363,
0x011366, 0x01136c,
@ -466,17 +469,22 @@ var isPrint32 = []uint32{
0x011480, 0x0114c7,
0x0114d0, 0x0114d9,
0x011580, 0x0115b5,
0x0115b8, 0x0115c9,
0x0115b8, 0x0115dd,
0x011600, 0x011644,
0x011650, 0x011659,
0x011680, 0x0116b7,
0x0116c0, 0x0116c9,
0x011700, 0x011719,
0x01171d, 0x01172b,
0x011730, 0x01173f,
0x0118a0, 0x0118f2,
0x0118ff, 0x0118ff,
0x011ac0, 0x011af8,
0x012000, 0x012398,
0x012000, 0x012399,
0x012400, 0x012474,
0x012480, 0x012543,
0x013000, 0x01342e,
0x014400, 0x014646,
0x016800, 0x016a38,
0x016a40, 0x016a69,
0x016a6e, 0x016a6f,
@ -497,7 +505,7 @@ var isPrint32 = []uint32{
0x01d000, 0x01d0f5,
0x01d100, 0x01d126,
0x01d129, 0x01d172,
0x01d17b, 0x01d1dd,
0x01d17b, 0x01d1e8,
0x01d200, 0x01d245,
0x01d300, 0x01d356,
0x01d360, 0x01d371,
@ -508,7 +516,8 @@ var isPrint32 = []uint32{
0x01d50d, 0x01d546,
0x01d54a, 0x01d6a5,
0x01d6a8, 0x01d7cb,
0x01d7ce, 0x01d7ff,
0x01d7ce, 0x01da8b,
0x01da9b, 0x01daaf,
0x01e800, 0x01e8c4,
0x01e8c7, 0x01e8d6,
0x01ee00, 0x01ee24,
@ -530,13 +539,7 @@ var isPrint32 = []uint32{
0x01f210, 0x01f23a,
0x01f240, 0x01f248,
0x01f250, 0x01f251,
0x01f300, 0x01f32c,
0x01f330, 0x01f37d,
0x01f380, 0x01f3ce,
0x01f3d4, 0x01f3f7,
0x01f400, 0x01f54a,
0x01f550, 0x01f642,
0x01f645, 0x01f6cf,
0x01f300, 0x01f6d0,
0x01f6e0, 0x01f6ec,
0x01f6f0, 0x01f6f3,
0x01f700, 0x01f773,
@ -546,9 +549,13 @@ var isPrint32 = []uint32{
0x01f850, 0x01f859,
0x01f860, 0x01f887,
0x01f890, 0x01f8ad,
0x01f910, 0x01f918,
0x01f980, 0x01f984,
0x01f9c0, 0x01f9c0,
0x020000, 0x02a6d6,
0x02a700, 0x02b734,
0x02b740, 0x02b81d,
0x02b820, 0x02cea1,
0x02f800, 0x02fa1d,
0x0e0100, 0x0e01ef,
}
@ -562,12 +569,18 @@ var isNotPrint32 = []uint16{ // add 0x10000 to each entry
0x0809,
0x0836,
0x0856,
0x08f3,
0x0a04,
0x0a14,
0x0a18,
0x10bd,
0x1135,
0x11e0,
0x1212,
0x1287,
0x1289,
0x128e,
0x129e,
0x1304,
0x1329,
0x1331,
@ -589,6 +602,7 @@ var isNotPrint32 = []uint16{ // add 0x10000 to each entry
0xd53f,
0xd545,
0xd551,
0xdaa0,
0xee04,
0xee20,
0xee23,
@ -618,7 +632,6 @@ var isNotPrint32 = []uint16{ // add 0x10000 to each entry
0xf0c0,
0xf0d0,
0xf12f,
0xf4ff,
0xf57a,
0xf5a4,
}

View file

@ -24,6 +24,7 @@ var upperTest = []rune{
0x181,
0x376,
0x3cf,
0x13bd,
0x1f2a,
0x2102,
0x2c00,
@ -46,6 +47,7 @@ var notupperTest = []rune{
0x377,
0x387,
0x2150,
0xab7d,
0xffff,
0x10000,
}
@ -194,6 +196,15 @@ var caseTest = []caseT{
{LowerCase, 0x0148, 0x0148},
{TitleCase, 0x0148, 0x0147},
// Lowercase lower than uppercase.
// AB78;CHEROKEE SMALL LETTER GE;Ll;0;L;;;;;N;;;13A8;;13A8
{UpperCase, 0xab78, 0x13a8},
{LowerCase, 0xab78, 0xab78},
{TitleCase, 0xab78, 0x13a8},
{UpperCase, 0x13a8, 0x13a8},
{LowerCase, 0x13a8, 0xab78},
{TitleCase, 0x13a8, 0x13a8},
// Last block in the 5.1.0 table
// 10400;DESERET CAPITAL LETTER LONG I;Lu;0;L;;;;;N;;;;10428;
{UpperCase, 0x10400, 0x10400},
@ -405,6 +416,9 @@ var simpleFoldTests = []string{
// Extra special cases: has lower/upper but no case fold.
"İ",
"ı",
// Upper comes before lower (Cherokee).
"\u13b0\uab80",
}
func TestSimpleFold(t *testing.T) {

View file

@ -44,7 +44,7 @@ func main() {
var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")
var url = flag.String("url",
"http://www.unicode.org/Public/7.0.0/ucd/",
"http://www.unicode.org/Public/8.0.0/ucd/",
"URL of Unicode database directory")
var tablelist = flag.String("tables",
"all",
@ -1152,11 +1152,14 @@ func printCasefold() {
}
}
// Delete the groups for which assuming [lower, upper] is right.
// Delete the groups for which assuming [lower, upper] or [upper, lower] is right.
for i, orb := range caseOrbit {
if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
caseOrbit[i] = nil
}
if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] {
caseOrbit[i] = nil
}
}
// Record orbit information in chars.

View file

@ -14,9 +14,11 @@ type T struct {
script string
}
// Hand-chosen tests from Unicode 5.1.0, 6.0.0, 6.2.0, 6.3.0 and 7.0.0 mostly to
// discover when new scripts and categories arise.
// Hand-chosen tests from Unicode 5.1.0, 6.0.0, 6.2.0, 6.3.0, 7.0.0 and 8.0.0
// mostly to discover when new scripts and categories arise.
var inTest = []T{
{0x11711, "Ahom"},
{0x14646, "Anatolian_Hieroglyphs"},
{0x06e2, "Arabic"},
{0x0567, "Armenian"},
{0x10b20, "Avestan"},
@ -58,6 +60,7 @@ var inTest = []T{
{0x3028, "Han"},
{0x11b8, "Hangul"},
{0x1727, "Hanunoo"},
{0x108FF, "Hatran"},
{0x05a0, "Hebrew"},
{0x3058, "Hiragana"},
{0x10841, "Imperial_Aramaic"},
@ -94,12 +97,14 @@ var inTest = []T{
{0x11611, "Modi"},
{0x1822, "Mongolian"},
{0x16a60, "Mro"},
{0x11293, "Multani"},
{0x104c, "Myanmar"},
{0x10880, "Nabataean"},
{0x19c3, "New_Tai_Lue"},
{0x07f8, "Nko"},
{0x169b, "Ogham"},
{0x1c6a, "Ol_Chiki"},
{0x10C80, "Old_Hungarian"},
{0x10310, "Old_Italic"},
{0x10a80, "Old_North_Arabian"},
{0x10350, "Old_Permic"},
@ -121,6 +126,7 @@ var inTest = []T{
{0x111a0, "Sharada"},
{0x10463, "Shavian"},
{0x115c1, "Siddham"},
{0x1D920, "SignWriting"},
{0x0dbd, "Sinhala"},
{0x110d0, "Sora_Sompeng"},
{0x1ba3, "Sundanese"},

File diff suppressed because it is too large Load diff