// Copyright 2012 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package build import ( "fmt" "unicode" "golang.org/x/text/internal/colltab" ) const ( defaultSecondary = 0x20 defaultTertiary = 0x2 maxTertiary = 0x1F ) type rawCE struct { w []int ccc uint8 } func makeRawCE(w []int, ccc uint8) rawCE { ce := rawCE{w: make([]int, 4), ccc: ccc} copy(ce.w, w) return ce } // A collation element is represented as an uint32. // In the typical case, a rune maps to a single collation element. If a rune // can be the start of a contraction or expands into multiple collation elements, // then the collation element that is associated with a rune will have a special // form to represent such m to n mappings. Such special collation elements // have a value >= 0x80000000. const ( maxPrimaryBits = 21 maxSecondaryBits = 12 maxTertiaryBits = 8 ) func makeCE(ce rawCE) (uint32, error) { v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc) return uint32(v), e } // For contractions, collation elements are of the form // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where // - n* is the size of the first node in the contraction trie. // - i* is the index of the first node in the contraction trie. // - b* is the offset into the contraction collation element table. // See contract.go for details on the contraction trie. const ( contractID = 0xC0000000 maxNBits = 4 maxTrieIndexBits = 12 maxContractOffsetBits = 13 ) func makeContractIndex(h ctHandle, offset int) (uint32, error) { if h.n >= 1<<maxNBits { return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits) } if h.index >= 1<<maxTrieIndexBits { return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits) } if offset >= 1<<maxContractOffsetBits { return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits) } ce := uint32(contractID) ce += uint32(offset << (maxNBits + maxTrieIndexBits)) ce += uint32(h.index << maxNBits) ce += uint32(h.n) return ce, nil } // For expansions, collation elements are of the form // 11100000 00000000 bbbbbbbb bbbbbbbb, // where b* is the index into the expansion sequence table. const ( expandID = 0xE0000000 maxExpandIndexBits = 16 ) func makeExpandIndex(index int) (uint32, error) { if index >= 1<<maxExpandIndexBits { return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits) } return expandID + uint32(index), nil } // Each list of collation elements corresponding to an expansion starts with // a header indicating the length of the sequence. func makeExpansionHeader(n int) (uint32, error) { return uint32(n), nil } // Some runes can be expanded using NFKD decomposition. Instead of storing the full // sequence of collation elements, we decompose the rune and lookup the collation // elements for each rune in the decomposition and modify the tertiary weights. // The collation element, in this case, is of the form // 11110000 00000000 wwwwwwww vvvvvvvv, where // - v* is the replacement tertiary weight for the first rune, // - w* is the replacement tertiary weight for the second rune, // Tertiary weights of subsequent runes should be replaced with maxTertiary. // See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details. const ( decompID = 0xF0000000 ) func makeDecompose(t1, t2 int) (uint32, error) { if t1 >= 256 || t1 < 0 { return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1) } if t2 >= 256 || t2 < 0 { return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2) } return uint32(t2<<8+t1) + decompID, nil } const ( // These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. minUnified rune = 0x4E00 maxUnified = 0x9FFF minCompatibility = 0xF900 maxCompatibility = 0xFAFF minRare = 0x3400 maxRare = 0x4DBF ) const ( commonUnifiedOffset = 0x10000 rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF otherOffset = 0x50000 // largest rune in rare is U+2FA1D illegalOffset = otherOffset + int(unicode.MaxRune) maxPrimary = illegalOffset + 1 ) // implicitPrimary returns the primary weight for the a rune // for which there is no entry for the rune in the collation table. // We take a different approach from the one specified in // https://unicode.org/reports/tr10/#Implicit_Weights, // but preserve the resulting relative ordering of the runes. func implicitPrimary(r rune) int { if unicode.Is(unicode.Ideographic, r) { if r >= minUnified && r <= maxUnified { // The most common case for CJK. return int(r) + commonUnifiedOffset } if r >= minCompatibility && r <= maxCompatibility { // This will typically not hit. The DUCET explicitly specifies mappings // for all characters that do not decompose. return int(r) + commonUnifiedOffset } return int(r) + rareUnifiedOffset } return int(r) + otherOffset } // convertLargeWeights converts collation elements with large // primaries (either double primaries or for illegal runes) // to our own representation. // A CJK character C is represented in the DUCET as // [.FBxx.0020.0002.C][.BBBB.0000.0000.C] // We will rewrite these characters to a single CE. // We assume the CJK values start at 0x8000. // See https://unicode.org/reports/tr10/#Implicit_Weights func convertLargeWeights(elems []rawCE) (res []rawCE, err error) { const ( cjkPrimaryStart = 0xFB40 rarePrimaryStart = 0xFB80 otherPrimaryStart = 0xFBC0 illegalPrimary = 0xFFFE highBitsMask = 0x3F lowBitsMask = 0x7FFF lowBitsFlag = 0x8000 shiftBits = 15 ) for i := 0; i < len(elems); i++ { ce := elems[i].w p := ce[0] if p < cjkPrimaryStart { continue } if p > 0xFFFF { return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p) } if p >= illegalPrimary { ce[0] = illegalOffset + p - illegalPrimary } else { if i+1 >= len(elems) { return elems, fmt.Errorf("second part of double primary weight missing: %v", elems) } if elems[i+1].w[0]&lowBitsFlag == 0 { return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems) } np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask switch { case p < rarePrimaryStart: np += commonUnifiedOffset case p < otherPrimaryStart: np += rareUnifiedOffset default: p += otherOffset } ce[0] = np for j := i + 1; j+1 < len(elems); j++ { elems[j] = elems[j+1] } elems = elems[:len(elems)-1] } } return elems, nil } // nextWeight computes the first possible collation weights following elems // for the given level. func nextWeight(level colltab.Level, elems []rawCE) []rawCE { if level == colltab.Identity { next := make([]rawCE, len(elems)) copy(next, elems) return next } next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)} next[0].w[level]++ if level < colltab.Secondary { next[0].w[colltab.Secondary] = defaultSecondary } if level < colltab.Tertiary { next[0].w[colltab.Tertiary] = defaultTertiary } // Filter entries that cannot influence ordering. for _, ce := range elems[1:] { skip := true for i := colltab.Primary; i < level; i++ { skip = skip && ce.w[i] == 0 } if !skip { next = append(next, ce) } } return next } func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) { for ; i < len(elems) && elems[i].w[level] == 0; i++ { } if i < len(elems) { return i, elems[i].w[level] } return i, 0 } // compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise. // It also returns the collation level at which the difference is found. func compareWeights(a, b []rawCE) (result int, level colltab.Level) { for level := colltab.Primary; level < colltab.Identity; level++ { var va, vb int for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 { ia, va = nextVal(a, ia, level) ib, vb = nextVal(b, ib, level) if va != vb { if va < vb { return -1, level } else { return 1, level } } } } return 0, colltab.Identity } func equalCE(a, b rawCE) bool { for i := 0; i < 3; i++ { if b.w[i] != a.w[i] { return false } } return true } func equalCEArrays(a, b []rawCE) bool { if len(a) != len(b) { return false } for i := range a { if !equalCE(a[i], b[i]) { return false } } return true }