// Copyright 2012 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package colltab import ( "fmt" "unicode" ) // Level identifies the collation comparison level. // The primary level corresponds to the basic sorting of text. // The secondary level corresponds to accents and related linguistic elements. // The tertiary level corresponds to casing and related concepts. // The quaternary level is derived from the other levels by the // various algorithms for handling variable elements. type Level int const ( Primary Level = iota Secondary Tertiary Quaternary Identity NumLevels ) const ( defaultSecondary = 0x20 defaultTertiary = 0x2 maxTertiary = 0x1F MaxQuaternary = 0x1FFFFF // 21 bits. ) // Elem is a representation of a collation element. This API provides ways to encode // and decode Elems. Implementations of collation tables may use values greater // or equal to PrivateUse for their own purposes. However, these should never be // returned by AppendNext. type Elem uint32 const ( maxCE Elem = 0xAFFFFFFF PrivateUse = minContract minContract = 0xC0000000 maxContract = 0xDFFFFFFF minExpand = 0xE0000000 maxExpand = 0xEFFFFFFF minDecomp = 0xF0000000 ) type ceType int const ( ceNormal ceType = iota // ceNormal includes implicits (ce == 0) ceContractionIndex // rune can be a start of a contraction ceExpansionIndex // rune expands into a sequence of collation elements ceDecompose // rune expands using NFKC decomposition ) func (ce Elem) ctype() ceType { if ce <= maxCE { return ceNormal } if ce <= maxContract { return ceContractionIndex } else { if ce <= maxExpand { return ceExpansionIndex } return ceDecompose } panic("should not reach here") return ceType(-1) } // For normal collation elements, we assume that a collation element either has // a primary or non-default secondary value, not both. // Collation elements with a primary value are of the form // 01pppppp pppppppp ppppppp0 ssssssss // - p* is primary collation value // - s* is the secondary collation value // 00pppppp pppppppp ppppppps sssttttt, where // - p* is primary collation value // - s* offset of secondary from default value. // - t* is the tertiary collation value // 100ttttt cccccccc pppppppp pppppppp // - t* is the tertiar collation value // - c* is the canonical combining class // - p* is the primary collation value // Collation elements with a secondary value are of the form // 1010cccc ccccssss ssssssss tttttttt, where // - c* is the canonical combining class // - s* is the secondary collation value // - t* is the tertiary collation value // 11qqqqqq qqqqqqqq qqqqqqq0 00000000 // - q* quaternary value const ( ceTypeMask = 0xC0000000 ceTypeMaskExt = 0xE0000000 ceIgnoreMask = 0xF00FFFFF ceType1 = 0x40000000 ceType2 = 0x00000000 ceType3or4 = 0x80000000 ceType4 = 0xA0000000 ceTypeQ = 0xC0000000 Ignore = ceType4 firstNonPrimary = 0x80000000 lastSpecialPrimary = 0xA0000000 secondaryMask = 0x80000000 hasTertiaryMask = 0x40000000 primaryValueMask = 0x3FFFFE00 maxPrimaryBits = 21 compactPrimaryBits = 16 maxSecondaryBits = 12 maxTertiaryBits = 8 maxCCCBits = 8 maxSecondaryCompactBits = 8 maxSecondaryDiffBits = 4 maxTertiaryCompactBits = 5 primaryShift = 9 compactSecondaryShift = 5 minCompactSecondary = defaultSecondary - 4 ) func makeImplicitCE(primary int) Elem { return ceType1 | Elem(primary<<primaryShift) | defaultSecondary } // MakeElem returns an Elem for the given values. It will return an error // if the given combination of values is invalid. func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) { if w := primary; w >= 1<<maxPrimaryBits || w < 0 { return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits) } if w := secondary; w >= 1<<maxSecondaryBits || w < 0 { return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits) } if w := tertiary; w >= 1<<maxTertiaryBits || w < 0 { return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w, 1<<maxTertiaryBits) } ce := Elem(0) if primary != 0 { if ccc != 0 { if primary >= 1<<compactPrimaryBits { return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", primary, 1<<compactPrimaryBits) } if secondary != defaultSecondary { return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", secondary, ccc) } ce = Elem(tertiary << (compactPrimaryBits + maxCCCBits)) ce |= Elem(ccc) << compactPrimaryBits ce |= Elem(primary) ce |= ceType3or4 } else if tertiary == defaultTertiary { if secondary >= 1<<maxSecondaryCompactBits { return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", secondary, 1<<maxSecondaryCompactBits) } ce = Elem(primary<<(maxSecondaryCompactBits+1) + secondary) ce |= ceType1 } else { d := secondary - defaultSecondary + maxSecondaryDiffBits if d >= 1<<maxSecondaryDiffBits || d < 0 { return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits) } if tertiary >= 1<<maxTertiaryCompactBits { return 0, fmt.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x", tertiary, 1<<maxTertiaryCompactBits) } ce = Elem(primary<<maxSecondaryDiffBits + d) ce = ce<<maxTertiaryCompactBits + Elem(tertiary) } } else { ce = Elem(secondary<<maxTertiaryBits + tertiary) ce += Elem(ccc) << (maxSecondaryBits + maxTertiaryBits) ce |= ceType4 } return ce, nil } // MakeQuaternary returns an Elem with the given quaternary value. func MakeQuaternary(v int) Elem { return ceTypeQ | Elem(v<<primaryShift) } // Mask sets weights for any level smaller than l to 0. // The resulting Elem can be used to test for equality with // other Elems to which the same mask has been applied. func (ce Elem) Mask(l Level) uint32 { return 0 } // CCC returns the canonical combining class associated with the underlying character, // if applicable, or 0 otherwise. func (ce Elem) CCC() uint8 { if ce&ceType3or4 != 0 { if ce&ceType4 == ceType3or4 { return uint8(ce >> 16) } return uint8(ce >> 20) } return 0 } // Primary returns the primary collation weight for ce. func (ce Elem) Primary() int { if ce >= firstNonPrimary { if ce > lastSpecialPrimary { return 0 } return int(uint16(ce)) } return int(ce&primaryValueMask) >> primaryShift } // Secondary returns the secondary collation weight for ce. func (ce Elem) Secondary() int { switch ce & ceTypeMask { case ceType1: return int(uint8(ce)) case ceType2: return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF) case ceType3or4: if ce < ceType4 { return defaultSecondary } return int(ce>>8) & 0xFFF case ceTypeQ: return 0 } panic("should not reach here") } // Tertiary returns the tertiary collation weight for ce. func (ce Elem) Tertiary() uint8 { if ce&hasTertiaryMask == 0 { if ce&ceType3or4 == 0 { return uint8(ce & 0x1F) } if ce&ceType4 == ceType4 { return uint8(ce) } return uint8(ce>>24) & 0x1F // type 2 } else if ce&ceTypeMask == ceType1 { return defaultTertiary } // ce is a quaternary value. return 0 } func (ce Elem) updateTertiary(t uint8) Elem { if ce&ceTypeMask == ceType1 { // convert to type 4 nce := ce & primaryValueMask nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift ce = nce } else if ce&ceTypeMaskExt == ceType3or4 { ce &= ^Elem(maxTertiary << 24) return ce | (Elem(t) << 24) } else { // type 2 or 4 ce &= ^Elem(maxTertiary) } return ce | Elem(t) } // Quaternary returns the quaternary value if explicitly specified, // 0 if ce == Ignore, or MaxQuaternary otherwise. // Quaternary values are used only for shifted variants. func (ce Elem) Quaternary() int { if ce&ceTypeMask == ceTypeQ { return int(ce&primaryValueMask) >> primaryShift } else if ce&ceIgnoreMask == Ignore { return 0 } return MaxQuaternary } // Weight returns the collation weight for the given level. func (ce Elem) Weight(l Level) int { switch l { case Primary: return ce.Primary() case Secondary: return ce.Secondary() case Tertiary: return int(ce.Tertiary()) case Quaternary: return ce.Quaternary() } return 0 // return 0 (ignore) for undefined levels. } // For contractions, collation elements are of the form // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where // - n* is the size of the first node in the contraction trie. // - i* is the index of the first node in the contraction trie. // - b* is the offset into the contraction collation element table. // See contract.go for details on the contraction trie. const ( maxNBits = 4 maxTrieIndexBits = 12 maxContractOffsetBits = 13 ) func splitContractIndex(ce Elem) (index, n, offset int) { n = int(ce & (1<<maxNBits - 1)) ce >>= maxNBits index = int(ce & (1<<maxTrieIndexBits - 1)) ce >>= maxTrieIndexBits offset = int(ce & (1<<maxContractOffsetBits - 1)) return } // For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb, // where b* is the index into the expansion sequence table. const maxExpandIndexBits = 16 func splitExpandIndex(ce Elem) (index int) { return int(uint16(ce)) } // Some runes can be expanded using NFKD decomposition. Instead of storing the full // sequence of collation elements, we decompose the rune and lookup the collation // elements for each rune in the decomposition and modify the tertiary weights. // The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where // - v* is the replacement tertiary weight for the first rune, // - w* is the replacement tertiary weight for the second rune, // Tertiary weights of subsequent runes should be replaced with maxTertiary. // See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details. func splitDecompose(ce Elem) (t1, t2 uint8) { return uint8(ce), uint8(ce >> 8) } const ( // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. minUnified rune = 0x4E00 maxUnified = 0x9FFF minCompatibility = 0xF900 maxCompatibility = 0xFAFF minRare = 0x3400 maxRare = 0x4DBF ) const ( commonUnifiedOffset = 0x10000 rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF otherOffset = 0x50000 // largest rune in rare is U+2FA1D illegalOffset = otherOffset + int(unicode.MaxRune) maxPrimary = illegalOffset + 1 ) // implicitPrimary returns the primary weight for the a rune // for which there is no entry for the rune in the collation table. // We take a different approach from the one specified in // http://unicode.org/reports/tr10/#Implicit_Weights, // but preserve the resulting relative ordering of the runes. func implicitPrimary(r rune) int { if unicode.Is(unicode.Ideographic, r) { if r >= minUnified && r <= maxUnified { // The most common case for CJK. return int(r) + commonUnifiedOffset } if r >= minCompatibility && r <= maxCompatibility { // This will typically not hit. The DUCET explicitly specifies mappings // for all characters that do not decompose. return int(r) + commonUnifiedOffset } return int(r) + rareUnifiedOffset } return int(r) + otherOffset }