Newer
Older
pokemon-go-trade / vendor / golang.org / x / text / collate / build / builder.go
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package build // import "golang.org/x/text/collate/build"

import (
	"fmt"
	"io"
	"log"
	"sort"
	"strings"
	"unicode/utf8"

	"golang.org/x/text/internal/colltab"
	"golang.org/x/text/language"
	"golang.org/x/text/unicode/norm"
)

// TODO: optimizations:
// - expandElem is currently 20K. By putting unique colElems in a separate
//   table and having a byte array of indexes into this table, we can reduce
//   the total size to about 7K. By also factoring out the length bytes, we
//   can reduce this to about 6K.
// - trie valueBlocks are currently 100K. There are a lot of sparse blocks
//   and many consecutive values with the same stride. This can be further
//   compacted.
// - Compress secondary weights into 8 bits.
// - Some LDML specs specify a context element. Currently we simply concatenate
//   those.  Context can be implemented using the contraction trie. If Builder
//   could analyze and detect when using a context makes sense, there is no
//   need to expose this construct in the API.

// A Builder builds a root collation table.  The user must specify the
// collation elements for each entry.  A common use will be to base the weights
// on those specified in the allkeys* file as provided by the UCA or CLDR.
type Builder struct {
	index  *trieBuilder
	root   ordering
	locale []*Tailoring
	t      *table
	err    error
	built  bool

	minNonVar int // lowest primary recorded for a variable
	varTop    int // highest primary recorded for a non-variable

	// indexes used for reusing expansions and contractions
	expIndex map[string]int      // positions of expansions keyed by their string representation
	ctHandle map[string]ctHandle // contraction handles keyed by a concatenation of the suffixes
	ctElem   map[string]int      // contraction elements keyed by their string representation
}

// A Tailoring builds a collation table based on another collation table.
// The table is defined by specifying tailorings to the underlying table.
// See https://unicode.org/reports/tr35/ for an overview of tailoring
// collation tables.  The CLDR contains pre-defined tailorings for a variety
// of languages (See https://www.unicode.org/Public/cldr/<version>/core.zip.)
type Tailoring struct {
	id      string
	builder *Builder
	index   *ordering

	anchor *entry
	before bool
}

// NewBuilder returns a new Builder.
func NewBuilder() *Builder {
	return &Builder{
		index:    newTrieBuilder(),
		root:     makeRootOrdering(),
		expIndex: make(map[string]int),
		ctHandle: make(map[string]ctHandle),
		ctElem:   make(map[string]int),
	}
}

// Tailoring returns a Tailoring for the given locale.  One should
// have completed all calls to Add before calling Tailoring.
func (b *Builder) Tailoring(loc language.Tag) *Tailoring {
	t := &Tailoring{
		id:      loc.String(),
		builder: b,
		index:   b.root.clone(),
	}
	t.index.id = t.id
	b.locale = append(b.locale, t)
	return t
}

// Add adds an entry to the collation element table, mapping
// a slice of runes to a sequence of collation elements.
// A collation element is specified as list of weights: []int{primary, secondary, ...}.
// The entries are typically obtained from a collation element table
// as defined in https://www.unicode.org/reports/tr10/#Data_Table_Format.
// Note that the collation elements specified by colelems are only used
// as a guide.  The actual weights generated by Builder may differ.
// The argument variables is a list of indices into colelems that should contain
// a value for each colelem that is a variable. (See the reference above.)
func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
	str := string(runes)
	elems := make([]rawCE, len(colelems))
	for i, ce := range colelems {
		if len(ce) == 0 {
			break
		}
		elems[i] = makeRawCE(ce, 0)
		if len(ce) == 1 {
			elems[i].w[1] = defaultSecondary
		}
		if len(ce) <= 2 {
			elems[i].w[2] = defaultTertiary
		}
		if len(ce) <= 3 {
			elems[i].w[3] = ce[0]
		}
	}
	for i, ce := range elems {
		p := ce.w[0]
		isvar := false
		for _, j := range variables {
			if i == j {
				isvar = true
			}
		}
		if isvar {
			if p >= b.minNonVar && b.minNonVar > 0 {
				return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar)
			}
			if p > b.varTop {
				b.varTop = p
			}
		} else if p > 1 { // 1 is a special primary value reserved for FFFE
			if p <= b.varTop {
				return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop)
			}
			if b.minNonVar == 0 || p < b.minNonVar {
				b.minNonVar = p
			}
		}
	}
	elems, err := convertLargeWeights(elems)
	if err != nil {
		return err
	}
	cccs := []uint8{}
	nfd := norm.NFD.String(str)
	for i := range nfd {
		cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC())
	}
	if len(cccs) < len(elems) {
		if len(cccs) > 2 {
			return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems))
		}
		p := len(elems) - 1
		for ; p > 0 && elems[p].w[0] == 0; p-- {
			elems[p].ccc = cccs[len(cccs)-1]
		}
		for ; p >= 0; p-- {
			elems[p].ccc = cccs[0]
		}
	} else {
		for i := range elems {
			elems[i].ccc = cccs[i]
		}
	}
	// doNorm in collate.go assumes that the following conditions hold.
	if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] {
		return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs)
	}
	b.root.newEntry(str, elems)
	return nil
}

func (t *Tailoring) setAnchor(anchor string) error {
	anchor = norm.NFC.String(anchor)
	a := t.index.find(anchor)
	if a == nil {
		a = t.index.newEntry(anchor, nil)
		a.implicit = true
		a.modified = true
		for _, r := range []rune(anchor) {
			e := t.index.find(string(r))
			e.lock = true
		}
	}
	t.anchor = a
	return nil
}

// SetAnchor sets the point after which elements passed in subsequent calls to
// Insert will be inserted.  It is equivalent to the reset directive in an LDML
// specification.  See Insert for an example.
// SetAnchor supports the following logical reset positions:
// <first_tertiary_ignorable/>, <last_teriary_ignorable/>, <first_primary_ignorable/>,
// and <last_non_ignorable/>.
func (t *Tailoring) SetAnchor(anchor string) error {
	if err := t.setAnchor(anchor); err != nil {
		return err
	}
	t.before = false
	return nil
}

// SetAnchorBefore is similar to SetAnchor, except that subsequent calls to
// Insert will insert entries before the anchor.
func (t *Tailoring) SetAnchorBefore(anchor string) error {
	if err := t.setAnchor(anchor); err != nil {
		return err
	}
	t.before = true
	return nil
}

// Insert sets the ordering of str relative to the entry set by the previous
// call to SetAnchor or Insert.  The argument extend corresponds
// to the extend elements as defined in LDML.  A non-empty value for extend
// will cause the collation elements corresponding to extend to be appended
// to the collation elements generated for the entry added by Insert.
// This has the same net effect as sorting str after the string anchor+extend.
// See https://www.unicode.org/reports/tr10/#Tailoring_Example for details
// on parametric tailoring and https://unicode.org/reports/tr35/#Collation_Elements
// for full details on LDML.
//
// Examples: create a tailoring for Swedish, where "ä" is ordered after "z"
// at the primary sorting level:
//      t := b.Tailoring("se")
// 		t.SetAnchor("z")
// 		t.Insert(colltab.Primary, "ä", "")
// Order "ü" after "ue" at the secondary sorting level:
//		t.SetAnchor("ue")
//		t.Insert(colltab.Secondary, "ü","")
// or
//		t.SetAnchor("u")
//		t.Insert(colltab.Secondary, "ü", "e")
// Order "q" afer "ab" at the secondary level and "Q" after "q"
// at the tertiary level:
// 		t.SetAnchor("ab")
// 		t.Insert(colltab.Secondary, "q", "")
// 		t.Insert(colltab.Tertiary, "Q", "")
// Order "b" before "a":
//      t.SetAnchorBefore("a")
//      t.Insert(colltab.Primary, "b", "")
// Order "0" after the last primary ignorable:
//      t.SetAnchor("<last_primary_ignorable/>")
//      t.Insert(colltab.Primary, "0", "")
func (t *Tailoring) Insert(level colltab.Level, str, extend string) error {
	if t.anchor == nil {
		return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str)
	}
	str = norm.NFC.String(str)
	e := t.index.find(str)
	if e == nil {
		e = t.index.newEntry(str, nil)
	} else if e.logical != noAnchor {
		return fmt.Errorf("%s:Insert: cannot reinsert logical reset position %q", t.id, e.str)
	}
	if e.lock {
		return fmt.Errorf("%s:Insert: cannot reinsert element %q", t.id, e.str)
	}
	a := t.anchor
	// Find the first element after the anchor which differs at a level smaller or
	// equal to the given level.  Then insert at this position.
	// See https://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details.
	e.before = t.before
	if t.before {
		t.before = false
		if a.prev == nil {
			a.insertBefore(e)
		} else {
			for a = a.prev; a.level > level; a = a.prev {
			}
			a.insertAfter(e)
		}
		e.level = level
	} else {
		for ; a.level > level; a = a.next {
		}
		e.level = a.level
		if a != e {
			a.insertAfter(e)
			a.level = level
		} else {
			// We don't set a to prev itself. This has the effect of the entry
			// getting new collation elements that are an increment of itself.
			// This is intentional.
			a.prev.level = level
		}
	}
	e.extend = norm.NFD.String(extend)
	e.exclude = false
	e.modified = true
	e.elems = nil
	t.anchor = e
	return nil
}

func (o *ordering) getWeight(e *entry) []rawCE {
	if len(e.elems) == 0 && e.logical == noAnchor {
		if e.implicit {
			for _, r := range e.runes {
				e.elems = append(e.elems, o.getWeight(o.find(string(r)))...)
			}
		} else if e.before {
			count := [colltab.Identity + 1]int{}
			a := e
			for ; a.elems == nil && !a.implicit; a = a.next {
				count[a.level]++
			}
			e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)}
			for i := colltab.Primary; i < colltab.Quaternary; i++ {
				if count[i] != 0 {
					e.elems[0].w[i] -= count[i]
					break
				}
			}
			if e.prev != nil {
				o.verifyWeights(e.prev, e, e.prev.level)
			}
		} else {
			prev := e.prev
			e.elems = nextWeight(prev.level, o.getWeight(prev))
			o.verifyWeights(e, e.next, e.level)
		}
	}
	return e.elems
}

func (o *ordering) addExtension(e *entry) {
	if ex := o.find(e.extend); ex != nil {
		e.elems = append(e.elems, ex.elems...)
	} else {
		for _, r := range []rune(e.extend) {
			e.elems = append(e.elems, o.find(string(r)).elems...)
		}
	}
	e.extend = ""
}

func (o *ordering) verifyWeights(a, b *entry, level colltab.Level) error {
	if level == colltab.Identity || b == nil || b.elems == nil || a.elems == nil {
		return nil
	}
	for i := colltab.Primary; i < level; i++ {
		if a.elems[0].w[i] < b.elems[0].w[i] {
			return nil
		}
	}
	if a.elems[0].w[level] >= b.elems[0].w[level] {
		err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems)
		log.Println(err)
		// TODO: return the error instead, or better, fix the conflicting entry by making room.
	}
	return nil
}

func (b *Builder) error(e error) {
	if e != nil {
		b.err = e
	}
}

func (b *Builder) errorID(locale string, e error) {
	if e != nil {
		b.err = fmt.Errorf("%s:%v", locale, e)
	}
}

// patchNorm ensures that NFC and NFD counterparts are consistent.
func (o *ordering) patchNorm() {
	// Insert the NFD counterparts, if necessary.
	for _, e := range o.ordered {
		nfd := norm.NFD.String(e.str)
		if nfd != e.str {
			if e0 := o.find(nfd); e0 != nil && !e0.modified {
				e0.elems = e.elems
			} else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) {
				e := o.newEntry(nfd, e.elems)
				e.modified = true
			}
		}
	}
	// Update unchanged composed forms if one of their parts changed.
	for _, e := range o.ordered {
		nfd := norm.NFD.String(e.str)
		if e.modified || nfd == e.str {
			continue
		}
		if e0 := o.find(nfd); e0 != nil {
			e.elems = e0.elems
		} else {
			e.elems = o.genColElems(nfd)
			if norm.NFD.LastBoundary([]byte(nfd)) == 0 {
				r := []rune(nfd)
				head := string(r[0])
				tail := ""
				for i := 1; i < len(r); i++ {
					s := norm.NFC.String(head + string(r[i]))
					if e0 := o.find(s); e0 != nil && e0.modified {
						head = s
					} else {
						tail += string(r[i])
					}
				}
				e.elems = append(o.genColElems(head), o.genColElems(tail)...)
			}
		}
	}
	// Exclude entries for which the individual runes generate the same collation elements.
	for _, e := range o.ordered {
		if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) {
			e.exclude = true
		}
	}
}

func (b *Builder) buildOrdering(o *ordering) {
	for _, e := range o.ordered {
		o.getWeight(e)
	}
	for _, e := range o.ordered {
		o.addExtension(e)
	}
	o.patchNorm()
	o.sort()
	simplify(o)
	b.processExpansions(o)   // requires simplify
	b.processContractions(o) // requires simplify

	t := newNode()
	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
		if !e.skip() {
			ce, err := e.encode()
			b.errorID(o.id, err)
			t.insert(e.runes[0], ce)
		}
	}
	o.handle = b.index.addTrie(t)
}

func (b *Builder) build() (*table, error) {
	if b.built {
		return b.t, b.err
	}
	b.built = true
	b.t = &table{
		Table: colltab.Table{
			MaxContractLen: utf8.UTFMax,
			VariableTop:    uint32(b.varTop),
		},
	}

	b.buildOrdering(&b.root)
	b.t.root = b.root.handle
	for _, t := range b.locale {
		b.buildOrdering(t.index)
		if b.err != nil {
			break
		}
	}
	i, err := b.index.generate()
	b.t.trie = *i
	b.t.Index = colltab.Trie{
		Index:   i.index,
		Values:  i.values,
		Index0:  i.index[blockSize*b.t.root.lookupStart:],
		Values0: i.values[blockSize*b.t.root.valueStart:],
	}
	b.error(err)
	return b.t, b.err
}

// Build builds the root Collator.
func (b *Builder) Build() (colltab.Weighter, error) {
	table, err := b.build()
	if err != nil {
		return nil, err
	}
	return table, nil
}

// Build builds a Collator for Tailoring t.
func (t *Tailoring) Build() (colltab.Weighter, error) {
	// TODO: implement.
	return nil, nil
}

// Print prints the tables for b and all its Tailorings as a Go file
// that can be included in the Collate package.
func (b *Builder) Print(w io.Writer) (n int, err error) {
	p := func(nn int, e error) {
		n += nn
		if err == nil {
			err = e
		}
	}
	t, err := b.build()
	if err != nil {
		return 0, err
	}
	p(fmt.Fprintf(w, `var availableLocales = "und`))
	for _, loc := range b.locale {
		if loc.id != "und" {
			p(fmt.Fprintf(w, ",%s", loc.id))
		}
	}
	p(fmt.Fprint(w, "\"\n\n"))
	p(fmt.Fprintf(w, "const varTop = 0x%x\n\n", b.varTop))
	p(fmt.Fprintln(w, "var locales = [...]tableIndex{"))
	for _, loc := range b.locale {
		if loc.id == "und" {
			p(t.fprintIndex(w, loc.index.handle, loc.id))
		}
	}
	for _, loc := range b.locale {
		if loc.id != "und" {
			p(t.fprintIndex(w, loc.index.handle, loc.id))
		}
	}
	p(fmt.Fprint(w, "}\n\n"))
	n, _, err = t.fprint(w, "main")
	return
}

// reproducibleFromNFKD checks whether the given expansion could be generated
// from an NFKD expansion.
func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool {
	// Length must be equal.
	if len(exp) != len(nfkd) {
		return false
	}
	for i, ce := range exp {
		// Primary and secondary values should be equal.
		if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] {
			return false
		}
		// Tertiary values should be equal to maxTertiary for third element onwards.
		// TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can
		// simply be dropped.  Try this out by dropping the following code.
		if i >= 2 && ce.w[2] != maxTertiary {
			return false
		}
		if _, err := makeCE(ce); err != nil {
			// Simply return false. The error will be caught elsewhere.
			return false
		}
	}
	return true
}

func simplify(o *ordering) {
	// Runes that are a starter of a contraction should not be removed.
	// (To date, there is only Kannada character 0CCA.)
	keep := make(map[rune]bool)
	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
		if len(e.runes) > 1 {
			keep[e.runes[0]] = true
		}
	}
	// Tag entries for which the runes NFKD decompose to identical values.
	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
		s := e.str
		nfkd := norm.NFKD.String(s)
		nfd := norm.NFD.String(s)
		if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd {
			continue
		}
		if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) {
			e.decompose = true
		}
	}
}

// appendExpansion converts the given collation sequence to
// collation elements and adds them to the expansion table.
// It returns an index to the expansion table.
func (b *Builder) appendExpansion(e *entry) int {
	t := b.t
	i := len(t.ExpandElem)
	ce := uint32(len(e.elems))
	t.ExpandElem = append(t.ExpandElem, ce)
	for _, w := range e.elems {
		ce, err := makeCE(w)
		if err != nil {
			b.error(err)
			return -1
		}
		t.ExpandElem = append(t.ExpandElem, ce)
	}
	return i
}

// processExpansions extracts data necessary to generate
// the extraction tables.
func (b *Builder) processExpansions(o *ordering) {
	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
		if !e.expansion() {
			continue
		}
		key := fmt.Sprintf("%v", e.elems)
		i, ok := b.expIndex[key]
		if !ok {
			i = b.appendExpansion(e)
			b.expIndex[key] = i
		}
		e.expansionIndex = i
	}
}

func (b *Builder) processContractions(o *ordering) {
	// Collate contractions per starter rune.
	starters := []rune{}
	cm := make(map[rune][]*entry)
	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
		if e.contraction() {
			if len(e.str) > b.t.MaxContractLen {
				b.t.MaxContractLen = len(e.str)
			}
			r := e.runes[0]
			if _, ok := cm[r]; !ok {
				starters = append(starters, r)
			}
			cm[r] = append(cm[r], e)
		}
	}
	// Add entries of single runes that are at a start of a contraction.
	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
		if !e.contraction() {
			r := e.runes[0]
			if _, ok := cm[r]; ok {
				cm[r] = append(cm[r], e)
			}
		}
	}
	// Build the tries for the contractions.
	t := b.t
	for _, r := range starters {
		l := cm[r]
		// Compute suffix strings. There are 31 different contraction suffix
		// sets for 715 contractions and 82 contraction starter runes as of
		// version 6.0.0.
		sufx := []string{}
		hasSingle := false
		for _, e := range l {
			if len(e.runes) > 1 {
				sufx = append(sufx, string(e.runes[1:]))
			} else {
				hasSingle = true
			}
		}
		if !hasSingle {
			b.error(fmt.Errorf("no single entry for starter rune %U found", r))
			continue
		}
		// Unique the suffix set.
		sort.Strings(sufx)
		key := strings.Join(sufx, "\n")
		handle, ok := b.ctHandle[key]
		if !ok {
			var err error
			handle, err = appendTrie(&t.ContractTries, sufx)
			if err != nil {
				b.error(err)
			}
			b.ctHandle[key] = handle
		}
		// Bucket sort entries in index order.
		es := make([]*entry, len(l))
		for _, e := range l {
			var p, sn int
			if len(e.runes) > 1 {
				str := []byte(string(e.runes[1:]))
				p, sn = lookup(&t.ContractTries, handle, str)
				if sn != len(str) {
					log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str))
				}
			}
			if es[p] != nil {
				log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0])
			}
			es[p] = e
		}
		// Create collation elements for contractions.
		elems := []uint32{}
		for _, e := range es {
			ce, err := e.encodeBase()
			b.errorID(o.id, err)
			elems = append(elems, ce)
		}
		key = fmt.Sprintf("%v", elems)
		i, ok := b.ctElem[key]
		if !ok {
			i = len(t.ContractElem)
			b.ctElem[key] = i
			t.ContractElem = append(t.ContractElem, elems...)
		}
		// Store info in entry for starter rune.
		es[0].contractionIndex = i
		es[0].contractionHandle = handle
	}
}