blob: 092277e1f64b96a3475d9b4036d93ad9b0f00610 [file] [log] [blame]
Serge Bazanskicc25bdf2018-10-25 14:02:58 +02001// Copyright 2015 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7// This program generates the trie for width operations. The generated table
8// includes width category information as well as the normalization mappings.
9package main
10
11import (
12 "bytes"
13 "fmt"
14 "io"
15 "log"
16 "math"
17 "unicode/utf8"
18
19 "golang.org/x/text/internal/gen"
20 "golang.org/x/text/internal/triegen"
21)
22
23// See gen_common.go for flags.
24
25func main() {
26 gen.Init()
27 genTables()
28 genTests()
29 gen.Repackage("gen_trieval.go", "trieval.go", "width")
30 gen.Repackage("gen_common.go", "common_test.go", "width")
31}
32
33func genTables() {
34 t := triegen.NewTrie("width")
35 // fold and inverse mappings. See mapComment for a description of the format
36 // of each entry. Add dummy value to make an index of 0 mean no mapping.
37 inverse := [][4]byte{{}}
38 mapping := map[[4]byte]int{[4]byte{}: 0}
39
40 getWidthData(func(r rune, tag elem, alt rune) {
41 idx := 0
42 if alt != 0 {
43 var buf [4]byte
44 buf[0] = byte(utf8.EncodeRune(buf[1:], alt))
45 s := string(r)
46 buf[buf[0]] ^= s[len(s)-1]
47 var ok bool
48 if idx, ok = mapping[buf]; !ok {
49 idx = len(mapping)
50 if idx > math.MaxUint8 {
51 log.Fatalf("Index %d does not fit in a byte.", idx)
52 }
53 mapping[buf] = idx
54 inverse = append(inverse, buf)
55 }
56 }
57 t.Insert(r, uint64(tag|elem(idx)))
58 })
59
60 w := &bytes.Buffer{}
61 gen.WriteUnicodeVersion(w)
62
63 sz, err := t.Gen(w)
64 if err != nil {
65 log.Fatal(err)
66 }
67
68 sz += writeMappings(w, inverse)
69
70 fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024)
71
72 gen.WriteVersionedGoFile(*outputFile, "width", w.Bytes())
73}
74
// inverseDataComment is the explanatory comment that writeMappings emits
// verbatim in front of the generated inverseData table. It documents the
// layout of the 4-byte entries that genTables builds.
//
// The example rows show the fullwidth letters U+FF21 and U+FF22 on the left
// so that the glyphs agree with the code points listed beside them.
const inverseDataComment = `
// inverseData contains 4-byte entries of the following format:
// <length> <modified UTF-8-encoded rune> <0 padding>
// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
// UTF-8 encoding of the original rune. Mappings often have the following
// pattern:
// Ａ -> A (U+FF21 -> U+0041)
// Ｂ -> B (U+FF22 -> U+0042)
// ...
// By xor-ing the last byte the same entry can be shared by many mappings. This
// reduces the total number of distinct entries by about two thirds.
// The resulting entry for the aforementioned mappings is
// { 0x01, 0xE0, 0x00, 0x00 }
// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
// E0 ^ A1 = 41.
// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
// E0 ^ A2 = 42.
// Note that because of the xor-ing, the byte sequence stored in the entry is
// not valid UTF-8.`
94
95func writeMappings(w io.Writer, data [][4]byte) int {
96 fmt.Fprintln(w, inverseDataComment)
97 fmt.Fprintf(w, "var inverseData = [%d][4]byte{\n", len(data))
98 for _, x := range data {
99 fmt.Fprintf(w, "{ 0x%02x, 0x%02x, 0x%02x, 0x%02x },\n", x[0], x[1], x[2], x[3])
100 }
101 fmt.Fprintln(w, "}")
102 return len(data) * 4
103}
104
105func genTests() {
106 w := &bytes.Buffer{}
107 fmt.Fprintf(w, "\nvar mapRunes = map[rune]struct{r rune; e elem}{\n")
108 getWidthData(func(r rune, tag elem, alt rune) {
109 if alt != 0 {
110 fmt.Fprintf(w, "\t0x%X: {0x%X, 0x%X},\n", r, alt, tag)
111 }
112 })
113 fmt.Fprintln(w, "}")
114 gen.WriteGoFile("runes_test.go", "width", w.Bytes())
115}