// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
| 4 | |
| 5 | // +build ignore |
| 6 | |
// This program generates the trie for width operations. The generated table
// includes width category information as well as the normalization mappings.
| 9 | package main |
| 10 | |
| 11 | import ( |
| 12 | "bytes" |
| 13 | "fmt" |
| 14 | "io" |
| 15 | "log" |
| 16 | "math" |
| 17 | "unicode/utf8" |
| 18 | |
| 19 | "golang.org/x/text/internal/gen" |
| 20 | "golang.org/x/text/internal/triegen" |
| 21 | ) |
| 22 | |
| 23 | // See gen_common.go for flags. |
| 24 | |
| 25 | func main() { |
| 26 | gen.Init() |
| 27 | genTables() |
| 28 | genTests() |
| 29 | gen.Repackage("gen_trieval.go", "trieval.go", "width") |
| 30 | gen.Repackage("gen_common.go", "common_test.go", "width") |
| 31 | } |
| 32 | |
| 33 | func genTables() { |
| 34 | t := triegen.NewTrie("width") |
| 35 | // fold and inverse mappings. See mapComment for a description of the format |
| 36 | // of each entry. Add dummy value to make an index of 0 mean no mapping. |
| 37 | inverse := [][4]byte{{}} |
| 38 | mapping := map[[4]byte]int{[4]byte{}: 0} |
| 39 | |
| 40 | getWidthData(func(r rune, tag elem, alt rune) { |
| 41 | idx := 0 |
| 42 | if alt != 0 { |
| 43 | var buf [4]byte |
| 44 | buf[0] = byte(utf8.EncodeRune(buf[1:], alt)) |
| 45 | s := string(r) |
| 46 | buf[buf[0]] ^= s[len(s)-1] |
| 47 | var ok bool |
| 48 | if idx, ok = mapping[buf]; !ok { |
| 49 | idx = len(mapping) |
| 50 | if idx > math.MaxUint8 { |
| 51 | log.Fatalf("Index %d does not fit in a byte.", idx) |
| 52 | } |
| 53 | mapping[buf] = idx |
| 54 | inverse = append(inverse, buf) |
| 55 | } |
| 56 | } |
| 57 | t.Insert(r, uint64(tag|elem(idx))) |
| 58 | }) |
| 59 | |
| 60 | w := &bytes.Buffer{} |
| 61 | gen.WriteUnicodeVersion(w) |
| 62 | |
| 63 | sz, err := t.Gen(w) |
| 64 | if err != nil { |
| 65 | log.Fatal(err) |
| 66 | } |
| 67 | |
| 68 | sz += writeMappings(w, inverse) |
| 69 | |
| 70 | fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024) |
| 71 | |
| 72 | gen.WriteVersionedGoFile(*outputFile, "width", w.Bytes()) |
| 73 | } |
| 74 | |
// inverseDataComment is the documentation that writeMappings emits above the
// generated inverseData table. The left-hand letters in the example mappings
// are the fullwidth forms U+FF21 and U+FF22, matching the code points cited
// next to them.
const inverseDataComment = `
// inverseData contains 4-byte entries of the following format:
// <length> <modified UTF-8-encoded rune> <0 padding>
// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
// UTF-8 encoding of the original rune. Mappings often have the following
// pattern:
// Ａ -> A (U+FF21 -> U+0041)
// Ｂ -> B (U+FF22 -> U+0042)
// ...
// By xor-ing the last byte the same entry can be shared by many mappings. This
// reduces the total number of distinct entries by about two thirds.
// The resulting entry for the aforementioned mappings is
// { 0x01, 0xE0, 0x00, 0x00 }
// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
// E0 ^ A1 = 41.
// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
// E0 ^ A2 = 42.
// Note that because of the xor-ing, the byte sequence stored in the entry is
// not valid UTF-8.`

// writeMappings writes data to w as the Go declaration of the inverseData
// array, preceded by inverseDataComment. It returns the size of the table in
// bytes, which is four bytes per entry.
//
// Errors from writing to w are not checked.
func writeMappings(w io.Writer, data [][4]byte) int {
	fmt.Fprintln(w, inverseDataComment)
	fmt.Fprintf(w, "var inverseData = [%d][4]byte{\n", len(data))
	for _, entry := range data {
		fmt.Fprintf(w, "{ 0x%02x, 0x%02x, 0x%02x, 0x%02x },\n",
			entry[0], entry[1], entry[2], entry[3])
	}
	fmt.Fprintln(w, "}")
	return len(data) * 4
}
| 104 | |
| 105 | func genTests() { |
| 106 | w := &bytes.Buffer{} |
| 107 | fmt.Fprintf(w, "\nvar mapRunes = map[rune]struct{r rune; e elem}{\n") |
| 108 | getWidthData(func(r rune, tag elem, alt rune) { |
| 109 | if alt != 0 { |
| 110 | fmt.Fprintf(w, "\t0x%X: {0x%X, 0x%X},\n", r, alt, tag) |
| 111 | } |
| 112 | }) |
| 113 | fmt.Fprintln(w, "}") |
| 114 | gen.WriteGoFile("runes_test.go", "width", w.Bytes()) |
| 115 | } |