Serge Bazanski | cc25bdf | 2018-10-25 14:02:58 +0200 | [diff] [blame] | 1 | // Copyright 2015 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | //go:generate stringer -type=Kind |
| 6 | //go:generate go run gen.go gen_common.go gen_trieval.go |
| 7 | |
| 8 | // Package width provides functionality for handling different widths in text. |
| 9 | // |
| 10 | // Wide characters behave like ideographs; they tend to allow line breaks after |
| 11 | // each character and remain upright in vertical text layout. Narrow characters |
| 12 | // are kept together in words or runs that are rotated sideways in vertical text |
| 13 | // layout. |
| 14 | // |
| 15 | // For more information, see http://unicode.org/reports/tr11/. |
| 16 | package width // import "golang.org/x/text/width" |
| 17 | |
| 18 | import ( |
| 19 | "unicode/utf8" |
| 20 | |
| 21 | "golang.org/x/text/transform" |
| 22 | ) |
| 23 | |
| 24 | // TODO |
| 25 | // 1) Reduce table size by compressing blocks. |
| 26 | // 2) API proposition for computing display length |
| 27 | // (approximation, fixed pitch only). |
| 28 | // 3) Implement display length. |
| 29 | |
// Kind indicates the type of width property as defined in http://unicode.org/reports/tr11/.
//
// The zero value is Neutral.
type Kind int

const (
	// Neutral characters do not occur in legacy East Asian character sets.
	Neutral Kind = iota

	// EastAsianAmbiguous characters can be sometimes wide and sometimes
	// narrow and require additional information not contained in the character
	// code to further resolve their width.
	EastAsianAmbiguous

	// EastAsianWide characters are wide in their usual form. They occur only
	// in the context of East Asian typography. These runes may have explicit
	// halfwidth counterparts.
	EastAsianWide

	// EastAsianNarrow characters are narrow in their usual form. They often
	// have fullwidth counterparts.
	EastAsianNarrow

	// Note: there exist Narrow runes that do not have fullwidth or wide
	// counterparts, despite what the definition says (e.g. U+27E6).

	// EastAsianFullwidth characters have a compatibility decomposition of type
	// wide that maps to a narrow counterpart.
	EastAsianFullwidth

	// EastAsianHalfwidth characters have a compatibility decomposition of type
	// narrow that maps to a wide or ambiguous counterpart, plus U+20A9 ₩ WON
	// SIGN.
	EastAsianHalfwidth

	// Note: there exist runes that have a halfwidth counterpart but that are
	// classified as Ambiguous, rather than wide (e.g. U+2190).
)
| 66 | |
| 67 | // TODO: the generated tries need to return size 1 for invalid runes for the |
| 68 | // width to be computed correctly (each byte should render width 1) |
| 69 | |
// trie is the package-level width table consulted by Lookup, LookupString and
// LookupRune. It is created once, when the package is initialized.
var trie = newWidthTrie(0)
| 71 | |
| 72 | // Lookup reports the Properties of the first rune in b and the number of bytes |
| 73 | // of its UTF-8 encoding. |
| 74 | func Lookup(b []byte) (p Properties, size int) { |
| 75 | v, sz := trie.lookup(b) |
| 76 | return Properties{elem(v), b[sz-1]}, sz |
| 77 | } |
| 78 | |
| 79 | // LookupString reports the Properties of the first rune in s and the number of |
| 80 | // bytes of its UTF-8 encoding. |
| 81 | func LookupString(s string) (p Properties, size int) { |
| 82 | v, sz := trie.lookupString(s) |
| 83 | return Properties{elem(v), s[sz-1]}, sz |
| 84 | } |
| 85 | |
| 86 | // LookupRune reports the Properties of rune r. |
| 87 | func LookupRune(r rune) Properties { |
| 88 | var buf [4]byte |
| 89 | n := utf8.EncodeRune(buf[:], r) |
| 90 | v, _ := trie.lookup(buf[:n]) |
| 91 | last := byte(r) |
| 92 | if r >= utf8.RuneSelf { |
| 93 | last = 0x80 + byte(r&0x3f) |
| 94 | } |
| 95 | return Properties{elem(v), last} |
| 96 | } |
| 97 | |
// Properties provides access to width properties of a rune.
type Properties struct {
	// elem is the value looked up for the rune. Shifting it right by
	// typeShift yields the rune's Kind, and its low byte is used as an index
	// into inverseData when computing width variants.
	elem elem
	// last is the final byte of the rune's UTF-8 encoding; it is XORed into
	// an inverseData entry to reconstruct the variant rune.
	last byte
}
| 103 | |
| 104 | func (e elem) kind() Kind { |
| 105 | return Kind(e >> typeShift) |
| 106 | } |
| 107 | |
| 108 | // Kind returns the Kind of a rune as defined in Unicode TR #11. |
| 109 | // See http://unicode.org/reports/tr11/ for more details. |
| 110 | func (p Properties) Kind() Kind { |
| 111 | return p.elem.kind() |
| 112 | } |
| 113 | |
| 114 | // Folded returns the folded variant of a rune or 0 if the rune is canonical. |
| 115 | func (p Properties) Folded() rune { |
| 116 | if p.elem&tagNeedsFold != 0 { |
| 117 | buf := inverseData[byte(p.elem)] |
| 118 | buf[buf[0]] ^= p.last |
| 119 | r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]]) |
| 120 | return r |
| 121 | } |
| 122 | return 0 |
| 123 | } |
| 124 | |
| 125 | // Narrow returns the narrow variant of a rune or 0 if the rune is already |
| 126 | // narrow or doesn't have a narrow variant. |
| 127 | func (p Properties) Narrow() rune { |
| 128 | if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianFullwidth || k == EastAsianWide || k == EastAsianAmbiguous) { |
| 129 | buf := inverseData[byte(p.elem)] |
| 130 | buf[buf[0]] ^= p.last |
| 131 | r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]]) |
| 132 | return r |
| 133 | } |
| 134 | return 0 |
| 135 | } |
| 136 | |
| 137 | // Wide returns the wide variant of a rune or 0 if the rune is already |
| 138 | // wide or doesn't have a wide variant. |
| 139 | func (p Properties) Wide() rune { |
| 140 | if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianHalfwidth || k == EastAsianNarrow) { |
| 141 | buf := inverseData[byte(p.elem)] |
| 142 | buf[buf[0]] ^= p.last |
| 143 | r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]]) |
| 144 | return r |
| 145 | } |
| 146 | return 0 |
| 147 | } |
| 148 | |
| 149 | // TODO for Properties: |
| 150 | // - Add Fullwidth/Halfwidth or Inverted methods for computing variants |
| 151 | // mapping. |
| 152 | // - Add width information (including information on non-spacing runes). |
| 153 | |
// Transformer implements the transform.Transformer interface.
type Transformer struct {
	// t is the wrapped transformer; Reset, Transform and Span delegate to it.
	t transform.SpanningTransformer
}
| 158 | |
| 159 | // Reset implements the transform.Transformer interface. |
| 160 | func (t Transformer) Reset() { t.t.Reset() } |
| 161 | |
| 162 | // Transform implements the transform.Transformer interface. |
| 163 | func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| 164 | return t.t.Transform(dst, src, atEOF) |
| 165 | } |
| 166 | |
| 167 | // Span implements the transform.SpanningTransformer interface. |
| 168 | func (t Transformer) Span(src []byte, atEOF bool) (n int, err error) { |
| 169 | return t.t.Span(src, atEOF) |
| 170 | } |
| 171 | |
| 172 | // Bytes returns a new byte slice with the result of applying t to b. |
| 173 | func (t Transformer) Bytes(b []byte) []byte { |
| 174 | b, _, _ = transform.Bytes(t, b) |
| 175 | return b |
| 176 | } |
| 177 | |
| 178 | // String returns a string with the result of applying t to s. |
| 179 | func (t Transformer) String(s string) string { |
| 180 | s, _, _ = transform.String(t, s) |
| 181 | return s |
| 182 | } |
| 183 | |
| 184 | var ( |
| 185 | // Fold is a transform that maps all runes to their canonical width. |
| 186 | // |
| 187 | // Note that the NFKC and NFKD transforms in golang.org/x/text/unicode/norm |
| 188 | // provide a more generic folding mechanism. |
| 189 | Fold Transformer = Transformer{foldTransform{}} |
| 190 | |
| 191 | // Widen is a transform that maps runes to their wide variant, if |
| 192 | // available. |
| 193 | Widen Transformer = Transformer{wideTransform{}} |
| 194 | |
| 195 | // Narrow is a transform that maps runes to their narrow variant, if |
| 196 | // available. |
| 197 | Narrow Transformer = Transformer{narrowTransform{}} |
| 198 | ) |
| 199 | |
| 200 | // TODO: Consider the following options: |
| 201 | // - Treat Ambiguous runes that have a halfwidth counterpart as wide, or some |
| 202 | // generalized variant of this. |
| 203 | // - Consider a wide Won character to be the default width (or some generalized |
| 204 | // variant of this). |
| 205 | // - Filter the set of characters that gets converted (the preferred approach is |
| 206 | // to allow applying filters to transforms). |