230 lines
4.8 KiB
Go
230 lines
4.8 KiB
Go
package helpers
|
|
|
|
import (
|
|
"strings"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
func ContainsNonBMPCodePoint(text string) bool {
|
|
for _, c := range text {
|
|
if c > 0xFFFF {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// This does "ContainsNonBMPCodePoint(UTF16ToString(text))" without any allocations
|
|
func ContainsNonBMPCodePointUTF16(text []uint16) bool {
|
|
if n := len(text); n > 0 {
|
|
for i, c := range text[:n-1] {
|
|
// Check for a high surrogate
|
|
if c >= 0xD800 && c <= 0xDBFF {
|
|
// Check for a low surrogate
|
|
if c2 := text[i+1]; c2 >= 0xDC00 && c2 <= 0xDFFF {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func StringToUTF16(text string) []uint16 {
|
|
decoded := make([]uint16, 0, len(text))
|
|
for _, c := range text {
|
|
if c <= 0xFFFF {
|
|
decoded = append(decoded, uint16(c))
|
|
} else {
|
|
c -= 0x10000
|
|
decoded = append(decoded, uint16(0xD800+((c>>10)&0x3FF)), uint16(0xDC00+(c&0x3FF)))
|
|
}
|
|
}
|
|
return decoded
|
|
}
|
|
|
|
func UTF16ToString(text []uint16) string {
|
|
var temp [utf8.UTFMax]byte
|
|
b := strings.Builder{}
|
|
n := len(text)
|
|
for i := 0; i < n; i++ {
|
|
r1 := rune(text[i])
|
|
if r1 >= 0xD800 && r1 <= 0xDBFF && i+1 < n {
|
|
if r2 := rune(text[i+1]); r2 >= 0xDC00 && r2 <= 0xDFFF {
|
|
r1 = (r1-0xD800)<<10 | (r2 - 0xDC00) + 0x10000
|
|
i++
|
|
}
|
|
}
|
|
width := encodeWTF8Rune(temp[:], r1)
|
|
b.Write(temp[:width])
|
|
}
|
|
return b.String()
|
|
}
|
|
|
|
func UTF16ToStringWithValidation(text []uint16) (string, uint16, bool) {
|
|
var temp [utf8.UTFMax]byte
|
|
b := strings.Builder{}
|
|
n := len(text)
|
|
for i := 0; i < n; i++ {
|
|
r1 := rune(text[i])
|
|
if r1 >= 0xD800 && r1 <= 0xDBFF {
|
|
if i+1 < n {
|
|
if r2 := rune(text[i+1]); r2 >= 0xDC00 && r2 <= 0xDFFF {
|
|
r1 = (r1-0xD800)<<10 | (r2 - 0xDC00) + 0x10000
|
|
i++
|
|
} else {
|
|
return "", uint16(r1), false
|
|
}
|
|
} else {
|
|
return "", uint16(r1), false
|
|
}
|
|
} else if r1 >= 0xDC00 && r1 <= 0xDFFF {
|
|
return "", uint16(r1), false
|
|
}
|
|
width := encodeWTF8Rune(temp[:], r1)
|
|
b.Write(temp[:width])
|
|
}
|
|
return b.String(), 0, true
|
|
}
|
|
|
|
// Does "UTF16ToString(text) == str" without a temporary allocation
|
|
func UTF16EqualsString(text []uint16, str string) bool {
|
|
if len(text) > len(str) {
|
|
// Strings can't be equal if UTF-16 encoding is longer than UTF-8 encoding
|
|
return false
|
|
}
|
|
var temp [utf8.UTFMax]byte
|
|
n := len(text)
|
|
j := 0
|
|
for i := 0; i < n; i++ {
|
|
r1 := rune(text[i])
|
|
if r1 >= 0xD800 && r1 <= 0xDBFF && i+1 < n {
|
|
if r2 := rune(text[i+1]); r2 >= 0xDC00 && r2 <= 0xDFFF {
|
|
r1 = (r1-0xD800)<<10 | (r2 - 0xDC00) + 0x10000
|
|
i++
|
|
}
|
|
}
|
|
width := encodeWTF8Rune(temp[:], r1)
|
|
if j+width > len(str) {
|
|
return false
|
|
}
|
|
for k := 0; k < width; k++ {
|
|
if temp[k] != str[j] {
|
|
return false
|
|
}
|
|
j++
|
|
}
|
|
}
|
|
return j == len(str)
|
|
}
|
|
|
|
func UTF16EqualsUTF16(a []uint16, b []uint16) bool {
|
|
if len(a) == len(b) {
|
|
for i, c := range a {
|
|
if c != b[i] {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// This is a clone of "utf8.EncodeRune" that has been modified to encode using
|
|
// WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for more info.
|
|
func encodeWTF8Rune(p []byte, r rune) int {
|
|
// Negative values are erroneous. Making it unsigned addresses the problem.
|
|
switch i := uint32(r); {
|
|
case i <= 0x7F:
|
|
p[0] = byte(r)
|
|
return 1
|
|
case i <= 0x7FF:
|
|
_ = p[1] // eliminate bounds checks
|
|
p[0] = 0xC0 | byte(r>>6)
|
|
p[1] = 0x80 | byte(r)&0x3F
|
|
return 2
|
|
case i > utf8.MaxRune:
|
|
r = utf8.RuneError
|
|
fallthrough
|
|
case i <= 0xFFFF:
|
|
_ = p[2] // eliminate bounds checks
|
|
p[0] = 0xE0 | byte(r>>12)
|
|
p[1] = 0x80 | byte(r>>6)&0x3F
|
|
p[2] = 0x80 | byte(r)&0x3F
|
|
return 3
|
|
default:
|
|
_ = p[3] // eliminate bounds checks
|
|
p[0] = 0xF0 | byte(r>>18)
|
|
p[1] = 0x80 | byte(r>>12)&0x3F
|
|
p[2] = 0x80 | byte(r>>6)&0x3F
|
|
p[3] = 0x80 | byte(r)&0x3F
|
|
return 4
|
|
}
|
|
}
|
|
|
|
// This is a clone of "utf8.DecodeRuneInString" that has been modified to
|
|
// decode using WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for
|
|
// more info.
|
|
func DecodeWTF8Rune(s string) (rune, int) {
|
|
n := len(s)
|
|
if n < 1 {
|
|
return utf8.RuneError, 0
|
|
}
|
|
|
|
s0 := s[0]
|
|
if s0 < 0x80 {
|
|
return rune(s0), 1
|
|
}
|
|
|
|
var sz int
|
|
if (s0 & 0xE0) == 0xC0 {
|
|
sz = 2
|
|
} else if (s0 & 0xF0) == 0xE0 {
|
|
sz = 3
|
|
} else if (s0 & 0xF8) == 0xF0 {
|
|
sz = 4
|
|
} else {
|
|
return utf8.RuneError, 1
|
|
}
|
|
|
|
if n < sz {
|
|
return utf8.RuneError, 0
|
|
}
|
|
|
|
s1 := s[1]
|
|
if (s1 & 0xC0) != 0x80 {
|
|
return utf8.RuneError, 1
|
|
}
|
|
|
|
if sz == 2 {
|
|
cp := rune(s0&0x1F)<<6 | rune(s1&0x3F)
|
|
if cp < 0x80 {
|
|
return utf8.RuneError, 1
|
|
}
|
|
return cp, 2
|
|
}
|
|
s2 := s[2]
|
|
|
|
if (s2 & 0xC0) != 0x80 {
|
|
return utf8.RuneError, 1
|
|
}
|
|
|
|
if sz == 3 {
|
|
cp := rune(s0&0x0F)<<12 | rune(s1&0x3F)<<6 | rune(s2&0x3F)
|
|
if cp < 0x0800 {
|
|
return utf8.RuneError, 1
|
|
}
|
|
return cp, 3
|
|
}
|
|
s3 := s[3]
|
|
|
|
if (s3 & 0xC0) != 0x80 {
|
|
return utf8.RuneError, 1
|
|
}
|
|
|
|
cp := rune(s0&0x07)<<18 | rune(s1&0x3F)<<12 | rune(s2&0x3F)<<6 | rune(s3&0x3F)
|
|
if cp < 0x010000 || cp > 0x10FFFF {
|
|
return utf8.RuneError, 1
|
|
}
|
|
return cp, 4
|
|
}
|