package helpers import ( "strings" "unicode/utf8" ) func ContainsNonBMPCodePoint(text string) bool { for _, c := range text { if c > 0xFFFF { return true } } return false } // This does "ContainsNonBMPCodePoint(UTF16ToString(text))" without any allocations func ContainsNonBMPCodePointUTF16(text []uint16) bool { if n := len(text); n > 0 { for i, c := range text[:n-1] { // Check for a high surrogate if c >= 0xD800 && c <= 0xDBFF { // Check for a low surrogate if c2 := text[i+1]; c2 >= 0xDC00 && c2 <= 0xDFFF { return true } } } } return false } func StringToUTF16(text string) []uint16 { decoded := make([]uint16, 0, len(text)) for _, c := range text { if c <= 0xFFFF { decoded = append(decoded, uint16(c)) } else { c -= 0x10000 decoded = append(decoded, uint16(0xD800+((c>>10)&0x3FF)), uint16(0xDC00+(c&0x3FF))) } } return decoded } func UTF16ToString(text []uint16) string { var temp [utf8.UTFMax]byte b := strings.Builder{} n := len(text) for i := 0; i < n; i++ { r1 := rune(text[i]) if r1 >= 0xD800 && r1 <= 0xDBFF && i+1 < n { if r2 := rune(text[i+1]); r2 >= 0xDC00 && r2 <= 0xDFFF { r1 = (r1-0xD800)<<10 | (r2 - 0xDC00) + 0x10000 i++ } } width := encodeWTF8Rune(temp[:], r1) b.Write(temp[:width]) } return b.String() } func UTF16ToStringWithValidation(text []uint16) (string, uint16, bool) { var temp [utf8.UTFMax]byte b := strings.Builder{} n := len(text) for i := 0; i < n; i++ { r1 := rune(text[i]) if r1 >= 0xD800 && r1 <= 0xDBFF { if i+1 < n { if r2 := rune(text[i+1]); r2 >= 0xDC00 && r2 <= 0xDFFF { r1 = (r1-0xD800)<<10 | (r2 - 0xDC00) + 0x10000 i++ } else { return "", uint16(r1), false } } else { return "", uint16(r1), false } } else if r1 >= 0xDC00 && r1 <= 0xDFFF { return "", uint16(r1), false } width := encodeWTF8Rune(temp[:], r1) b.Write(temp[:width]) } return b.String(), 0, true } // Does "UTF16ToString(text) == str" without a temporary allocation func UTF16EqualsString(text []uint16, str string) bool { if len(text) > len(str) { // Strings can't be equal if UTF-16 encoding is longer than UTF-8 encoding return false } var temp [utf8.UTFMax]byte n := len(text) j := 0 for i := 0; i < n; i++ { r1 := rune(text[i]) if r1 >= 0xD800 && r1 <= 0xDBFF && i+1 < n { if r2 := rune(text[i+1]); r2 >= 0xDC00 && r2 <= 0xDFFF { r1 = (r1-0xD800)<<10 | (r2 - 0xDC00) + 0x10000 i++ } } width := encodeWTF8Rune(temp[:], r1) if j+width > len(str) { return false } for k := 0; k < width; k++ { if temp[k] != str[j] { return false } j++ } } return j == len(str) } func UTF16EqualsUTF16(a []uint16, b []uint16) bool { if len(a) == len(b) { for i, c := range a { if c != b[i] { return false } } return true } return false } // This is a clone of "utf8.EncodeRune" that has been modified to encode using // WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for more info. func encodeWTF8Rune(p []byte, r rune) int { // Negative values are erroneous. Making it unsigned addresses the problem. switch i := uint32(r); { case i <= 0x7F: p[0] = byte(r) return 1 case i <= 0x7FF: _ = p[1] // eliminate bounds checks p[0] = 0xC0 | byte(r>>6) p[1] = 0x80 | byte(r)&0x3F return 2 case i > utf8.MaxRune: r = utf8.RuneError fallthrough case i <= 0xFFFF: _ = p[2] // eliminate bounds checks p[0] = 0xE0 | byte(r>>12) p[1] = 0x80 | byte(r>>6)&0x3F p[2] = 0x80 | byte(r)&0x3F return 3 default: _ = p[3] // eliminate bounds checks p[0] = 0xF0 | byte(r>>18) p[1] = 0x80 | byte(r>>12)&0x3F p[2] = 0x80 | byte(r>>6)&0x3F p[3] = 0x80 | byte(r)&0x3F return 4 } } // This is a clone of "utf8.DecodeRuneInString" that has been modified to // decode using WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for // more info. func DecodeWTF8Rune(s string) (rune, int) { n := len(s) if n < 1 { return utf8.RuneError, 0 } s0 := s[0] if s0 < 0x80 { return rune(s0), 1 } var sz int if (s0 & 0xE0) == 0xC0 { sz = 2 } else if (s0 & 0xF0) == 0xE0 { sz = 3 } else if (s0 & 0xF8) == 0xF0 { sz = 4 } else { return utf8.RuneError, 1 } if n < sz { return utf8.RuneError, 0 } s1 := s[1] if (s1 & 0xC0) != 0x80 { return utf8.RuneError, 1 } if sz == 2 { cp := rune(s0&0x1F)<<6 | rune(s1&0x3F) if cp < 0x80 { return utf8.RuneError, 1 } return cp, 2 } s2 := s[2] if (s2 & 0xC0) != 0x80 { return utf8.RuneError, 1 } if sz == 3 { cp := rune(s0&0x0F)<<12 | rune(s1&0x3F)<<6 | rune(s2&0x3F) if cp < 0x0800 { return utf8.RuneError, 1 } return cp, 3 } s3 := s[3] if (s3 & 0xC0) != 0x80 { return utf8.RuneError, 1 } cp := rune(s0&0x07)<<18 | rune(s1&0x3F)<<12 | rune(s2&0x3F)<<6 | rune(s3&0x3F) if cp < 0x010000 || cp > 0x10FFFF { return utf8.RuneError, 1 } return cp, 4 }