codepoint: create codepoint package

main
RageCage64 2 years ago
parent 48795c72f3
commit d71ed1e1ff

@ -1,21 +1,21 @@
MIT License
Copyright (c) 2022 RageCage64
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
MIT License
Copyright (c) 2022 Braydon Kains
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -0,0 +1,131 @@
// Package codepoint converts Unicode code points written as strings
// (for example `U+20AC`) into their UTF-8 byte encoding. For an
// explanation of the encoding scheme, see the Wikipedia article on
// UTF-8. https://en.wikipedia.org/wiki/UTF-8#Encoding
package codepoint
import (
"errors"
"fmt"
"strconv"
)
// Convert takes a Unicode code point written as `U+0000` or
// `\U00000000` and returns the bytes of its UTF-8 encoding.
//
// It returns ErrInvalidCodepoint when cpoint does not begin with `U+`
// or `\U` (this includes inputs shorter than two bytes), the strconv
// parse error when the text after the prefix is not a hexadecimal
// number, and ErrInvalidWidth when the parsed value lies outside the
// range 0..0x10FFFF.
func Convert(cpoint string) ([]byte, error) {
	// Check the length before slicing: taking cpoint[:2] of a shorter
	// string would panic instead of reporting a bad code point.
	if len(cpoint) < 2 {
		return nil, ErrInvalidCodepoint
	}
	if prefix := cpoint[:2]; prefix != "U+" && prefix != "\\U" {
		return nil, ErrInvalidCodepoint
	}

	// Everything after the two-byte prefix is the hex number.
	cpointHexVal, err := strconv.ParseInt(cpoint[2:], 16, strconv.IntSize)
	if err != nil {
		return nil, err
	}
	value := int(cpointHexVal)

	// Determine whether the value needs 1, 2, 3, or 4 bytes.
	startBytes, err := getStartBytes(value)
	if err != nil {
		return nil, err
	}

	return convertCodepoint(value, startBytes)
}
// Sentinel errors returned by this package.
var (
// ErrInvalidWidth is returned when a value falls outside every
// supported encoding range (0..0x10FFFF) or when a byte width has no
// entry in bitCountByWidthTable.
ErrInvalidWidth = errors.New("an invalid width was specified")
// ErrInvalidCodepoint is returned by Convert when the input does not
// start with `U+` or `\U`.
ErrInvalidCodepoint = errors.New("specified codepoint was not valid")
)
// Lookup tables and byte templates used to assemble the encoding.
var (
// startByteTable maps a byte's fixed high-bit prefix to the number of
// code point bits that convertCodepoint stores in the rest of that byte.
startByteTable = map[int]int{
0b00000000: 7,
0b10000000: 6,
0b11000000: 5,
0b11100000: 4,
0b11110000: 3,
}
// bitCountByWidthTable maps an encoding width in bytes to the total
// number of code point bits that width carries; padLeftBits pads the
// binary string to this length.
bitCountByWidthTable = map[int]int{
1: 7,
2: 11,
3: 16,
4: 21,
}
// Prefix templates, one entry per output byte, for each encoding
// width. Every entry is a key of startByteTable.
oneByteEncode = []int{0b00000000}
twoByteEncode = []int{0b11000000, 0b10000000}
threeByteEncode = []int{0b11100000, 0b10000000, 0b10000000}
fourByteEncode = []int{0b11110000, 0b10000000, 0b10000000, 0b10000000}
)
func getStartBytes(value int) ([]int, error) {
if 0 <= value && value <= 0x7F {
return oneByteEncode, nil
} else if 0x80 <= value && value <= 0x7FF {
return twoByteEncode, nil
} else if 0x800 <= value && value <= 0xFFFF {
return threeByteEncode, nil
} else if 0x10000 <= value && value <= 0x10FFFF {
return fourByteEncode, nil
}
return nil, ErrInvalidWidth
}
// convertCodepoint fills each prefix in startBytes with the next run of
// bits from value and returns the resulting bytes in order.
//
// The value is rendered as a binary string and left-padded with zeroes
// to the bit count for the encoding width, where the width is the
// length of startBytes. An error from padding or from parsing a run of
// bits is returned unchanged.
func convertCodepoint(value int, startBytes []int) ([]byte, error) {
	remaining, err := padLeftBits(fmt.Sprintf("%b", value), len(startBytes))
	if err != nil {
		return nil, err
	}

	encoded := make([]byte, 0, len(startBytes))
	for _, prefix := range startBytes {
		// The prefix decides how many code point bits this byte holds.
		width := startByteTable[prefix]
		chunk, rest := remaining[:width], remaining[width:]

		payload, parseErr := strconv.ParseInt(chunk, 2, strconv.IntSize)
		if parseErr != nil {
			return nil, parseErr
		}

		// OR the payload into the free low bits of the prefix.
		encoded = append(encoded, byte(prefix|int(payload)))

		// Drop the bits just consumed before handling the next byte.
		remaining = rest
	}
	return encoded, nil
}
// padLeftBits left-pads the binary string s with '0' characters until
// it is as long as the bit count registered for width in
// bitCountByWidthTable. A string that is already that long or longer is
// returned unchanged. An unknown width yields ErrInvalidWidth.
func padLeftBits(s string, width int) (string, error) {
	total, known := bitCountByWidthTable[width]
	if !known {
		return "", ErrInvalidWidth
	}

	missing := total - len(s)
	if missing <= 0 {
		return s, nil
	}

	padded := make([]byte, 0, total)
	for ; missing > 0; missing-- {
		padded = append(padded, '0')
	}
	return string(append(padded, s...)), nil
}

@ -0,0 +1,73 @@
package codepoint_test
import (
"fmt"
"testing"
"github.com/RageCage64/go-utf8-codepoint-converter/codepoint"
)
// TestConvert checks that Convert returns the expected UTF-8 bytes for
// one sample code point of each encoding width (1 to 4 bytes), using
// both the `U+` and the `\U` input formats.
func TestConvert(t *testing.T) {
	// Sample values taken from the Wikipedia article on UTF-8.
	// https://en.wikipedia.org/wiki/UTF-8#Encoding
	testCases := []struct {
		name            string
		codepointUPlus  string
		codepointSlashU string
		expectedValues  []byte
	}{
		{
			name:            "1 byte encode",
			codepointUPlus:  "U+0024",
			codepointSlashU: "\\U00000024",
			expectedValues:  []byte{0x24},
		},
		{
			name:            "2 byte encode",
			codepointUPlus:  "U+00A3",
			codepointSlashU: "\\U000000A3",
			expectedValues:  []byte{0xC2, 0xA3},
		},
		{
			name:            "3 byte encode",
			codepointUPlus:  "U+20AC",
			codepointSlashU: "\\U000020AC",
			expectedValues:  []byte{0xE2, 0x82, 0xAC},
		},
		{
			name:            "4 byte encode",
			codepointUPlus:  "U+10348",
			codepointSlashU: "\\U00010348",
			expectedValues:  []byte{0xF0, 0x90, 0x8D, 0x88},
		},
	}

	// hexDump renders a byte slice as space-separated hex for failure
	// messages.
	hexDump := func(b []byte) string {
		return fmt.Sprintf("% X", b)
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			testCodepoint := func(t *testing.T, codepointStr string) {
				t.Helper()

				got, err := codepoint.Convert(codepointStr)
				if err != nil {
					t.Fatalf("Convert(%q) returned error: %v", codepointStr, err)
				}
				if len(got) != len(tc.expectedValues) {
					t.Fatalf("Convert(%q) = [%s], want [%s]: lengths not match",
						codepointStr, hexDump(got), hexDump(tc.expectedValues))
				}
				// Compare against the expected bytes; the result must
				// not be compared with itself.
				for i, want := range tc.expectedValues {
					if got[i] != want {
						t.Fatalf("Convert(%q) = [%s], want [%s]: mismatch values at byte %d",
							codepointStr, hexDump(got), hexDump(tc.expectedValues), i)
					}
				}
			}

			t.Run("U+ format", func(t *testing.T) {
				testCodepoint(t, tc.codepointUPlus)
			})

			t.Run("\\U format", func(t *testing.T) {
				testCodepoint(t, tc.codepointSlashU)
			})
		})
	}
}

@ -0,0 +1,3 @@
module github.com/RageCage64/go-utf8-codepoint-converter
go 1.19
Loading…
Cancel
Save