From d71ed1e1ffe9839677ede689cdb8f636f940b5f5 Mon Sep 17 00:00:00 2001
From: RageCage64
Date: Tue, 30 Aug 2022 12:56:00 -0400
Subject: [PATCH] codepoint: create codepoint package

---
 LICENSE                     |  42 ++++++------
 codepoint/codepoint.go      | 131 ++++++++++++++++++++++++++++++++++++
 codepoint/codepoint_test.go |  99 +++++++++++++++++++++++++++
 go.mod                      |   3 +
 4 files changed, 254 insertions(+), 21 deletions(-)
 create mode 100644 codepoint/codepoint.go
 create mode 100644 codepoint/codepoint_test.go
 create mode 100644 go.mod

diff --git a/LICENSE b/LICENSE
index e6018dd..c9d1b1f 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,21 +1,21 @@
-MIT License
-
-Copyright (c) 2022 RageCage64
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+MIT License
+
+Copyright (c) 2022 Braydon Kains
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/codepoint/codepoint.go b/codepoint/codepoint.go
new file mode 100644
index 0000000..d89ae2a
--- /dev/null
+++ b/codepoint/codepoint.go
@@ -0,0 +1,131 @@
+// Package codepoint converts textual Unicode code points, written as
+// `U+0000` or `\U00000000`, into their UTF-8 byte encoding. For an
+// explanation of the encoding itself, see the Wikipedia article on
+// UTF-8. https://en.wikipedia.org/wiki/UTF-8#Encoding
+package codepoint
+
+import (
+	"errors"
+	"fmt"
+	"strconv"
+	"strings"
+)
+
+// Convert takes a Unicode code point written as `U+0000` or
+// `\U00000000` and returns the bytes of its UTF-8 encoding.
+//
+// It returns ErrInvalidCodepoint when the prefix is missing or the
+// value is a UTF-16 surrogate, ErrInvalidWidth when the value is
+// above U+10FFFF, and a wrapped strconv error when the digits are
+// not an unsigned hexadecimal number.
+func Convert(cpoint string) ([]byte, error) {
+	// HasPrefix is safe on inputs shorter than the prefix, where
+	// slicing cpoint[:2] would panic.
+	if !strings.HasPrefix(cpoint, "U+") && !strings.HasPrefix(cpoint, `\U`) {
+		return nil, ErrInvalidCodepoint
+	}
+
+	// ParseUint, unlike ParseInt, rejects a leading sign, so malformed
+	// input such as "U+-24" is reported instead of being accepted.
+	parsed, err := strconv.ParseUint(cpoint[2:], 16, 32)
+	if err != nil {
+		return nil, fmt.Errorf("parsing codepoint %q: %w", cpoint, err)
+	}
+	// Range-check before converting so the value always fits in an int.
+	if parsed > maxCodepoint {
+		return nil, ErrInvalidWidth
+	}
+	value := int(parsed)
+
+	// Surrogates are reserved for UTF-16 and have no UTF-8 encoding.
+	if surrogateMin <= value && value <= surrogateMax {
+		return nil, ErrInvalidCodepoint
+	}
+
+	// Determine if this requires 1, 2, 3, or 4 bytes.
+	startBytes, err := getStartBytes(value)
+	if err != nil {
+		return nil, err
+	}
+	return convertCodepoint(value, startBytes)
+}
+
+var (
+	// ErrInvalidWidth is returned when a value cannot be encoded in
+	// 1 to 4 UTF-8 bytes, such as a code point above U+10FFFF.
+	ErrInvalidWidth = errors.New("an invalid width was specified")
+
+	// ErrInvalidCodepoint is returned when the input does not start
+	// with `U+` or `\U`, or names a UTF-16 surrogate.
+	ErrInvalidCodepoint = errors.New("specified codepoint was not valid")
+)
+
+const (
+	maxCodepoint = 0x10FFFF // Highest code point Unicode defines.
+	surrogateMin = 0xD800   // First UTF-16 surrogate.
+	surrogateMax = 0xDFFF   // Last UTF-16 surrogate.
+)
+
+var (
+	// startByteTable maps a byte's fixed high-bit marker to the number
+	// of low bits left free for code point data.
+	startByteTable = map[int]int{
+		0b00000000: 7,
+		0b10000000: 6,
+		0b11000000: 5,
+		0b11100000: 4,
+		0b11110000: 3,
+	}
+
+	// Marker bytes for each encoding width: one leading byte followed
+	// by zero or more continuation bytes.
+	oneByteEncode   = []int{0b00000000}
+	twoByteEncode   = []int{0b11000000, 0b10000000}
+	threeByteEncode = []int{0b11100000, 0b10000000, 0b10000000}
+	fourByteEncode  = []int{0b11110000, 0b10000000, 0b10000000, 0b10000000}
+)
+
+// getStartBytes returns the marker bytes for the narrowest encoding
+// that can hold value, or ErrInvalidWidth if value is outside
+// U+0000 to U+10FFFF.
+func getStartBytes(value int) ([]int, error) {
+	switch {
+	case value < 0 || value > maxCodepoint:
+		return nil, ErrInvalidWidth
+	case value <= 0x7F:
+		return oneByteEncode, nil
+	case value <= 0x7FF:
+		return twoByteEncode, nil
+	case value <= 0xFFFF:
+		return threeByteEncode, nil
+	default:
+		return fourByteEncode, nil
+	}
+}
+
+// convertCodepoint packs the bits of value into the free low bits of
+// startBytes and returns the resulting bytes. It returns
+// ErrInvalidWidth if startBytes is not 1 to 4 known marker bytes.
+func convertCodepoint(value int, startBytes []int) ([]byte, error) {
+	if len(startBytes) < 1 || len(startBytes) > 4 {
+		return nil, ErrInvalidWidth
+	}
+
+	utf8Bytes := make([]byte, len(startBytes))
+	// Fill from the last byte backwards: the lowest bits of the code
+	// point belong in the final continuation byte.
+	for i := len(startBytes) - 1; i >= 0; i-- {
+		bits, ok := startByteTable[startBytes[i]]
+		if !ok {
+			return nil, ErrInvalidWidth
+		}
+
+		// Or the low `bits` bits of the value into the marker, then
+		// shift them off so the next byte sees the following bits.
+		mask := 1<<bits - 1
+		utf8Bytes[i] = byte(startBytes[i] | value&mask)
+		value >>= bits
+	}
+
+	return utf8Bytes, nil
+}
diff --git a/codepoint/codepoint_test.go b/codepoint/codepoint_test.go
new file mode 100644
index 0000000..c9f11a2
--- /dev/null
+++ b/codepoint/codepoint_test.go
@@ -0,0 +1,99 @@
+package codepoint_test
+
+import (
+	"bytes"
+	"errors"
+	"testing"
+
+	"github.com/RageCage64/go-utf8-codepoint-converter/codepoint"
+)
+
+func TestConvert(t *testing.T) {
+	// These test cases come from the Wikipedia article on UTF-8.
+	// https://en.wikipedia.org/wiki/UTF-8#Encoding
+	testCases := []struct {
+		name            string
+		codepointUPlus  string
+		codepointSlashU string
+		expectedValues  []byte
+	}{
+		{
+			name:            "1 byte encode",
+			codepointUPlus:  "U+0024",
+			codepointSlashU: "\\U00000024",
+			expectedValues:  []byte{0x24},
+		},
+		{
+			name:            "2 byte encode",
+			codepointUPlus:  "U+00A3",
+			codepointSlashU: "\\U000000A3",
+			expectedValues:  []byte{0xC2, 0xA3},
+		},
+		{
+			name:            "3 byte encode",
+			codepointUPlus:  "U+20AC",
+			codepointSlashU: "\\U000020AC",
+			expectedValues:  []byte{0xE2, 0x82, 0xAC},
+		},
+		{
+			name:            "4 byte encode",
+			codepointUPlus:  "U+10348",
+			codepointSlashU: "\\U00010348",
+			expectedValues:  []byte{0xF0, 0x90, 0x8D, 0x88},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			testCodepoint := func(t *testing.T, codepointStr string) {
+				t.Helper()
+				utf8, err := codepoint.Convert(codepointStr)
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				// Compare with the expected bytes; comparing the
+				// result with itself can never fail.
+				if !bytes.Equal(utf8, tc.expectedValues) {
+					t.Fatalf("got % X, want % X", utf8, tc.expectedValues)
+				}
+			}
+
+			t.Run("U+ format", func(t *testing.T) {
+				testCodepoint(t, tc.codepointUPlus)
+			})
+
+			t.Run("\\U format", func(t *testing.T) {
+				testCodepoint(t, tc.codepointSlashU)
+			})
+		})
+	}
+}
+
+func TestConvertInvalid(t *testing.T) {
+	testCases := []struct {
+		name  string
+		input string
+		want  error // nil accepts any non-nil error
+	}{
+		{name: "empty input", input: "", want: codepoint.ErrInvalidCodepoint},
+		{name: "shorter than prefix", input: "U", want: codepoint.ErrInvalidCodepoint},
+		{name: "unknown prefix", input: "0x0024", want: codepoint.ErrInvalidCodepoint},
+		{name: "surrogate", input: "U+D800", want: codepoint.ErrInvalidCodepoint},
+		{name: "above U+10FFFF", input: "U+110000", want: codepoint.ErrInvalidWidth},
+		{name: "no digits", input: "U+"},
+		{name: "signed digits", input: "U+-24"},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := codepoint.Convert(tc.input)
+			if err == nil {
+				t.Fatalf("Convert(%q) = % X, want an error", tc.input, got)
+			}
+			if tc.want != nil && !errors.Is(err, tc.want) {
+				t.Fatalf("Convert(%q) error = %v, want %v", tc.input, err, tc.want)
+			}
+		})
+	}
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..545cab3
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/RageCage64/go-utf8-codepoint-converter
+
+go 1.19