From d71ed1e1ffe9839677ede689cdb8f636f940b5f5 Mon Sep 17 00:00:00 2001
From: RageCage64 <kainsbraydon@gmail.com>
Date: Tue, 30 Aug 2022 12:56:00 -0400
Subject: [PATCH] codepoint: create codepoint package

---
 LICENSE                     |  42 ++++++------
 codepoint/codepoint.go      | 131 ++++++++++++++++++++++++++++++++++++
 codepoint/codepoint_test.go |  73 ++++++++++++++++++++
 go.mod                      |   3 +
 4 files changed, 228 insertions(+), 21 deletions(-)
 create mode 100644 codepoint/codepoint.go
 create mode 100644 codepoint/codepoint_test.go
 create mode 100644 go.mod

diff --git a/LICENSE b/LICENSE
index e6018dd..c9d1b1f 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,21 +1,21 @@
-MIT License
-
-Copyright (c) 2022 RageCage64
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+MIT License
+
+Copyright (c) 2022 Braydon Kains
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/codepoint/codepoint.go b/codepoint/codepoint.go
new file mode 100644
index 0000000..d89ae2a
--- /dev/null
+++ b/codepoint/codepoint.go
@@ -0,0 +1,131 @@
+// Package codepoint takes Unicode codepoints written as strings and
+// encodes them according to the UTF-8 standard. For a better
+// explanation of the method, see the Wikipedia article on UTF-8
+// encoding. https://en.wikipedia.org/wiki/UTF-8#Encoding
+package codepoint
+
+import (
+	"errors"
+	"fmt"
+	"strconv"
+)
+
+// Convert takes a Unicode codepoint written as `U+0000` or `\U00000000`
+// and returns the bytes of its UTF-8 encoding. It returns
+// ErrInvalidCodepoint when the input does not start with `U+` or `\U`
+// (including inputs shorter than two bytes), the strconv.ParseInt error
+// when the rest is not a hexadecimal number, and the getStartBytes error
+// when the value has no UTF-8 width.
+func Convert(cpoint string) ([]byte, error) {
+	// Check the length before slicing: cpoint[:2] panics on inputs
+	// shorter than the two-byte prefix.
+	if len(cpoint) < 2 || (cpoint[:2] != "U+" && cpoint[:2] != "\\U") {
+		return nil, ErrInvalidCodepoint
+	}
+
+	// Everything after the prefix is the hexadecimal value.
+	parsed, err := strconv.ParseInt(cpoint[2:], 16, strconv.IntSize)
+	if err != nil {
+		return nil, err
+	}
+	value := int(parsed)
+
+	// Determine if this requires 1, 2, 3, or 4 bytes.
+	startBytes, err := getStartBytes(value)
+	if err != nil {
+		return nil, err
+	}
+	return convertCodepoint(value, startBytes)
+}
+
+var (
+	ErrInvalidWidth     = errors.New("an invalid width was specified")    // value outside 0..0x10FFFF, or width not in bitCountByWidthTable
+	ErrInvalidCodepoint = errors.New("specified codepoint was not valid") // returned by Convert when the prefix is not U+ or \U
+)
+
+var (
+	startByteTable = map[int]int{ // byte template -> number of codepoint bits that byte carries
+		0b00000000: 7, // 0xxxxxxx
+		0b10000000: 6, // 10xxxxxx
+		0b11000000: 5, // 110xxxxx
+		0b11100000: 4, // 1110xxxx
+		0b11110000: 3, // 11110xxx
+	}
+
+	bitCountByWidthTable = map[int]int{ // encoded width in bytes -> total codepoint bits
+		1: 7,  // 7
+		2: 11, // 5 + 6
+		3: 16, // 4 + 6 + 6
+		4: 21, // 3 + 6 + 6 + 6
+	}
+
+	oneByteEncode   = []int{0b00000000} // byte templates per width: leading byte first, then continuation bytes
+	twoByteEncode   = []int{0b11000000, 0b10000000}
+	threeByteEncode = []int{0b11100000, 0b10000000, 0b10000000}
+	fourByteEncode  = []int{0b11110000, 0b10000000, 0b10000000, 0b10000000}
+)
+
+// getStartBytes returns the byte templates for the UTF-8 width that
+// value needs: the first range limit that value fits under decides the
+// width. Values below 0 or above 0x10FFFF yield ErrInvalidWidth.
+func getStartBytes(value int) ([]int, error) {
+	templates := [][]int{oneByteEncode, twoByteEncode, threeByteEncode, fourByteEncode}
+	for i, limit := range []int{0x7F, 0x7FF, 0xFFFF, 0x10FFFF} {
+		if 0 <= value && value <= limit {
+			return templates[i], nil
+		}
+	}
+
+	return nil, ErrInvalidWidth
+}
+
+// convertCodepoint encodes value as UTF-8 using the byte templates in
+// startBytes, whose length is the encoded width (1, 2, 3, or 4).
+//
+// The value is rendered as a binary string and zero-padded on the left
+// to the bit count for that width. The string is then consumed from the
+// front: each template takes the number of bits startByteTable lists
+// for it, and those bits are OR-ed into the template to form one output
+// byte. Errors from padLeftBits and strconv.ParseInt are returned
+// unchanged.
+func convertCodepoint(value int, startBytes []int) ([]byte, error) {
+	// One template per output byte, so the template count is the width.
+	width := len(startBytes)
+	remaining, err := padLeftBits(fmt.Sprintf("%b", value), width)
+	if err != nil {
+		return nil, err
+	}
+
+	encoded := make([]byte, width)
+	for i, template := range startBytes {
+		// Cut this byte's share of bits off the front of the string.
+		take := startByteTable[template]
+		chunk := remaining[:take]
+		remaining = remaining[take:]
+
+		payload, err := strconv.ParseInt(chunk, 2, strconv.IntSize)
+		if err != nil {
+			return nil, err
+		}
+
+		// OR the parsed bits into the template; byte() keeps the low
+		// eight bits of the result.
+		encoded[i] = byte(template | int(payload))
+	}
+
+	return encoded, nil
+}
+
+// padLeftBits prepends '0' characters to the bit string s until it is as
+// long as the bit count bitCountByWidthTable gives for width. A longer s
+// is returned as is; an unknown width yields ErrInvalidWidth.
+func padLeftBits(s string, width int) (string, error) {
+	target, known := bitCountByWidthTable[width]
+	if !known {
+		return "", ErrInvalidWidth
+	}
+	for len(s) < target {
+		s = "0" + s
+	}
+	return s, nil
+}
diff --git a/codepoint/codepoint_test.go b/codepoint/codepoint_test.go
new file mode 100644
index 0000000..c9f11a2
--- /dev/null
+++ b/codepoint/codepoint_test.go
@@ -0,0 +1,73 @@
+package codepoint_test
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/RageCage64/go-utf8-codepoint-converter/codepoint"
+)
+
+// TestConvert checks Convert's bytes for both input formats at every UTF-8 width.
+func TestConvert(t *testing.T) {
+	// Test cases taken from https://en.wikipedia.org/wiki/UTF-8#Encoding
+	testCases := []struct {
+		name            string
+		codepointUPlus  string
+		codepointSlashU string
+		expectedValues  []byte
+	}{
+		{
+			name:            "1 byte encode",
+			codepointUPlus:  "U+0024",
+			codepointSlashU: "\\U00000024",
+			expectedValues:  []byte{0x24},
+		},
+		{
+			name:            "2 byte encode",
+			codepointUPlus:  "U+00A3",
+			codepointSlashU: "\\U000000A3",
+			expectedValues:  []byte{0xC2, 0xA3},
+		},
+		{
+			name:            "3 byte encode",
+			codepointUPlus:  "U+20AC",
+			codepointSlashU: "\\U000020AC",
+			expectedValues:  []byte{0xE2, 0x82, 0xAC},
+		},
+		{
+			name:            "4 byte encode",
+			codepointUPlus:  "U+10348",
+			codepointSlashU: "\\U00010348",
+			expectedValues:  []byte{0xF0, 0x90, 0x8D, 0x88},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			testCodepoint := func(t *testing.T, codepointStr string) {
+				got, err := codepoint.Convert(codepointStr)
+				if err != nil {
+					t.Fatal(err)
+				}
+				// The length check also keeps got[i] in range in the loop below.
+				if len(got) != len(tc.expectedValues) {
+					t.Fatalf("Convert(%q) = % X, want % X", codepointStr, got, tc.expectedValues)
+				}
+				for i, want := range tc.expectedValues {
+					if got[i] != want {
+						t.Fatalf("Convert(%q) = % X, want % X", codepointStr, got, tc.expectedValues)
+					}
+				}
+			}
+
+			t.Run("U+ format", func(t *testing.T) {
+				testCodepoint(t, tc.codepointUPlus)
+			})
+
+			t.Run("\\U format", func(t *testing.T) {
+				fmt.Println(tc.codepointSlashU)
+				testCodepoint(t, tc.codepointSlashU)
+			})
+		})
+	}
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..545cab3
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/RageCage64/go-utf8-codepoint-converter
+
+go 1.19