codepoint: create codepoint package

4 years ago · d71ed1e1ff
parent 48795c72f3
commit d71ed1e1ff
4 changed files with 228 additions and 21 deletions
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 MIT License
-Copyright (c) 2022 RageCage64
+Copyright (c) 2022 Braydon Kains
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/codepoint/codepoint.go
+++ b/codepoint/codepoint.go
@ -0,0 +1,131 @@
 // Package codepoint converts Unicode code points written as strings
 // into the bytes of their UTF-8 encoding. For a fuller explanation of
 // the encoding scheme, see the Wikipedia article on UTF-8:
 // https://en.wikipedia.org/wiki/UTF-8#Encoding
 package codepoint
 import (
 	"errors"
 	"fmt"
 	"strconv"
 )
 // Convert encodes a single Unicode code point as UTF-8.
 //
 // cpoint must be written as `U+` or `\U` followed by the code point in
 // hexadecimal, for example `U+20AC` or `\U000020AC`. The result holds
 // the bytes of the corresponding UTF-8 encoding.
 //
 // Convert returns ErrInvalidCodepoint when cpoint is too short to hold
 // a prefix or does not start with `U+` or `\U`, the strconv error when
 // the text after the prefix is not a hexadecimal number, and whatever
 // error the width selection or the encoding step reports otherwise.
 func Convert(cpoint string) ([]byte, error) {
 	// Check the length first: slicing a string shorter than the
 	// two-character prefix would panic instead of reporting an error.
 	if len(cpoint) < 2 {
 		return nil, ErrInvalidCodepoint
 	}
 	// Check for a valid prefix.
 	if prefix := cpoint[:2]; prefix != "U+" && prefix != "\\U" {
 		return nil, ErrInvalidCodepoint
 	}
 	// Extract the hex number that follows the prefix.
 	cpointHexVal, err := strconv.ParseInt(cpoint[2:], 16, strconv.IntSize)
 	if err != nil {
 		return nil, err
 	}
 	// Determine if this requires 1, 2, 3, or 4 bytes.
 	startBytes, err := getStartBytes(int(cpointHexVal))
 	if err != nil {
 		return nil, err
 	}
 	// Fill the start bytes with the bits of the code point value.
 	return convertCodepoint(int(cpointHexVal), startBytes)
 }
 // ErrInvalidWidth is the sentinel error for a byte width, or a value
 // needing a byte width, that this package does not handle.
 var ErrInvalidWidth = errors.New("an invalid width was specified")
 // ErrInvalidCodepoint is the sentinel error for an input string that
 // is not in a recognized code point format.
 var ErrInvalidCodepoint = errors.New("specified codepoint was not valid")
 // startByteTable maps the fixed high-order marker bits of an encoded
 // byte to the number of low-order bits that byte has left for code
 // point data.
 var startByteTable = map[int]int{
 	0b0000_0000: 7,
 	0b1000_0000: 6,
 	0b1100_0000: 5,
 	0b1110_0000: 4,
 	0b1111_0000: 3,
 }
 // bitCountByWidthTable maps an encoding width in bytes to the total
 // number of code point bits carried by an encoding of that width.
 var bitCountByWidthTable = map[int]int{
 	1: 7,
 	2: 11,
 	3: 16,
 	4: 21,
 }
 // Marker byte templates for each encoding width, in output order. Every
 // element is a key of startByteTable.
 var (
 	oneByteEncode   = []int{0b0000_0000}
 	twoByteEncode   = []int{0b1100_0000, 0b1000_0000}
 	threeByteEncode = []int{0b1110_0000, 0b1000_0000, 0b1000_0000}
 	fourByteEncode  = []int{0b1111_0000, 0b1000_0000, 0b1000_0000, 0b1000_0000}
 )
 func getStartBytes(value int) ([]int, error) {
 	if 0 <= value && value <= 0x7F {
 		return oneByteEncode, nil
 	} else if 0x80 <= value && value <= 0x7FF {
 		return twoByteEncode, nil
 	} else if 0x800 <= value && value <= 0xFFFF {
 		return threeByteEncode, nil
 	} else if 0x10000 <= value && value <= 0x10FFFF {
 		return fourByteEncode, nil
 	}
 	return nil, ErrInvalidWidth
 }
 // convertCodepoint distributes the binary digits of value across the
 // marker bytes in startBytes and returns the resulting bytes in order.
 // It returns the error from padLeftBits when the number of start bytes
 // is not a supported width, or the strconv error when a group of digits
 // cannot be parsed.
 func convertCodepoint(value int, startBytes []int) ([]byte, error) {
 	// Render value in binary, left-padded with zeroes to the full bit
 	// count of this width; the width equals the number of start bytes.
 	remaining, err := padLeftBits(fmt.Sprintf("%b", value), len(startBytes))
 	if err != nil {
 		return nil, err
 	}
 	encoded := make([]byte, 0, len(startBytes))
 	for _, marker := range startBytes {
 		// The marker decides how many digits this byte consumes.
 		payloadBits := startByteTable[marker]
 		// Take those digits off the front of the binary string.
 		chunk := remaining[:payloadBits]
 		remaining = remaining[payloadBits:]
 		// Turn the digits back into a number.
 		payload, err := strconv.ParseInt(chunk, 2, strconv.IntSize)
 		if err != nil {
 			return nil, err
 		}
 		// OR the payload into the free low-order bits of the marker.
 		encoded = append(encoded, byte(marker|int(payload)))
 	}
 	return encoded, nil
 }
 // padLeftBits prefixes s with "0" characters until it is as long as
 // the bit count registered for width in bitCountByWidthTable. A string
 // that is already at least that long is returned unchanged. An
 // unregistered width yields ErrInvalidWidth.
 func padLeftBits(s string, width int) (string, error) {
 	target, known := bitCountByWidthTable[width]
 	if !known {
 		return "", ErrInvalidWidth
 	}
 	missing := target - len(s)
 	if missing <= 0 {
 		// Nothing to add.
 		return s, nil
 	}
 	padded := make([]byte, 0, target)
 	for ; missing > 0; missing-- {
 		padded = append(padded, '0')
 	}
 	return string(append(padded, s...)), nil
 }
--- a/codepoint/codepoint_test.go
+++ b/codepoint/codepoint_test.go
@ -0,0 +1,73 @@
 package codepoint_test
 import (
 	"fmt"
 	"testing"
 	"github.com/RageCage64/go-utf8-codepoint-converter/codepoint"
 )
 // TestConvert checks that codepoint.Convert returns the expected UTF-8
 // bytes for one code point of each encoded width, using both the `U+`
 // and the `\U` input formats.
 func TestConvert(t *testing.T) {
 	// Test vectors taken from the examples in the Wikipedia UTF-8
 	// article. https://en.wikipedia.org/wiki/UTF-8#Encoding
 	testCases := []struct {
 		name            string
 		codepointUPlus  string
 		codepointSlashU string
 		expectedValues  []byte
 	}{
 		{
 			name:            "1 byte encode",
 			codepointUPlus:  "U+0024",
 			codepointSlashU: "\\U00000024",
 			expectedValues:  []byte{0x24},
 		},
 		{
 			name:            "2 byte encode",
 			codepointUPlus:  "U+00A3",
 			codepointSlashU: "\\U000000A3",
 			expectedValues:  []byte{0xC2, 0xA3},
 		},
 		{
 			name:            "3 byte encode",
 			codepointUPlus:  "U+20AC",
 			codepointSlashU: "\\U000020AC",
 			expectedValues:  []byte{0xE2, 0x82, 0xAC},
 		},
 		{
 			name:            "4 byte encode",
 			codepointUPlus:  "U+10348",
 			codepointSlashU: "\\U00010348",
 			expectedValues:  []byte{0xF0, 0x90, 0x8D, 0x88},
 		},
 	}
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			// testCodepoint converts one input string and compares
 			// the result with the expected bytes of this test case.
 			testCodepoint := func(t *testing.T, codepointStr string) {
 				t.Helper()
 				got, err := codepoint.Convert(codepointStr)
 				if err != nil {
 					t.Fatal(err)
 				}
 				if len(got) != len(tc.expectedValues) {
 					t.Fatalf("length mismatch for %q: got % X, want % X", codepointStr, got, tc.expectedValues)
 				}
 				for i, v := range got {
 					// Compare against the expected byte, not against
 					// the result itself, so wrong values are caught.
 					if v != tc.expectedValues[i] {
 						t.Fatalf("byte %d mismatch for %q: got % X, want % X", i, codepointStr, got, tc.expectedValues)
 					}
 				}
 			}
 			t.Run("U+ format", func(t *testing.T) {
 				testCodepoint(t, tc.codepointUPlus)
 			})
 			t.Run("\\U format", func(t *testing.T) {
 				fmt.Println(tc.codepointSlashU)
 				testCodepoint(t, tc.codepointSlashU)
 			})
 		})
 	}
 }
--- a/go.mod
+++ b/go.mod
@ -0,0 +1,3 @@
 module github.com/RageCage64/go-utf8-codepoint-converter
 go 1.19
		`@ -0,0 +1,3 @@`
							`module github.com/RageCage64/go-utf8-codepoint-converter`

							`go 1.19`