go-utf8-codepoint-converter/codepoint/codepoint.go

// The codepoint package provides a mechanism to take UTF-8 codepoints
// as strings and encode them according to the UTF-8 standard. For a
// better explanation of the methods, see the Wikipedia article on
// UTF-8 encoding. https://en.wikipedia.org/wiki/UTF-8#Encoding
package codepoint

import (
	"errors"
	"fmt"
	"strconv"
)

// Convert takes in a string of a UTF-8 codepoint in the format
// `U+0000` or `\U00000000` and converts it to a slice of bytes
// that represent the corresponding UTF-8 encoding.
func Convert(cpoint string) ([]byte, error) {
	// Check for valid codepoint.
	if cpoint[:2] != "U+" && cpoint[:2] != "\\U" {
		return nil, ErrInvalidCodepoint
	}

	// Extract the hex number.
	cpointHexStr := cpoint[2:]
	cpointHexVal, err := strconv.ParseInt(cpointHexStr, 16, strconv.IntSize)
	if err != nil {
		return nil, err
	}
	// Determine if this requires 1, 2, 3, or 4 bytes.
	startBytes, err := getStartBytes(int(cpointHexVal))
	if err != nil {
		return nil, err
	}

	utf8Bytes, err := convertCodepoint(int(cpointHexVal), startBytes)
	if err != nil {
		return nil, err
	}
	return utf8Bytes, nil
}

var (
	ErrInvalidWidth     = errors.New("an invalid width was specified")
	ErrInvalidCodepoint = errors.New("specified codepoint was not valid")
)

var (
	startByteTable = map[int]int{
		0b00000000: 7,
		0b10000000: 6,
		0b11000000: 5,
		0b11100000: 4,
		0b11110000: 3,
	}

	bitCountByWidthTable = map[int]int{
		1: 7,
		2: 11,
		3: 16,
		4: 21,
	}

	oneByteEncode   = []int{0b00000000}
	twoByteEncode   = []int{0b11000000, 0b10000000}
	threeByteEncode = []int{0b11100000, 0b10000000, 0b10000000}
	fourByteEncode  = []int{0b11110000, 0b10000000, 0b10000000, 0b10000000}
)

func getStartBytes(value int) ([]int, error) {
	if 0 <= value && value <= 0x7F {
		return oneByteEncode, nil
	} else if 0x80 <= value && value <= 0x7FF {
		return twoByteEncode, nil
	} else if 0x800 <= value && value <= 0xFFFF {
		return threeByteEncode, nil
	} else if 0x10000 <= value && value <= 0x10FFFF {
		return fourByteEncode, nil
	}

	return nil, ErrInvalidWidth
}

func convertCodepoint(value int, startBytes []int) ([]byte, error) {
	// Pad the required number of 0 bits to the left of the binary
	// representation of the hex value. This is based on the width
	// (1, 2, 3, or 4) which is the same as the length of the starting
	// bytes slice.
	s, err := padLeftBits(fmt.Sprintf("%b", value), len(startBytes))
	if err != nil {
		return nil, err
	}

	utf8Bytes := []byte{}
	for _, currStartByte := range startBytes {
		// Use the current byte to figure out how many bits we
		// need from the codepoint value.
		bits := startByteTable[currStartByte]
		currBits := s[0:bits]

		// Parse the bits into an int.
		uintValue, err := strconv.ParseInt(currBits, 2, strconv.IntSize)
		if err != nil {
			return nil, err
		}
		value := int(uintValue)

		// Use an or operation to store the significant bytes of
		// the value into the remaining bits of the start byte.
		utf8Byte := byte(currStartByte | value)
		utf8Bytes = append(utf8Bytes, utf8Byte)

		// Slice the bits we just used off the codepoint binary
		// string.
		s = s[bits:]
	}

	return utf8Bytes, nil
}

func padLeftBits(s string, width int) (string, error) {
	bitCount, ok := bitCountByWidthTable[width]
	if !ok {
		return "", ErrInvalidWidth
	}

	leading := bitCount - len(s)
	zeroes := ""
	for i := 0; i < leading; i++ {
		zeroes += "0"
	}
	return zeroes + s, nil
}