/tinybpe

A tiny implementation of the Byte Pair Encoding (BPE) algorithm commonly used in LLM tokenization. Inspired by karpathy/minbpe.

Primary language: Go. License: MIT.

TinyBPE Library

License: MIT | Go Reference | GitHub go.mod Go version | Go Report Card | GitHub Release | GitHub Downloads (all assets, all releases)

Installation

go get github.com/shadowy-pycoder/tinybpe@latest

Usage

package main

import (
	"fmt"
	"os"

	"github.com/shadowy-pycoder/tinybpe"
)

// main is a usage example: it trains a tokenizer on a text file, saves it,
// then encodes a short phrase and decodes the resulting tokens again.
// Any error aborts the program via panic.
func main() {
	// Training settings passed to Train below.
	const (
		vocabSize = 512
		verbose   = true
	)

	bpe := tinybpe.NewTokenizer()

	corpus, readErr := os.ReadFile("./testdata/t8.shakespeare.txt")
	if readErr != nil {
		panic(readErr)
	}

	bpe.Train(corpus, vocabSize, verbose)
	if saveErr := bpe.Save("test"); saveErr != nil {
		panic(saveErr)
	}

	// Encode a sample and print the tokens, then decode them and print the result.
	encoded := bpe.Encode([]byte("Hello World"))
	fmt.Println(encoded)

	decoded, decodeErr := bpe.Decode(encoded)
	if decodeErr != nil {
		panic(decodeErr)
	}
	fmt.Println(decoded)
}

TinyBPE CLI

See here: README.md