Safe Haskell	Safe-Inferred
Language	Haskell2010

Tiktoken

Contents

Encoding
Stock Encodings
Tokenization
Detokenization

Description

You can use this module to convert back and forth between a ByteString and its corresponding tokens using an existing encoding such as cl100k_base or o200k_base.

Example usage:

{-# LANGUAGE OverloadedStrings #-}

import Tiktoken (o200k_base, toTokens, toRanks)

main :: IO ()
main = do
    -- Just ["El"," perro"," come"," las"," man","z","anas"]
    print (toTokens o200k_base "El perro come las manzanas")

    -- Just [4422,96439,3063,1996,873,89,14457]
    print (toRanks o200k_base "El perro come las manzanas")

Synopsis

data Encoding
tiktokenToEncoding :: ByteString -> Text -> Either (ParseErrorBundle Text Void) Encoding
addSpecialTokens :: Map ByteString Int -> Encoding -> Encoding
r50k_base :: Encoding
p50k_base :: Encoding
p50k_edit :: Encoding
cl100k_base :: Encoding
o200k_base :: Encoding
toTokens :: Encoding -> ByteString -> Maybe [ByteString]
toRanks :: Encoding -> ByteString -> Maybe [Int]
toTokensAndRanks :: Encoding -> ByteString -> Maybe [(Int, ByteString)]
fromTokens :: [ByteString] -> ByteString
fromRanks :: Encoding -> [Int] -> Maybe ByteString

Encoding

data Encoding Source #

This is an efficient internal representation of an encoding like cl100k_base, p50k_edit, or o200k_base

Instances

Instances details

Generic Encoding Source #
Instance details Defined in Tiktoken Associated Types type Rep Encoding :: Type -> Type # Methods from :: Encoding -> Rep Encoding x # to :: Rep Encoding x -> Encoding #
NFData Encoding Source #
Instance details Defined in Tiktoken Methods rnf :: Encoding -> () #
type Rep Encoding Source #
Instance details Defined in Tiktoken type Rep Encoding = D1 ('MetaData "Encoding" "Tiktoken" "tiktoken-1.0.2-8bP3kw6JoV7IEn3Wtil93u" 'False) (C1 ('MetaCons "Encoding" 'PrefixI 'True) ((S1 ('MetaSel ('Just "encode") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 (HashMap ByteString Int)) :*: S1 ('MetaSel ('Just "decode") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 (IntMap ByteString))) :*: (S1 ('MetaSel ('Just "specialTokens") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 (Map ByteString Int)) :*: S1 ('MetaSel ('Just "regex") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 ByteString))))

tiktokenToEncoding Source #

Arguments

:: ByteString	Regular expression used for coarse-grained splitting of the input
-> Text	The contents of the .tiktoken file
-> Either (ParseErrorBundle Text Void) Encoding

Parse an encoding from the .tiktoken file format

addSpecialTokens :: Map ByteString Int -> Encoding -> Encoding Source #

Add special tokens to a base Encoding

Stock Encodings

r50k_base :: Encoding Source #

r50k_base Encoding

p50k_base :: Encoding Source #

p50k_base Encoding

p50k_edit :: Encoding Source #

p50k_edit Encoding

cl100k_base :: Encoding Source #

cl100k_base Encoding

o200k_base :: Encoding Source #

o200k_base Encoding

Tokenization

toTokens :: Encoding -> ByteString -> Maybe [ByteString] Source #

Use an Encoding to tokenize a ByteString into smaller ByteStrings

This returns Nothing only if you provide an Encoding that cannot rank every possible 1-byte sequence.

toRanks :: Encoding -> ByteString -> Maybe [Int] Source #

Use an Encoding to tokenize a ByteString into ranks

This returns Nothing only if you provide an Encoding that cannot rank every possible 1-byte sequence.

toTokensAndRanks :: Encoding -> ByteString -> Maybe [(Int, ByteString)] Source #

Use an Encoding to tokenize a ByteString into smaller ByteStrings and their associated ranks

This returns Nothing only if you provide an Encoding that cannot rank every possible 1-byte sequence.

Detokenization

fromTokens :: [ByteString] -> ByteString Source #

Combine a sequence of ByteString tokens back into a ByteString

This is just a synonym for Data.ByteString.concat (no Encoding necessary), provided solely for consistency/convenience.

fromRanks :: Encoding -> [Int] -> Maybe ByteString Source #

Convert a sequence of ranks back into a ByteString

This returns Nothing if any of the supplied ranks is not recognized by the Encoding.