Safe Haskell | Safe-Inferred |
---|---|
Language | Haskell2010 |
You can use this module to convert back and forth between a ByteString
and its corresponding tokens using an existing encoding like cl100k_base
or o200k_base.
Example usage:
{-# LANGUAGE OverloadedStrings #-}

import Tiktoken (o200k_base, toTokens, toRanks)

main :: IO ()
main = do
    -- Just ["El"," perro"," come"," las"," man","z","anas"]
    print (toTokens o200k_base "El perro come las manzanas")

    -- Just [4422,96439,3063,1996,873,89,14457]
    print (toRanks o200k_base "El perro come las manzanas")
Synopsis
- data Encoding
- tiktokenToEncoding :: ByteString -> Text -> Either (ParseErrorBundle Text Void) Encoding
- addSpecialTokens :: Map ByteString Int -> Encoding -> Encoding
- r50k_base :: Encoding
- p50k_base :: Encoding
- p50k_edit :: Encoding
- cl100k_base :: Encoding
- o200k_base :: Encoding
- toTokens :: Encoding -> ByteString -> Maybe [ByteString]
- toRanks :: Encoding -> ByteString -> Maybe [Int]
- toTokensAndRanks :: Encoding -> ByteString -> Maybe [(Int, ByteString)]
- fromTokens :: [ByteString] -> ByteString
- fromRanks :: Encoding -> [Int] -> Maybe ByteString
Encoding
This is an efficient internal representation of an encoding like
cl100k_base, p50k_edit, or o200k_base.
Instances
Generic Encoding Source # | |
NFData Encoding Source # | |
type Rep Encoding Source # | |
Defined in Tiktoken type Rep Encoding = D1 ('MetaData "Encoding" "Tiktoken" "tiktoken-1.0.0-RVgUPpRfQrK6y1UQpyMoD" 'False) (C1 ('MetaCons "Encoding" 'PrefixI 'True) ((S1 ('MetaSel ('Just "encode") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 (HashMap ByteString Int)) :*: S1 ('MetaSel ('Just "decode") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 (Vector ByteString))) :*: (S1 ('MetaSel ('Just "specialTokens") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 (Map ByteString Int)) :*: S1 ('MetaSel ('Just "regex") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 ByteString)))) |
tiktokenToEncoding
:: ByteString | Regular expression used for coarse-grained splitting of the input |
-> Text | The contents of the .tiktoken file |
-> Either (ParseErrorBundle Text Void) Encoding |
Parse an encoding from the .tiktoken file format.
addSpecialTokens :: Map ByteString Int -> Encoding -> Encoding Source #
Add special tokens to a base Encoding
Stock Encodings
cl100k_base :: Encoding Source #
The cl100k_base Encoding.
o200k_base :: Encoding Source #
The o200k_base Encoding.
Tokenization
toTokens :: Encoding -> ByteString -> Maybe [ByteString] Source #
Use an Encoding to tokenize a ByteString into smaller ByteStrings.
This only fails if you provide an Encoding that cannot rank all possible
1-byte sequences.
toRanks :: Encoding -> ByteString -> Maybe [Int] Source #
Use an Encoding to tokenize a ByteString into ranks.
This only fails if you provide an Encoding that cannot rank all possible
1-byte sequences.
toTokensAndRanks :: Encoding -> ByteString -> Maybe [(Int, ByteString)] Source #
Tokenizer that is special-token-aware
Detokenization
fromTokens :: [ByteString] -> ByteString Source #
Combine a sequence of ByteString tokens back into a ByteString.
This is just a synonym for Data.ByteString.concat (no Encoding
necessary), provided solely for consistency/convenience.
fromRanks :: Encoding -> [Int] -> Maybe ByteString Source #
Convert a sequence of ranks back into a ByteString.
This will fail if you supply any ranks which are not recognized by the
Encoding.