{-# LANGUAGE BlockArguments        #-}
{-# LANGUAGE DeriveAnyClass        #-}
{-# LANGUAGE DeriveGeneric         #-}
{-# LANGUAGE DerivingStrategies    #-}
{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE NamedFieldPuns        #-}
{-# LANGUAGE NumericUnderscores    #-}
{-# LANGUAGE OverloadedStrings     #-}
{-# LANGUAGE OverloadedLists       #-}
{-# LANGUAGE QuasiQuotes           #-}
{-# LANGUAGE RecordWildCards       #-}

-- | You can use this module to convert back and forth between a `ByteString`
--   and its corresponding tokens using an existing encoding like @cl100k_base@
--   or @o200k_base@
--
--   Example usage:
--
-- @
-- {-# LANGUAGE OverloadedStrings #-}
--
-- import "Tiktoken" (`o200k_base`, toTokens, toRanks)
--
-- main :: `IO` ()
-- main = do
--     -- `Just` [\"El\",\" perro\",\" come\",\" las\",\" man\",\"z\",\"anas\"]
--     `print` (`toTokens` `o200k_base` \"El perro come las manzanas\")
--
--     -- `Just` [4422,96439,3063,1996,873,89,14457]
--     `print` (`toRanks` `o200k_base` \"El perro come las manzanas\")
-- @
module Tiktoken
    ( -- * Encoding
      Encoding
    , tiktokenToEncoding
    , addSpecialTokens

      -- * Stock Encodings
    , r50k_base
    , p50k_base
    , p50k_edit
    , cl100k_base
    , o200k_base

      -- * Tokenization
    , toTokens
    , toRanks
    , toTokensAndRanks

      -- * Detokenization
    , fromTokens
    , fromRanks
    ) where

import Control.Applicative ((<|>))
import Control.DeepSeq (NFData)
import Control.Monad.ST (ST)
import Control.Monad.Trans.Class (lift)
import Data.ByteString (ByteString)
import Data.Function (on)
import Data.HashMap.Strict (HashMap)
import Data.IntMap (IntMap, Key)
import Data.List.NonEmpty (NonEmpty(..))
import Data.Map (Map)
import Data.Text (Text)
import Data.Vector (MVector, Vector, (!?))
import Data.Void (Void)
import Data.Word (Word8)
import GHC.Generics (Generic)
import System.FilePath ((</>))
import Text.Megaparsec (ParseErrorBundle, ParsecT)
import Text.RawString.QQ (r)

import qualified Control.Exception as Exception
import qualified Data.ByteString as ByteString
import qualified Data.ByteString.Base64 as Base64.Encoding
import qualified Data.ByteString.Char8 as Char8
import qualified Data.HashMap.Strict as HashMap
import qualified Data.IntMap.Strict as IntMap
import qualified Data.List as List
import qualified Data.List.NonEmpty as NonEmpty
import qualified Data.Map as Map
import qualified Data.Ord as Ord
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Text.Encoding
import qualified Data.Text.IO as Text.IO
import qualified Data.Vector as Vector
import qualified Data.Vector.Mutable as Vector.Mutable
import qualified Paths_tiktoken as Paths
import qualified System.IO.Unsafe as Unsafe
import qualified Text.Megaparsec as Megaparsec
import qualified Text.Megaparsec.Char as Megaparsec.Char
import qualified Text.Regex.PCRE.Light as Regex

{-| This is an efficient internal representation of an encoding like
    @cl100k_base@, @p50k_edit@, or @o200k_base@
-}
data Encoding = Encoding
    { Encoding -> HashMap ByteString Int
encode :: HashMap ByteString Int
    , Encoding -> Vector ByteString
decode :: Vector ByteString
    , Encoding -> Map ByteString Int
specialTokens :: Map ByteString Int
    , Encoding -> ByteString
regex :: ByteString
    } deriving stock ((forall x. Encoding -> Rep Encoding x)
-> (forall x. Rep Encoding x -> Encoding) -> Generic Encoding
forall x. Rep Encoding x -> Encoding
forall x. Encoding -> Rep Encoding x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cfrom :: forall x. Encoding -> Rep Encoding x
from :: forall x. Encoding -> Rep Encoding x
$cto :: forall x. Rep Encoding x -> Encoding
to :: forall x. Rep Encoding x -> Encoding
Generic)
      deriving anyclass (Encoding -> ()
(Encoding -> ()) -> NFData Encoding
forall a. (a -> ()) -> NFData a
$crnf :: Encoding -> ()
rnf :: Encoding -> ()
NFData)

parseToken :: ParsecT Void Text m ByteString
parseToken :: forall (m :: * -> *). ParsecT Void Text m ByteString
parseToken = do
    Text
base64Text <- Maybe String
-> (Token Text -> Bool) -> ParsecT Void Text m (Tokens Text)
forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
Megaparsec.takeWhileP (String -> Maybe String
forall a. a -> Maybe a
Just String
"Base64 character") (Token Text -> Token Text -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
Token Text
' ')

    let base64Bytes :: ByteString
base64Bytes = Text -> ByteString
Text.Encoding.encodeUtf8 Text
base64Text

    ByteString
token <- case ByteString -> Either Text ByteString
Base64.Encoding.decodeBase64Untyped ByteString
base64Bytes of
        Left Text
text -> String -> ParsecT Void Text m ByteString
forall a. String -> ParsecT Void Text m a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail (Text -> String
Text.unpack Text
text)
        Right ByteString
token -> ByteString -> ParsecT Void Text m ByteString
forall a. a -> ParsecT Void Text m a
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
token

    -- We don't bother parsing the token ID because the tokens are always stored
    -- in sequential order by token ID.  We could *not* assume this but this
    -- would not only make the parsing slower but it would also require using
    -- a `HashMap` instead of a `Vector` to handle potential gaps in the token
    -- ID sequence.  It's much more efficient to make this simplifying
    -- assumption.

    Tokens Text
_ <- Maybe String
-> (Token Text -> Bool) -> ParsecT Void Text m (Tokens Text)
forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
Megaparsec.takeWhileP (String -> Maybe String
forall a. a -> Maybe a
Just String
"Base64 character") (Token Text -> Token Text -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
Token Text
'\n')

    Char
_ <- Token Text -> ParsecT Void Text m (Token Text)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
Megaparsec.Char.char Char
Token Text
'\n'

    ByteString -> ParsecT Void Text m ByteString
forall a. a -> ParsecT Void Text m a
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
token

parseDecode :: ParsecT Void Text (ST s) (MVector s ByteString)
parseDecode :: forall s. ParsecT Void Text (ST s) (MVector s ByteString)
parseDecode = do
    -- 100,000 is the size of the largest commonly-used encoding at the time of
    -- this writing (`cl100k_base`) and it's not that expensive to pre-allocate
    -- a `Vector` that big, so let's go wild and start with a large allocation.
    let initialSize :: Int
initialSize = Int
100_000

    MVector s ByteString
initialVector <- ST s (MVector s ByteString)
-> ParsecT Void Text (ST s) (MVector s ByteString)
forall (m :: * -> *) a. Monad m => m a -> ParsecT Void Text m a
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (Int -> ST s (MVector (PrimState (ST s)) ByteString)
forall (m :: * -> *) a.
PrimMonad m =>
Int -> m (MVector (PrimState m) a)
Vector.Mutable.new Int
initialSize)

    let loop :: Int
-> MVector (PrimState m) ByteString
-> ParsecT Void Text m (MVector (PrimState m) ByteString)
loop Int
index MVector (PrimState m) ByteString
vector
            | Int
index Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
size = do
                let success :: ParsecT Void Text m (MVector (PrimState m) ByteString)
success = do
                        ByteString
token <- ParsecT Void Text m ByteString
forall (m :: * -> *). ParsecT Void Text m ByteString
parseToken

                        m () -> ParsecT Void Text m ()
forall (m :: * -> *) a. Monad m => m a -> ParsecT Void Text m a
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (MVector (PrimState m) ByteString -> Int -> ByteString -> m ()
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> Int -> a -> m ()
Vector.Mutable.write MVector (PrimState m) ByteString
vector Int
index ByteString
token)

                        Int
-> MVector (PrimState m) ByteString
-> ParsecT Void Text m (MVector (PrimState m) ByteString)
loop (Int
index Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) MVector (PrimState m) ByteString
vector

                let failure :: ParsecT Void Text m (MVector (PrimState m) ByteString)
failure = do
                        MVector (PrimState m) ByteString
-> ParsecT Void Text m (MVector (PrimState m) ByteString)
forall a. a -> ParsecT Void Text m a
forall (m :: * -> *) a. Monad m => a -> m a
return (Int
-> MVector (PrimState m) ByteString
-> MVector (PrimState m) ByteString
forall s a. Int -> MVector s a -> MVector s a
Vector.Mutable.take Int
index MVector (PrimState m) ByteString
vector)

                ParsecT Void Text m (MVector (PrimState m) ByteString)
success ParsecT Void Text m (MVector (PrimState m) ByteString)
-> ParsecT Void Text m (MVector (PrimState m) ByteString)
-> ParsecT Void Text m (MVector (PrimState m) ByteString)
forall a.
ParsecT Void Text m a
-> ParsecT Void Text m a -> ParsecT Void Text m a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> ParsecT Void Text m (MVector (PrimState m) ByteString)
failure
                
            | Bool
otherwise = do
                MVector (PrimState m) ByteString
largerVector <- m (MVector (PrimState m) ByteString)
-> ParsecT Void Text m (MVector (PrimState m) ByteString)
forall (m :: * -> *) a. Monad m => m a -> ParsecT Void Text m a
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (MVector (PrimState m) ByteString
-> Int -> m (MVector (PrimState m) ByteString)
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> Int -> m (MVector (PrimState m) a)
Vector.Mutable.grow MVector (PrimState m) ByteString
vector Int
size)

                Int
-> MVector (PrimState m) ByteString
-> ParsecT Void Text m (MVector (PrimState m) ByteString)
loop Int
index MVector (PrimState m) ByteString
largerVector
          where
            size :: Int
size = MVector (PrimState m) ByteString -> Int
forall s a. MVector s a -> Int
Vector.Mutable.length MVector (PrimState m) ByteString
vector

    Int
-> MVector (PrimState (ST s)) ByteString
-> ParsecT Void Text (ST s) (MVector (PrimState (ST s)) ByteString)
forall {m :: * -> *}.
PrimMonad m =>
Int
-> MVector (PrimState m) ByteString
-> ParsecT Void Text m (MVector (PrimState m) ByteString)
loop Int
0 MVector s ByteString
MVector (PrimState (ST s)) ByteString
initialVector

-- | Create an `Encoding` from regular expression and an ordered set of tokens
tokensToEncoding
    :: ByteString
    -- ^ Regular expression used for coarse-grained splitting of the input
    -> Vector ByteString
    -- ^ The tokens in sequential order of their token IDs
    -> Encoding
tokensToEncoding :: ByteString -> Vector ByteString -> Encoding
tokensToEncoding ByteString
regex Vector ByteString
decode = Encoding{ByteString
Map ByteString Int
HashMap ByteString Int
Vector ByteString
$sel:encode:Encoding :: HashMap ByteString Int
$sel:decode:Encoding :: Vector ByteString
$sel:specialTokens:Encoding :: Map ByteString Int
$sel:regex:Encoding :: ByteString
regex :: ByteString
decode :: Vector ByteString
encode :: HashMap ByteString Int
specialTokens :: Map ByteString Int
..}
  where
    encode :: HashMap ByteString Int
encode = [(ByteString, Int)] -> HashMap ByteString Int
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
HashMap.fromList (Vector (ByteString, Int) -> [(ByteString, Int)]
forall a. Vector a -> [a]
Vector.toList ((Int -> ByteString -> (ByteString, Int))
-> Vector ByteString -> Vector (ByteString, Int)
forall a b. (Int -> a -> b) -> Vector a -> Vector b
Vector.imap Int -> ByteString -> (ByteString, Int)
forall {b} {a}. b -> a -> (a, b)
adapt Vector ByteString
decode))
      where
        adapt :: b -> a -> (a, b)
adapt b
index a
token = (a
token, b
index)

    specialTokens :: Map ByteString Int
specialTokens = Map ByteString Int
forall a. Monoid a => a
mempty

-- | Parse an encoding from the @.tiktoken@ file format
tiktokenToEncoding
    :: ByteString
    -- ^ Regular expression used for coarse-grained splitting of the input
    -> Text
    -- ^ The contents of the @.tiktoken@ file
    -> Either (ParseErrorBundle Text Void) Encoding
tiktokenToEncoding :: ByteString -> Text -> Either (ParseErrorBundle Text Void) Encoding
tiktokenToEncoding ByteString
regex Text
text =
    (Vector ByteString -> Encoding)
-> Either (ParseErrorBundle Text Void) (Vector ByteString)
-> Either (ParseErrorBundle Text Void) Encoding
forall a b.
(a -> b)
-> Either (ParseErrorBundle Text Void) a
-> Either (ParseErrorBundle Text Void) b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (ByteString -> Vector ByteString -> Encoding
tokensToEncoding ByteString
regex)
        ((forall s.
 ST s (Either (ParseErrorBundle Text Void) (MVector s ByteString)))
-> Either (ParseErrorBundle Text Void) (Vector ByteString)
forall (f :: * -> *) a.
Traversable f =>
(forall s. ST s (f (MVector s a))) -> f (Vector a)
Vector.createT (ParsecT Void Text (ST s) (MVector s ByteString)
-> String
-> Text
-> ST
     s (Either (ParseErrorBundle Text Void) (MVector s ByteString))
forall (m :: * -> *) e s a.
Monad m =>
ParsecT e s m a
-> String -> s -> m (Either (ParseErrorBundle s e) a)
Megaparsec.runParserT ParsecT Void Text (ST s) (MVector s ByteString)
forall s. ParsecT Void Text (ST s) (MVector s ByteString)
parseDecode String
"" Text
text))

-- | Add special tokens to a base `Encoding`
addSpecialTokens :: Map ByteString Int -> Encoding -> Encoding
addSpecialTokens :: Map ByteString Int -> Encoding -> Encoding
addSpecialTokens Map ByteString Int
tokens Encoding{ $sel:specialTokens:Encoding :: Encoding -> Map ByteString Int
specialTokens = Map ByteString Int
oldSpecialTokens, ByteString
HashMap ByteString Int
Vector ByteString
$sel:encode:Encoding :: Encoding -> HashMap ByteString Int
$sel:decode:Encoding :: Encoding -> Vector ByteString
$sel:regex:Encoding :: Encoding -> ByteString
encode :: HashMap ByteString Int
decode :: Vector ByteString
regex :: ByteString
.. } =
    Encoding{ $sel:specialTokens:Encoding :: Map ByteString Int
specialTokens = Map ByteString Int -> Map ByteString Int -> Map ByteString Int
forall k a. Ord k => Map k a -> Map k a -> Map k a
Map.union Map ByteString Int
tokens Map ByteString Int
oldSpecialTokens, ByteString
HashMap ByteString Int
Vector ByteString
$sel:encode:Encoding :: HashMap ByteString Int
$sel:decode:Encoding :: Vector ByteString
$sel:regex:Encoding :: ByteString
encode :: HashMap ByteString Int
decode :: Vector ByteString
regex :: ByteString
.. }

_ENDOFTEXT :: ByteString
_ENDOFTEXT :: ByteString
_ENDOFTEXT = ByteString
"<|endoftext|>"

_FIM_PREFIX :: ByteString
_FIM_PREFIX :: ByteString
_FIM_PREFIX = ByteString
"<|fim_prefix|>"

_FIM_MIDDLE :: ByteString
_FIM_MIDDLE :: ByteString
_FIM_MIDDLE = ByteString
"<|fim_middle|>"

_FIM_SUFFIX :: ByteString
_FIM_SUFFIX :: ByteString
_FIM_SUFFIX = ByteString
"<|fim_suffix|>"

_ENDOFPROMPT :: ByteString
_ENDOFPROMPT :: ByteString
_ENDOFPROMPT = ByteString
"<|endofprompt|>"

loadEncoding :: FilePath -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding :: String -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding String
file ByteString
regex Map ByteString Int
specialTokens = do
    String
dataDirectory <- IO String
Paths.getDataDir

    Text
text <- String -> IO Text
Text.IO.readFile (String
dataDirectory String -> String -> String
</> String
file)

    Encoding
encoding <- case ByteString -> Text -> Either (ParseErrorBundle Text Void) Encoding
tiktokenToEncoding ByteString
regex Text
text of
        Left ParseErrorBundle Text Void
exception -> ParseErrorBundle Text Void -> IO Encoding
forall e a. Exception e => e -> IO a
Exception.throwIO ParseErrorBundle Text Void
exception
        Right Encoding
encoding -> Encoding -> IO Encoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return Encoding
encoding

    Encoding -> IO Encoding
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Map ByteString Int -> Encoding -> Encoding
addSpecialTokens Map ByteString Int
specialTokens Encoding
encoding)

-- | @r50k_base@ `Encoding`
r50k_base :: Encoding
r50k_base :: Encoding
r50k_base =
    IO Encoding -> Encoding
forall a. IO a -> a
Unsafe.unsafePerformIO
        (String -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding String
"r50k_base.tiktoken" ByteString
regex [ (ByteString
_ENDOFTEXT, Int
50256) ])
  where
    regex :: ByteString
regex =
        ByteString
[r|'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+|]
{-# NOINLINE r50k_base #-}

-- | @p50k_base@ `Encoding`
p50k_base :: Encoding
p50k_base :: Encoding
p50k_base =
    IO Encoding -> Encoding
forall a. IO a -> a
Unsafe.unsafePerformIO
        (String -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding String
"p50k_base.tiktoken" ByteString
regex [ (ByteString
_ENDOFTEXT, Int
50256) ])
  where
    regex :: ByteString
regex =
        ByteString
[r|'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+|]
{-# NOINLINE p50k_base #-}

-- | @p50k_edit@ `Encoding`
p50k_edit :: Encoding
p50k_edit :: Encoding
p50k_edit =
    IO Encoding -> Encoding
forall a. IO a -> a
Unsafe.unsafePerformIO
        (String -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding String
"p50k_base.tiktoken"
            ByteString
regex
            [ (ByteString
_ENDOFTEXT , Int
50256)
            , (ByteString
_FIM_PREFIX, Int
50281)
            , (ByteString
_FIM_MIDDLE, Int
50282)
            , (ByteString
_FIM_SUFFIX, Int
50283)
            ] 
        )
  where
    regex :: ByteString
regex =
        ByteString
[r|'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+|]
{-# NOINLINE p50k_edit #-}

-- | @cl100k_base@ `Encoding`
cl100k_base :: Encoding
cl100k_base :: Encoding
cl100k_base =
    IO Encoding -> Encoding
forall a. IO a -> a
Unsafe.unsafePerformIO
        (String -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding String
"cl100k_base.tiktoken"
            ByteString
regex
            [ (ByteString
_ENDOFTEXT  , Int
100257)
            , (ByteString
_FIM_PREFIX , Int
100258)
            , (ByteString
_FIM_MIDDLE , Int
100259)
            , (ByteString
_FIM_SUFFIX , Int
100260)
            , (ByteString
_ENDOFPROMPT, Int
100276)
            ]
        )
  where
    regex :: ByteString
regex =
        ByteString
[r|'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+|]
{-# NOINLINE cl100k_base #-}

-- | @o200k_base@ `Encoding`
o200k_base :: Encoding
o200k_base :: Encoding
o200k_base =
    IO Encoding -> Encoding
forall a. IO a -> a
Unsafe.unsafePerformIO
        (String -> ByteString -> Map ByteString Int -> IO Encoding
loadEncoding String
"o200k_base.tiktoken"
            ByteString
regex
            [ (ByteString
_ENDOFTEXT  , Int
199999)
            , (ByteString
_ENDOFPROMPT, Int
200018)
            ]
        )
  where
    regex :: ByteString
regex =
        ByteString -> [ByteString] -> ByteString
Char8.intercalate ByteString
"|"
            [ Item [ByteString]
[r|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|]
            , Item [ByteString]
[r|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|]
            , Item [ByteString]
[r|\p{N}{1,3}|]
            , Item [ByteString]
[r| ?[^\s\p{L}\p{N}]+[\r\n/]*|]
            , Item [ByteString]
[r|\s*[\r\n]+|]
            , Item [ByteString]
[r|\s+(?!\S)|]
            , Item [ByteString]
[r|\s+|]
            ]
{-# NOINLINE o200k_base #-}

minimumBy :: (a -> a -> Ordering) -> IntMap a -> Maybe (Int, a)
minimumBy :: forall a. (a -> a -> Ordering) -> IntMap a -> Maybe (Int, a)
minimumBy a -> a -> Ordering
comparison IntMap a
intMap
    | IntMap a -> Bool
forall a. IntMap a -> Bool
IntMap.null IntMap a
intMap =
        Maybe (Int, a)
forall a. Maybe a
Nothing
    | Bool
otherwise =
        (Int, a) -> Maybe (Int, a)
forall a. a -> Maybe a
Just (((Int, a) -> (Int, a) -> Ordering) -> [(Int, a)] -> (Int, a)
forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
List.minimumBy (a -> a -> Ordering
comparison (a -> a -> Ordering)
-> ((Int, a) -> a) -> (Int, a) -> (Int, a) -> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` (Int, a) -> a
forall a b. (a, b) -> b
snd) (IntMap a -> [(Int, a)]
forall a. IntMap a -> [(Int, a)]
IntMap.toList IntMap a
intMap))

drop1 :: [a] -> [a]
drop1 :: forall a. [a] -> [a]
drop1 (a
_ : [a]
xs) = [a]
xs
drop1      []  = []

{-| This is basically the same thing as `Maybe Int` except with an `Ord`
    instance that treats `Ranked` values as less than `Unranked` values
-}
data Ranked = Ranked Int | Unranked
    deriving (Ranked -> Ranked -> Bool
(Ranked -> Ranked -> Bool)
-> (Ranked -> Ranked -> Bool) -> Eq Ranked
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Ranked -> Ranked -> Bool
== :: Ranked -> Ranked -> Bool
$c/= :: Ranked -> Ranked -> Bool
/= :: Ranked -> Ranked -> Bool
Eq, Eq Ranked
Eq Ranked =>
(Ranked -> Ranked -> Ordering)
-> (Ranked -> Ranked -> Bool)
-> (Ranked -> Ranked -> Bool)
-> (Ranked -> Ranked -> Bool)
-> (Ranked -> Ranked -> Bool)
-> (Ranked -> Ranked -> Ranked)
-> (Ranked -> Ranked -> Ranked)
-> Ord Ranked
Ranked -> Ranked -> Bool
Ranked -> Ranked -> Ordering
Ranked -> Ranked -> Ranked
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: Ranked -> Ranked -> Ordering
compare :: Ranked -> Ranked -> Ordering
$c< :: Ranked -> Ranked -> Bool
< :: Ranked -> Ranked -> Bool
$c<= :: Ranked -> Ranked -> Bool
<= :: Ranked -> Ranked -> Bool
$c> :: Ranked -> Ranked -> Bool
> :: Ranked -> Ranked -> Bool
$c>= :: Ranked -> Ranked -> Bool
>= :: Ranked -> Ranked -> Bool
$cmax :: Ranked -> Ranked -> Ranked
max :: Ranked -> Ranked -> Ranked
$cmin :: Ranked -> Ranked -> Ranked
min :: Ranked -> Ranked -> Ranked
Ord)

data Chunk = Chunk
    { Chunk -> Int
rank  :: Int
      -- ^ Rank of this chunk
    , Chunk -> Ranked
rank2 :: Ranked
      -- ^ Rank of this chunk combined with the next chunk
    }

{-| This corresponds to the `_byte_pair_merge` function in the upstream `tiktoken`
    package:

    https://github.com/openai/tiktoken/blob/c0ba74c238d18b4824c25f3c27fc8698055b9a76/src/lib.rs#L18-L74

    The intermediate data structure is an `IntMap` instead of a `Vector` but other
    than that the algorithm is essentially identical.
-}
bytePairEncode
    :: HashMap ByteString Int -> ByteString -> Maybe [(Int, ByteString)]
bytePairEncode :: HashMap ByteString Int -> ByteString -> Maybe [(Int, ByteString)]
bytePairEncode HashMap ByteString Int
hashMap ByteString
bytes
    | Just Int
rank <- ByteString -> HashMap ByteString Int -> Maybe Int
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup ByteString
bytes HashMap ByteString Int
hashMap =
        [(Int, ByteString)] -> Maybe [(Int, ByteString)]
forall a. a -> Maybe a
Just [ (Int
rank, ByteString
bytes) ]
    | ByteString -> Bool
ByteString.null ByteString
bytes =
        [(Int, ByteString)] -> Maybe [(Int, ByteString)]
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure []
    | Bool
otherwise = do
        -- In practice this should always return a `Just` because all of
        -- OpenAI's encodings are defined all bytes, but in theory the user
        -- could create an `Encoding` that doesn't satisfy that invariant, so
        -- we still need to handle that case.
        let lookupByte :: Word8 -> Maybe Int
            lookupByte :: Word8 -> Maybe Int
lookupByte Word8
word8 = ByteString -> HashMap ByteString Int -> Maybe Int
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup (Word8 -> ByteString
ByteString.singleton Word8
word8) HashMap ByteString Int
hashMap

        let toChunk :: Word8 -> Word8 -> Maybe Chunk
toChunk Word8
w0 Word8
w1 = do
                Int
rank <- Word8 -> Maybe Int
lookupByte Word8
w0

                let rank2 :: Ranked
rank2 = ByteString -> Ranked
lookupSlice ([Word8] -> ByteString
ByteString.pack [ Word8
Item [Word8]
w0, Word8
Item [Word8]
w1 ])

                Chunk -> Maybe Chunk
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Chunk{ Int
$sel:rank:Chunk :: Int
rank :: Int
rank, Ranked
$sel:rank2:Chunk :: Ranked
rank2 :: Ranked
rank2 }

        [Chunk]
initChunks <- do
            [Maybe Chunk] -> Maybe [Chunk]
forall (t :: * -> *) (m :: * -> *) a.
(Traversable t, Monad m) =>
t (m a) -> m (t a)
forall (m :: * -> *) a. Monad m => [m a] -> m [a]
sequence ((Word8 -> Word8 -> Maybe Chunk)
-> ByteString -> ByteString -> [Maybe Chunk]
forall a. (Word8 -> Word8 -> a) -> ByteString -> ByteString -> [a]
ByteString.zipWith Word8 -> Word8 -> Maybe Chunk
toChunk ByteString
bytes (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
ByteString.tail ByteString
bytes))

        Chunk
lastChunk <- do
            Int
rank <- Word8 -> Maybe Int
lookupByte (HasCallStack => ByteString -> Word8
ByteString -> Word8
ByteString.last ByteString
bytes)

            Chunk -> Maybe Chunk
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Chunk{ Int
$sel:rank:Chunk :: Int
rank :: Int
rank, $sel:rank2:Chunk :: Ranked
rank2 = Ranked
Unranked }

        {- Unlike the upstream `tiktoken` we do not use `Vector (Key, Ranked)`
           as our intermediate datastructure, but rather something more like
           `IntMap Ranked` (technically `IntMap Chunk`, which is just a tiny
           optimization).

           This makes it cheaper to delete keys without having to rebuild a
           `Vector` each time, but at the expense of neighbor lookups (e.g.
           `lookupLT` / `lookupGT`) being more expensive.
        -}
        let initialMap :: IntMap Chunk
            initialMap :: IntMap Chunk
initialMap =
                [(Int, Chunk)] -> IntMap Chunk
forall a. [(Int, a)] -> IntMap a
IntMap.fromList ([Int] -> [Chunk] -> [(Int, Chunk)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Int
Item [Int]
0 ..] ([Chunk]
initChunks [Chunk] -> [Chunk] -> [Chunk]
forall a. Semigroup a => a -> a -> a
<> [ Item [Chunk]
Chunk
lastChunk ]))

        let keyValues :: [(Int, Chunk)]
keyValues = IntMap Chunk -> [(Int, Chunk)]
forall a. IntMap a -> [(Int, a)]
IntMap.toAscList (IntMap Chunk -> IntMap Chunk
loop IntMap Chunk
initialMap)

        [(Int, ByteString)] -> Maybe [(Int, ByteString)]
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure do
            let adapt :: (Int, Chunk) -> Int -> (Int, ByteString)
adapt (Int
index, Chunk{ Int
$sel:rank:Chunk :: Chunk -> Int
rank :: Int
rank }) Int
nextIndex =
                    (Int
rank, Int -> Int -> ByteString
slice Int
index Int
nextIndex)

            ((Int, Chunk) -> Int -> (Int, ByteString))
-> [(Int, Chunk)] -> [Int] -> [(Int, ByteString)]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith (Int, Chunk) -> Int -> (Int, ByteString)
adapt [(Int, Chunk)]
keyValues ([Int] -> [Int]
forall a. [a] -> [a]
drop1 (((Int, Chunk) -> Int) -> [(Int, Chunk)] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map (Int, Chunk) -> Int
forall a b. (a, b) -> a
fst [(Int, Chunk)]
keyValues) [Int] -> [Int] -> [Int]
forall a. Semigroup a => a -> a -> a
<> [ Int
Item [Int]
size ])
  where
    size :: Int
    size :: Int
size = ByteString -> Int
ByteString.length ByteString
bytes

    lookupSlice :: ByteString -> Ranked
    lookupSlice :: ByteString -> Ranked
lookupSlice ByteString
b = case ByteString -> HashMap ByteString Int -> Maybe Int
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup ByteString
b HashMap ByteString Int
hashMap of
        Maybe Int
Nothing  -> Ranked
Unranked
        Just Int
int -> Int -> Ranked
Ranked Int
int

    slice :: Int -> Int -> ByteString
    slice :: Int -> Int -> ByteString
slice Int
begin Int
end = Int -> ByteString -> ByteString
ByteString.take (Int
end Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
begin) (Int -> ByteString -> ByteString
ByteString.drop Int
begin ByteString
bytes)

    loop :: IntMap Chunk -> IntMap Chunk
    loop :: IntMap Chunk -> IntMap Chunk
loop IntMap Chunk
chunks0 = case (Chunk -> Chunk -> Ordering) -> IntMap Chunk -> Maybe (Int, Chunk)
forall a. (a -> a -> Ordering) -> IntMap a -> Maybe (Int, a)
minimumBy ((Chunk -> Ranked) -> Chunk -> Chunk -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
Ord.comparing Chunk -> Ranked
rank2) IntMap Chunk
chunks0 of
        Just (Int
index, Chunk{ $sel:rank2:Chunk :: Chunk -> Ranked
rank2 = Ranked Int
ranked }) -> IntMap Chunk -> IntMap Chunk
loop IntMap Chunk
chunks3
          where
            chunks1 :: IntMap Chunk
chunks1 = Int -> Int -> IntMap Chunk -> IntMap Chunk
rerank Int
index Int
ranked IntMap Chunk
chunks0

            chunks2 :: IntMap Chunk
chunks2 = case Int -> IntMap Chunk -> Maybe (Int, Chunk)
forall a. Int -> IntMap a -> Maybe (Int, a)
IntMap.lookupLT Int
index IntMap Chunk
chunks1 of
                Just (Int
prevIndex, Chunk{ $sel:rank:Chunk :: Chunk -> Int
rank = Int
prevRanked }) ->
                    Int -> Int -> IntMap Chunk -> IntMap Chunk
rerank Int
prevIndex Int
prevRanked IntMap Chunk
chunks1
                Maybe (Int, Chunk)
_ ->
                    IntMap Chunk
chunks1

            chunks3 :: IntMap Chunk
chunks3 = case Int -> IntMap Chunk -> Maybe (Int, Chunk)
forall a. Int -> IntMap a -> Maybe (Int, a)
IntMap.lookupGT Int
index IntMap Chunk
chunks2 of
                -- In theory we should never hit the `Nothing` case here because
                -- the `rank2` field can only be `Ranked` if there is a `Chunk`
                -- following this one.
                Maybe (Int, Chunk)
Nothing ->
                    String -> IntMap Chunk
forall a. HasCallStack => String -> a
error String
"Tiktoken.bytePairEncode: Internal error - a ranked byte pair is missing the second byte in the pair"
                Just (Int
nextIndex, Chunk
_) -> Int -> IntMap Chunk -> IntMap Chunk
forall a. Int -> IntMap a -> IntMap a
IntMap.delete Int
nextIndex IntMap Chunk
chunks2

        Maybe (Int, Chunk)
_ ->
            IntMap Chunk
chunks0

    rerank :: Key -> Int -> IntMap Chunk -> IntMap Chunk
    rerank :: Int -> Int -> IntMap Chunk -> IntMap Chunk
rerank Int
index0 Int
rank IntMap Chunk
chunks = Int -> Chunk -> IntMap Chunk -> IntMap Chunk
forall a. Int -> a -> IntMap a -> IntMap a
IntMap.insert Int
index0 Chunk
newChunk IntMap Chunk
chunks
      where
        maybeIndex3 :: Maybe Int
maybeIndex3 = do
            (Int
index1, Chunk
_) <- Int -> IntMap Chunk -> Maybe (Int, Chunk)
forall a. Int -> IntMap a -> Maybe (Int, a)
IntMap.lookupGT Int
index0 IntMap Chunk
chunks

            (Int
index2, Chunk
_) <- Int -> IntMap Chunk -> Maybe (Int, Chunk)
forall a. Int -> IntMap a -> Maybe (Int, a)
IntMap.lookupGT Int
index1 IntMap Chunk
chunks

            Int -> Maybe Int
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure case Int -> IntMap Chunk -> Maybe (Int, Chunk)
forall a. Int -> IntMap a -> Maybe (Int, a)
IntMap.lookupGT Int
index2 IntMap Chunk
chunks of
                Just (Int
index3, Chunk
_) -> Int
index3
                Maybe (Int, Chunk)
Nothing          -> Int
size

        rank2 :: Ranked
rank2 = case Maybe Int
maybeIndex3 of
            Maybe Int
Nothing     -> Ranked
Unranked
            Just Int
index3 -> ByteString -> Ranked
lookupSlice (Int -> Int -> ByteString
slice Int
index0 Int
index3)

        newChunk :: Chunk
newChunk = Chunk{ Int
$sel:rank:Chunk :: Int
rank :: Int
rank, Ranked
$sel:rank2:Chunk :: Ranked
rank2 :: Ranked
rank2 }

{-| Split a `ByteString` into smaller `ByteString`s, each of which are
    successive longest possible matches to the provided regular expression
-}
splitUsingRegex
    :: ByteString
    -- ^ Regex to match
    -> ByteString
    -- ^ Bytes to split into chunks
    -> Maybe [ByteString]
splitUsingRegex :: ByteString -> ByteString -> Maybe [ByteString]
splitUsingRegex ByteString
pattern = ([ByteString] -> [ByteString]) -> ByteString -> Maybe [ByteString]
forall {c}. ([ByteString] -> c) -> ByteString -> Maybe c
loop [ByteString] -> [ByteString]
forall a. a -> a
Prelude.id
  where
    loop :: ([ByteString] -> c) -> ByteString -> Maybe c
loop [ByteString] -> c
diff ByteString
bytes
        | ByteString -> Bool
ByteString.null ByteString
bytes =
            c -> Maybe c
forall a. a -> Maybe a
Just ([ByteString] -> c
diff [])
        | Bool
otherwise =
            case Regex -> ByteString -> [PCREExecOption] -> Maybe [ByteString]
Regex.match Regex
regex ByteString
bytes [ Item [PCREExecOption]
PCREExecOption
Regex.exec_no_utf8_check ] of
                Just (ByteString
prefix : [ByteString]
_) ->
                    let suffix :: ByteString
suffix = Int -> ByteString -> ByteString
ByteString.drop (ByteString -> Int
ByteString.length ByteString
prefix) ByteString
bytes
                    in  ([ByteString] -> c) -> ByteString -> Maybe c
loop ([ByteString] -> c
diff ([ByteString] -> c)
-> ([ByteString] -> [ByteString]) -> [ByteString] -> c
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ByteString
prefix ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:)) ByteString
suffix
                Maybe [ByteString]
_ -> Maybe c
forall a. Maybe a
Nothing

    regex :: Regex
regex = ByteString -> [PCREOption] -> Regex
Regex.compile ByteString
pattern [ Item [PCREOption]
PCREOption
Regex.utf8 ]

{-| Divide up the input into coarse-grained chunks based on the provided splitting
    regular expression before doing the final byte pair encoding
-}
bytePairEncodeWithSplitting :: Encoding -> ByteString -> Maybe [(Int, ByteString)]
bytePairEncodeWithSplitting :: Encoding -> ByteString -> Maybe [(Int, ByteString)]
bytePairEncodeWithSplitting Encoding{ByteString
Map ByteString Int
HashMap ByteString Int
Vector ByteString
$sel:encode:Encoding :: Encoding -> HashMap ByteString Int
$sel:decode:Encoding :: Encoding -> Vector ByteString
$sel:specialTokens:Encoding :: Encoding -> Map ByteString Int
$sel:regex:Encoding :: Encoding -> ByteString
encode :: HashMap ByteString Int
decode :: Vector ByteString
specialTokens :: Map ByteString Int
regex :: ByteString
..} ByteString
bytes = do
    [ByteString]
chunks <- ByteString -> ByteString -> Maybe [ByteString]
splitUsingRegex ByteString
regex ByteString
bytes

    [[(Int, ByteString)]]
tokenss <- (ByteString -> Maybe [(Int, ByteString)])
-> [ByteString] -> Maybe [[(Int, ByteString)]]
forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> [a] -> f [b]
traverse (HashMap ByteString Int -> ByteString -> Maybe [(Int, ByteString)]
bytePairEncode HashMap ByteString Int
encode) [ByteString]
chunks

    [(Int, ByteString)] -> Maybe [(Int, ByteString)]
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([[(Int, ByteString)]] -> [(Int, ByteString)]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [[(Int, ByteString)]]
tokenss)

{-| Split a `ByteString` into smaller `ByteString`s separated by the given
    separator
-}
splitOnSeparator
    :: ByteString
    -- ^ Separator
    -> ByteString
    -- ^ `ByteString` to separate
    -> NonEmpty ByteString
splitOnSeparator :: ByteString -> ByteString -> NonEmpty ByteString
splitOnSeparator ByteString
separator ByteString
initialBytes = ByteString
initialPrefix ByteString -> [ByteString] -> NonEmpty ByteString
forall a. a -> [a] -> NonEmpty a
:| ByteString -> [ByteString]
loop ByteString
initialSuffix
  where
    split :: ByteString -> (ByteString, ByteString)
split = ByteString -> ByteString -> (ByteString, ByteString)
ByteString.breakSubstring ByteString
separator

    (ByteString
initialPrefix, ByteString
initialSuffix) = ByteString -> (ByteString, ByteString)
split ByteString
initialBytes

    loop :: ByteString -> [ByteString]
loop ByteString
bytes
        | ByteString -> Bool
ByteString.null ByteString
bytes = []
        | Bool
otherwise             = ByteString
prefix ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
: ByteString -> [ByteString]
loop ByteString
suffix
      where
        rest :: ByteString
rest = Int -> ByteString -> ByteString
ByteString.drop (ByteString -> Int
ByteString.length ByteString
separator) ByteString
bytes

        (ByteString
prefix, ByteString
suffix) = ByteString -> (ByteString, ByteString)
split ByteString
rest

-- | Tokenizer that is special-token-aware
toTokensAndRanks :: Encoding -> ByteString -> Maybe [(Int, ByteString)]
toTokensAndRanks :: Encoding -> ByteString -> Maybe [(Int, ByteString)]
toTokensAndRanks encoding :: Encoding
encoding@Encoding{ByteString
Map ByteString Int
HashMap ByteString Int
Vector ByteString
$sel:encode:Encoding :: Encoding -> HashMap ByteString Int
$sel:decode:Encoding :: Encoding -> Vector ByteString
$sel:specialTokens:Encoding :: Encoding -> Map ByteString Int
$sel:regex:Encoding :: Encoding -> ByteString
encode :: HashMap ByteString Int
decode :: Vector ByteString
specialTokens :: Map ByteString Int
regex :: ByteString
..} ByteString
initialBytes =
    ((ByteString, Int)
 -> (ByteString -> Maybe [(Int, ByteString)])
 -> ByteString
 -> Maybe [(Int, ByteString)])
-> (ByteString -> Maybe [(Int, ByteString)])
-> [(ByteString, Int)]
-> ByteString
-> Maybe [(Int, ByteString)]
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (ByteString, Int)
-> (ByteString -> Maybe [(Int, ByteString)])
-> ByteString
-> Maybe [(Int, ByteString)]
forall {f :: * -> *} {b}.
Applicative f =>
(ByteString, b)
-> (ByteString -> f [(b, ByteString)])
-> ByteString
-> f [(b, ByteString)]
cons ByteString -> Maybe [(Int, ByteString)]
nil (Map ByteString Int -> [(ByteString, Int)]
forall k a. Map k a -> [(k, a)]
Map.toList Map ByteString Int
specialTokens) ByteString
initialBytes
  where
    cons :: (ByteString, b)
-> (ByteString -> f [(b, ByteString)])
-> ByteString
-> f [(b, ByteString)]
cons (ByteString
token, b
rank) ByteString -> f [(b, ByteString)]
tokenizer ByteString
bytes = do
        (NonEmpty [(b, ByteString)] -> [(b, ByteString)])
-> f (NonEmpty [(b, ByteString)]) -> f [(b, ByteString)]
forall a b. (a -> b) -> f a -> f b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap NonEmpty [(b, ByteString)] -> [(b, ByteString)]
joinSegments ((ByteString -> f [(b, ByteString)])
-> NonEmpty ByteString -> f (NonEmpty [(b, ByteString)])
forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> NonEmpty a -> f (NonEmpty b)
traverse ByteString -> f [(b, ByteString)]
tokenizer (ByteString -> ByteString -> NonEmpty ByteString
splitOnSeparator ByteString
token ByteString
bytes))
      where
        joinSegments :: NonEmpty [(b, ByteString)] -> [(b, ByteString)]
joinSegments =
              [[(b, ByteString)]] -> [(b, ByteString)]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat
            ([[(b, ByteString)]] -> [(b, ByteString)])
-> (NonEmpty [(b, ByteString)] -> [[(b, ByteString)]])
-> NonEmpty [(b, ByteString)]
-> [(b, ByteString)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. NonEmpty [(b, ByteString)] -> [[(b, ByteString)]]
forall a. NonEmpty a -> [a]
NonEmpty.toList
            (NonEmpty [(b, ByteString)] -> [[(b, ByteString)]])
-> (NonEmpty [(b, ByteString)] -> NonEmpty [(b, ByteString)])
-> NonEmpty [(b, ByteString)]
-> [[(b, ByteString)]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [(b, ByteString)]
-> NonEmpty [(b, ByteString)] -> NonEmpty [(b, ByteString)]
forall a. a -> NonEmpty a -> NonEmpty a
NonEmpty.intersperse [ (b
rank, ByteString
token) ]

    nil :: ByteString -> Maybe [(Int, ByteString)]
nil ByteString
bytes = Encoding -> ByteString -> Maybe [(Int, ByteString)]
bytePairEncodeWithSplitting Encoding
encoding ByteString
bytes

{-| Use an `Encoding` to tokenize a `ByteString` into smaller `ByteString`s

    This only fails if you provide an `Encoding` that cannot rank all possible
    1-byte sequences
-}
toTokens :: Encoding -> ByteString -> Maybe [ByteString]
toTokens :: Encoding -> ByteString -> Maybe [ByteString]
toTokens = ((ByteString -> Maybe [(Int, ByteString)])
 -> ByteString -> Maybe [ByteString])
-> (Encoding -> ByteString -> Maybe [(Int, ByteString)])
-> Encoding
-> ByteString
-> Maybe [ByteString]
forall a b. (a -> b) -> (Encoding -> a) -> Encoding -> b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Maybe [(Int, ByteString)] -> Maybe [ByteString])
-> (ByteString -> Maybe [(Int, ByteString)])
-> ByteString
-> Maybe [ByteString]
forall a b. (a -> b) -> (ByteString -> a) -> ByteString -> b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (([(Int, ByteString)] -> [ByteString])
-> Maybe [(Int, ByteString)] -> Maybe [ByteString]
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (((Int, ByteString) -> ByteString)
-> [(Int, ByteString)] -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Int, ByteString) -> ByteString
forall a b. (a, b) -> b
snd))) Encoding -> ByteString -> Maybe [(Int, ByteString)]
toTokensAndRanks

{-| Use an `Encoding` to tokenize a `ByteString` into ranks

    This only fails if you provide an `Encoding` that cannot rank all possible
    1-byte sequences
-}
toRanks :: Encoding -> ByteString -> Maybe [Int]
toRanks :: Encoding -> ByteString -> Maybe [Int]
toRanks = ((ByteString -> Maybe [(Int, ByteString)])
 -> ByteString -> Maybe [Int])
-> (Encoding -> ByteString -> Maybe [(Int, ByteString)])
-> Encoding
-> ByteString
-> Maybe [Int]
forall a b. (a -> b) -> (Encoding -> a) -> Encoding -> b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Maybe [(Int, ByteString)] -> Maybe [Int])
-> (ByteString -> Maybe [(Int, ByteString)])
-> ByteString
-> Maybe [Int]
forall a b. (a -> b) -> (ByteString -> a) -> ByteString -> b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (([(Int, ByteString)] -> [Int])
-> Maybe [(Int, ByteString)] -> Maybe [Int]
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (((Int, ByteString) -> Int) -> [(Int, ByteString)] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Int, ByteString) -> Int
forall a b. (a, b) -> a
fst))) Encoding -> ByteString -> Maybe [(Int, ByteString)]
toTokensAndRanks

{-| Combine a sequence of `ByteString` tokens back into a `ByteString`

    This is just a synonym for @"Data.ByteString".`ByteString.concat`@ (no
    `Encoding` necessary), provided solely for consistency/convenience.
-}
fromTokens :: [ByteString] -> ByteString
fromTokens :: [ByteString] -> ByteString
fromTokens = [ByteString] -> ByteString
ByteString.concat

{-| Convert a sequence of ranks back into a `ByteString`

    This will fail if you supply any ranks which are not recognized by the
    `Encoding`.
-}
fromRanks :: Encoding -> [Int] -> Maybe ByteString
fromRanks :: Encoding -> [Int] -> Maybe ByteString
fromRanks Encoding{ByteString
Map ByteString Int
HashMap ByteString Int
Vector ByteString
$sel:encode:Encoding :: Encoding -> HashMap ByteString Int
$sel:decode:Encoding :: Encoding -> Vector ByteString
$sel:specialTokens:Encoding :: Encoding -> Map ByteString Int
$sel:regex:Encoding :: Encoding -> ByteString
encode :: HashMap ByteString Int
decode :: Vector ByteString
specialTokens :: Map ByteString Int
regex :: ByteString
..} [Int]
vector = ([ByteString] -> ByteString)
-> Maybe [ByteString] -> Maybe ByteString
forall a b. (a -> b) -> Maybe a -> Maybe b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap [ByteString] -> ByteString
fromTokens ((Int -> Maybe ByteString) -> [Int] -> Maybe [ByteString]
forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> [a] -> f [b]
traverse (Vector ByteString
decode Vector ByteString -> Int -> Maybe ByteString
forall a. Vector a -> Int -> Maybe a
!?) [Int]
vector)