{-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash,
UnliftedFFITypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ViewPatterns #-}
module Data.Text.Internal.Encoding
( validateUtf8Chunk
, validateUtf8More
, decodeUtf8Chunk
, decodeUtf8More
, decodeUtf8With1
, decodeUtf8With2
, Utf8State
, startUtf8State
, StrictBuilder()
, strictBuilderToText
, textToStrictBuilder
, skipIncomplete
, getCompleteLen
, getPartialUtf8
) where
#if defined(ASSERTS)
import Control.Exception (assert)
#endif
import Data.Bits ((.&.), shiftL, shiftR)
import Data.ByteString (ByteString)
#if !MIN_VERSION_base(4,11,0)
import Data.Semigroup (Semigroup(..))
#endif
import Data.Word (Word32, Word8)
import Foreign.Storable (pokeElemOff)
import Data.Text.Encoding.Error (OnDecodeError)
import Data.Text.Internal (Text(..))
import Data.Text.Internal.Encoding.Utf8
(DecoderState, utf8AcceptState, utf8RejectState, updateDecoderState)
import Data.Text.Internal.StrictBuilder (StrictBuilder)
import qualified Data.ByteString as B
import qualified Data.ByteString.Internal as BI
import qualified Data.ByteString.Short.Internal as SBS
import qualified Data.Text.Array as A
import qualified Data.Text.Internal.StrictBuilder as SB
#if defined(ASSERTS)
import GHC.Stack (HasCallStack)
#endif
#ifdef SIMDUTF
import Data.Text.Internal.ByteStringCompat (withBS)
import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Foreign.C.Types (CSize(..))
import Foreign.C.Types (CInt(..))
import Foreign.Ptr (Ptr)
#endif
strictBuilderToText :: StrictBuilder -> Text
strictBuilderToText :: StrictBuilder -> Text
strictBuilderToText = StrictBuilder -> Text
SB.toText
textToStrictBuilder :: Text -> StrictBuilder
textToStrictBuilder :: Text -> StrictBuilder
textToStrictBuilder = Text -> StrictBuilder
SB.fromText
data Utf8State = Utf8State
{
Utf8State -> DecoderState
utf8CodePointState :: {-# UNPACK #-} !DecoderState
, Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint :: {-# UNPACK #-} !PartialUtf8CodePoint
}
deriving (Utf8State -> Utf8State -> Bool
(Utf8State -> Utf8State -> Bool)
-> (Utf8State -> Utf8State -> Bool) -> Eq Utf8State
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Utf8State -> Utf8State -> Bool
== :: Utf8State -> Utf8State -> Bool
$c/= :: Utf8State -> Utf8State -> Bool
/= :: Utf8State -> Utf8State -> Bool
Eq, Int -> Utf8State -> ShowS
[Utf8State] -> ShowS
Utf8State -> String
(Int -> Utf8State -> ShowS)
-> (Utf8State -> String)
-> ([Utf8State] -> ShowS)
-> Show Utf8State
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Utf8State -> ShowS
showsPrec :: Int -> Utf8State -> ShowS
$cshow :: Utf8State -> String
show :: Utf8State -> String
$cshowList :: [Utf8State] -> ShowS
showList :: [Utf8State] -> ShowS
Show)
startUtf8State :: Utf8State
startUtf8State :: Utf8State
startUtf8State = DecoderState -> PartialUtf8CodePoint -> Utf8State
Utf8State DecoderState
utf8AcceptState PartialUtf8CodePoint
partUtf8Empty
newtype PartialUtf8CodePoint = PartialUtf8CodePoint Word32
deriving (PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool
(PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool)
-> (PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool)
-> Eq PartialUtf8CodePoint
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool
== :: PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool
$c/= :: PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool
/= :: PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool
Eq, Int -> PartialUtf8CodePoint -> ShowS
[PartialUtf8CodePoint] -> ShowS
PartialUtf8CodePoint -> String
(Int -> PartialUtf8CodePoint -> ShowS)
-> (PartialUtf8CodePoint -> String)
-> ([PartialUtf8CodePoint] -> ShowS)
-> Show PartialUtf8CodePoint
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> PartialUtf8CodePoint -> ShowS
showsPrec :: Int -> PartialUtf8CodePoint -> ShowS
$cshow :: PartialUtf8CodePoint -> String
show :: PartialUtf8CodePoint -> String
$cshowList :: [PartialUtf8CodePoint] -> ShowS
showList :: [PartialUtf8CodePoint] -> ShowS
Show)
partUtf8Empty :: PartialUtf8CodePoint
partUtf8Empty :: PartialUtf8CodePoint
partUtf8Empty = Word32 -> PartialUtf8CodePoint
PartialUtf8CodePoint Word32
0
partUtf8Len :: PartialUtf8CodePoint -> Int
partUtf8Len :: PartialUtf8CodePoint -> Int
partUtf8Len (PartialUtf8CodePoint Word32
w) = Word32 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word32 -> Int) -> Word32 -> Int
forall a b. (a -> b) -> a -> b
$ Word32
w Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
`shiftR` Int
24
partUtf8CompleteLen :: PartialUtf8CodePoint -> Int
partUtf8CompleteLen :: PartialUtf8CodePoint -> Int
partUtf8CompleteLen c :: PartialUtf8CodePoint
c@(PartialUtf8CodePoint Word32
w)
| PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
c Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 = Int
0
| Word32
0xf0 Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word32
firstByte = Int
4
| Word32
0xe0 Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word32
firstByte = Int
3
| Word32
0xc2 Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word32
firstByte = Int
2
| Bool
otherwise = Int
0
where
firstByte :: Word32
firstByte = (Word32
w Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
`shiftR` Int
16) Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
255
partUtf8UnsafeIndex ::
#if defined(ASSERTS)
HasCallStack =>
#endif
PartialUtf8CodePoint -> Int -> Word8
partUtf8UnsafeIndex :: PartialUtf8CodePoint -> Int -> Word8
partUtf8UnsafeIndex _c :: PartialUtf8CodePoint
_c@(PartialUtf8CodePoint Word32
w) Int
n =
#if defined(ASSERTS)
assert (0 <= n && n < partUtf8Len _c) $
#endif
Word32 -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word32 -> Word8) -> Word32 -> Word8
forall a b. (a -> b) -> a -> b
$ Word32
w Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
`shiftR` (Int
16 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
8 Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
n)
partUtf8UnsafeAppend ::
#if defined(ASSERTS)
HasCallStack =>
#endif
PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint
partUtf8UnsafeAppend :: PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint
partUtf8UnsafeAppend c :: PartialUtf8CodePoint
c@(PartialUtf8CodePoint Word32
word) ByteString
bs =
#if defined(ASSERTS)
assert (lenc + lenbs <= 3) $
#endif
Word32 -> PartialUtf8CodePoint
PartialUtf8CodePoint (Word32 -> PartialUtf8CodePoint) -> Word32 -> PartialUtf8CodePoint
forall a b. (a -> b) -> a -> b
$
Int -> Word32 -> Word32
forall {a}. (Num a, Bits a) => Int -> a -> a
tryPush Int
0 (Word32 -> Word32) -> Word32 -> Word32
forall a b. (a -> b) -> a -> b
$ Int -> Word32 -> Word32
forall {a}. (Num a, Bits a) => Int -> a -> a
tryPush Int
1 (Word32 -> Word32) -> Word32 -> Word32
forall a b. (a -> b) -> a -> b
$ Int -> Word32 -> Word32
forall {a}. (Num a, Bits a) => Int -> a -> a
tryPush Int
2 (Word32 -> Word32) -> Word32 -> Word32
forall a b. (a -> b) -> a -> b
$ Word32
word Word32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
+ (Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
lenbs Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
`shiftL` Int
24)
where
lenc :: Int
lenc = PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
c
lenbs :: Int
lenbs = ByteString -> Int
B.length ByteString
bs
tryPush :: Int -> a -> a
tryPush Int
i a
w =
if Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lenbs
then a
w a -> a -> a
forall a. Num a => a -> a -> a
+ (Word8 -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral (HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs Int
i) a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftL` Int -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int
16 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
8 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Int
lenc Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i)))
else a
w
{-# INLINE partUtf8Foldr #-}
partUtf8Foldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
partUtf8Foldr :: forall a. (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
partUtf8Foldr Word8 -> a -> a
f a
x0 PartialUtf8CodePoint
c = case PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
c of
Int
0 -> a
x0
Int
1 -> Int -> a -> a
build Int
0 a
x0
Int
2 -> Int -> a -> a
build Int
0 (Int -> a -> a
build Int
1 a
x0)
Int
_ -> Int -> a -> a
build Int
0 (Int -> a -> a
build Int
1 (Int -> a -> a
build Int
2 a
x0))
where
build :: Int -> a -> a
build Int
i a
x = Word8 -> a -> a
f (PartialUtf8CodePoint -> Int -> Word8
partUtf8UnsafeIndex PartialUtf8CodePoint
c Int
i) a
x
partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString
partUtf8ToByteString :: PartialUtf8CodePoint -> ByteString
partUtf8ToByteString PartialUtf8CodePoint
c = Int -> (Ptr Word8 -> IO ()) -> ByteString
BI.unsafeCreate (PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
c) ((Ptr Word8 -> IO ()) -> ByteString)
-> (Ptr Word8 -> IO ()) -> ByteString
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
ptr ->
(Word8 -> (Int -> IO ()) -> Int -> IO ())
-> (Int -> IO ()) -> PartialUtf8CodePoint -> Int -> IO ()
forall a. (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
partUtf8Foldr (\Word8
w Int -> IO ()
k Int
i -> Ptr Word8 -> Int -> Word8 -> IO ()
forall a. Storable a => Ptr a -> Int -> a -> IO ()
pokeElemOff Ptr Word8
ptr Int
i Word8
w IO () -> IO () -> IO ()
forall a b. IO a -> IO b -> IO b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Int -> IO ()
k (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)) (\Int
_ -> () -> IO ()
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()) PartialUtf8CodePoint
c Int
0
getCompleteLen :: Utf8State -> Int
getCompleteLen :: Utf8State -> Int
getCompleteLen = PartialUtf8CodePoint -> Int
partUtf8CompleteLen (PartialUtf8CodePoint -> Int)
-> (Utf8State -> PartialUtf8CodePoint) -> Utf8State -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint
getPartialUtf8 :: Utf8State -> B.ByteString
getPartialUtf8 :: Utf8State -> ByteString
getPartialUtf8 = PartialUtf8CodePoint -> ByteString
partUtf8ToByteString (PartialUtf8CodePoint -> ByteString)
-> (Utf8State -> PartialUtf8CodePoint) -> Utf8State -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint
#ifdef SIMDUTF
foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
:: Ptr Word8 -> CSize -> IO CInt
#endif
validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State)
validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State)
validateUtf8Chunk ByteString
bs = Int
-> ByteString
-> (Int -> Maybe Utf8State -> (Int, Maybe Utf8State))
-> (Int, Maybe Utf8State)
forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8ChunkFrom Int
0 ByteString
bs (,)
{-# INLINE validateUtf8ChunkFrom #-}
validateUtf8ChunkFrom :: forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8ChunkFrom :: forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8ChunkFrom Int
ofs ByteString
bs Int -> Maybe Utf8State -> r
k
#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,12,1)
| Int
guessUtf8Boundary Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 Bool -> Bool -> Bool
&&
(
#ifdef SIMDUTF
ByteString -> (ForeignPtr Word8 -> Int -> Bool) -> Bool
forall r. ByteString -> (ForeignPtr Word8 -> Int -> r) -> r
withBS (Int -> ByteString -> ByteString
B.drop Int
ofs ByteString
bs) ((ForeignPtr Word8 -> Int -> Bool) -> Bool)
-> (ForeignPtr Word8 -> Int -> Bool) -> Bool
forall a b. (a -> b) -> a -> b
$ \ ForeignPtr Word8
fp Int
_ -> IO Bool -> Bool
forall a. IO a -> a
unsafeDupablePerformIO (IO Bool -> Bool) -> IO Bool -> Bool
forall a b. (a -> b) -> a -> b
$
ForeignPtr Word8 -> (Ptr Word8 -> IO Bool) -> IO Bool
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
unsafeWithForeignPtr ForeignPtr Word8
fp ((Ptr Word8 -> IO Bool) -> IO Bool)
-> (Ptr Word8 -> IO Bool) -> IO Bool
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
ptr -> (CInt -> CInt -> Bool
forall a. Eq a => a -> a -> Bool
/= CInt
0) (CInt -> Bool) -> IO CInt -> IO Bool
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
Ptr Word8 -> CSize -> IO CInt
c_is_valid_utf8 Ptr Word8
ptr (Int -> CSize
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
guessUtf8Boundary)
#else
B.isValidUtf8 $ B.take guessUtf8Boundary (B.drop ofs bs)
#endif
) = Int -> r
slowValidateUtf8ChunkFrom (Int
ofs Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
guessUtf8Boundary)
| Bool
otherwise = Int -> r
slowValidateUtf8ChunkFrom Int
ofs
where
len :: Int
len = ByteString -> Int
B.length ByteString
bs Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
ofs
isBoundary :: Int -> (Word8 -> Bool) -> Bool
isBoundary Int
n Word8 -> Bool
p = Int
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
n Bool -> Bool -> Bool
&& Word8 -> Bool
p (HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs (Int
ofs Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
n))
guessUtf8Boundary :: Int
guessUtf8Boundary
| Int -> (Word8 -> Bool) -> Bool
isBoundary Int
1 (Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0x80) = Int
len
| Int -> (Word8 -> Bool) -> Bool
isBoundary Int
1 (Word8
0xc2 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<=) = Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1
| Int -> (Word8 -> Bool) -> Bool
isBoundary Int
2 (Word8
0xe0 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<=) = Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
2
| Int -> (Word8 -> Bool) -> Bool
isBoundary Int
3 (Word8
0xf0 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<=) = Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
3
| Bool
otherwise = Int
len
#else
= slowValidateUtf8ChunkFrom ofs
where
#endif
slowValidateUtf8ChunkFrom :: Int -> r
slowValidateUtf8ChunkFrom :: Int -> r
slowValidateUtf8ChunkFrom Int
ofs1 = Int -> Int -> DecoderState -> r
slowLoop Int
ofs1 Int
ofs1 DecoderState
utf8AcceptState
slowLoop :: Int -> Int -> DecoderState -> r
slowLoop !Int
utf8End Int
i DecoderState
s
| Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< ByteString -> Int
B.length ByteString
bs =
case Word8 -> DecoderState -> DecoderState
updateDecoderState (HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs Int
i) DecoderState
s of
DecoderState
s' | DecoderState
s' DecoderState -> DecoderState -> Bool
forall a. Eq a => a -> a -> Bool
== DecoderState
utf8RejectState -> Int -> Maybe Utf8State -> r
k Int
utf8End Maybe Utf8State
forall a. Maybe a
Nothing
| DecoderState
s' DecoderState -> DecoderState -> Bool
forall a. Eq a => a -> a -> Bool
== DecoderState
utf8AcceptState -> Int -> Int -> DecoderState -> r
slowLoop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) DecoderState
s'
| Bool
otherwise -> Int -> Int -> DecoderState -> r
slowLoop Int
utf8End (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) DecoderState
s'
| Bool
otherwise = Int -> Maybe Utf8State -> r
k Int
utf8End (Utf8State -> Maybe Utf8State
forall a. a -> Maybe a
Just (DecoderState -> PartialUtf8CodePoint -> Utf8State
Utf8State DecoderState
s (PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint
partUtf8UnsafeAppend PartialUtf8CodePoint
partUtf8Empty (Int -> ByteString -> ByteString
B.drop Int
utf8End ByteString
bs))))
validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State)
validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State)
validateUtf8More Utf8State
st ByteString
bs = Utf8State
-> ByteString
-> (Int -> Maybe Utf8State -> (Int, Maybe Utf8State))
-> (Int, Maybe Utf8State)
forall r.
Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8MoreCont Utf8State
st ByteString
bs (,)
{-# INLINE validateUtf8MoreCont #-}
validateUtf8MoreCont :: Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8MoreCont :: forall r.
Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8MoreCont st :: Utf8State
st@(Utf8State DecoderState
s0 PartialUtf8CodePoint
part) ByteString
bs Int -> Maybe Utf8State -> r
k
| Int
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 = Int -> DecoderState -> r
loop Int
0 DecoderState
s0
| Bool
otherwise = Int -> Maybe Utf8State -> r
k (- PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
part) (Utf8State -> Maybe Utf8State
forall a. a -> Maybe a
Just Utf8State
st)
where
len :: Int
len = ByteString -> Int
B.length ByteString
bs
loop :: Int -> DecoderState -> r
loop !Int
i DecoderState
s
| DecoderState
s DecoderState -> DecoderState -> Bool
forall a. Eq a => a -> a -> Bool
== DecoderState
utf8AcceptState = Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8ChunkFrom Int
i ByteString
bs Int -> Maybe Utf8State -> r
k
| Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
len =
case Word8 -> DecoderState -> DecoderState
updateDecoderState (HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs Int
i) DecoderState
s of
DecoderState
s' | DecoderState
s' DecoderState -> DecoderState -> Bool
forall a. Eq a => a -> a -> Bool
== DecoderState
utf8RejectState -> Int -> Maybe Utf8State -> r
k (- PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
part) Maybe Utf8State
forall a. Maybe a
Nothing
| Bool
otherwise -> Int -> DecoderState -> r
loop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) DecoderState
s'
| Bool
otherwise = Int -> Maybe Utf8State -> r
k (- PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
part) (Utf8State -> Maybe Utf8State
forall a. a -> Maybe a
Just (DecoderState -> PartialUtf8CodePoint -> Utf8State
Utf8State DecoderState
s (PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint
partUtf8UnsafeAppend PartialUtf8CodePoint
part ByteString
bs)))
partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder
partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder
partUtf8ToStrictBuilder PartialUtf8CodePoint
c =
(Word8 -> StrictBuilder -> StrictBuilder)
-> StrictBuilder -> PartialUtf8CodePoint -> StrictBuilder
forall a. (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
partUtf8Foldr (StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
(<>) (StrictBuilder -> StrictBuilder -> StrictBuilder)
-> (Word8 -> StrictBuilder)
-> Word8
-> StrictBuilder
-> StrictBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> StrictBuilder
SB.unsafeFromWord8) StrictBuilder
forall a. Monoid a => a
mempty PartialUtf8CodePoint
c
utf8StateToStrictBuilder :: Utf8State -> StrictBuilder
utf8StateToStrictBuilder :: Utf8State -> StrictBuilder
utf8StateToStrictBuilder = PartialUtf8CodePoint -> StrictBuilder
partUtf8ToStrictBuilder (PartialUtf8CodePoint -> StrictBuilder)
-> (Utf8State -> PartialUtf8CodePoint)
-> Utf8State
-> StrictBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint
decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
decodeUtf8More :: Utf8State
-> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
decodeUtf8More Utf8State
s ByteString
bs =
Utf8State
-> ByteString
-> (Int
-> Maybe Utf8State -> (StrictBuilder, ByteString, Maybe Utf8State))
-> (StrictBuilder, ByteString, Maybe Utf8State)
forall r.
Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8MoreCont Utf8State
s ByteString
bs ((Int
-> Maybe Utf8State -> (StrictBuilder, ByteString, Maybe Utf8State))
-> (StrictBuilder, ByteString, Maybe Utf8State))
-> (Int
-> Maybe Utf8State -> (StrictBuilder, ByteString, Maybe Utf8State))
-> (StrictBuilder, ByteString, Maybe Utf8State)
forall a b. (a -> b) -> a -> b
$ \Int
len Maybe Utf8State
ms ->
let builder :: StrictBuilder
builder | Int
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0 = StrictBuilder
forall a. Monoid a => a
mempty
| Bool
otherwise = Utf8State -> StrictBuilder
utf8StateToStrictBuilder Utf8State
s
StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<> ByteString -> StrictBuilder
SB.unsafeFromByteString (Int -> ByteString -> ByteString
B.take Int
len ByteString
bs)
in (StrictBuilder
builder, Int -> ByteString -> ByteString
B.drop Int
len ByteString
bs, Maybe Utf8State
ms)
decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
decodeUtf8Chunk = Utf8State
-> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
decodeUtf8More Utf8State
startUtf8State
{-# INLINE skipIncomplete #-}
skipIncomplete :: OnDecodeError -> String -> Utf8State -> StrictBuilder
skipIncomplete :: OnDecodeError -> String -> Utf8State -> StrictBuilder
skipIncomplete OnDecodeError
onErr String
msg Utf8State
s =
(Word8 -> StrictBuilder -> StrictBuilder)
-> StrictBuilder -> PartialUtf8CodePoint -> StrictBuilder
forall a. (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
partUtf8Foldr
(StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
(<>) (StrictBuilder -> StrictBuilder -> StrictBuilder)
-> (Word8 -> StrictBuilder)
-> Word8
-> StrictBuilder
-> StrictBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. OnDecodeError -> String -> Word8 -> StrictBuilder
handleUtf8Error OnDecodeError
onErr String
msg)
StrictBuilder
forall a. Monoid a => a
mempty (Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint Utf8State
s)
{-# INLINE handleUtf8Error #-}
handleUtf8Error :: OnDecodeError -> String -> Word8 -> StrictBuilder
handleUtf8Error :: OnDecodeError -> String -> Word8 -> StrictBuilder
handleUtf8Error OnDecodeError
onErr String
msg Word8
w = case OnDecodeError
onErr String
msg (Word8 -> Maybe Word8
forall a. a -> Maybe a
Just Word8
w) of
Just Char
c -> Char -> StrictBuilder
SB.fromChar Char
c
Maybe Char
Nothing -> StrictBuilder
forall a. Monoid a => a
mempty
decodeUtf8With1 ::
#if defined(ASSERTS)
HasCallStack =>
#endif
OnDecodeError -> String -> ByteString -> Text
decodeUtf8With1 :: OnDecodeError -> String -> ByteString -> Text
decodeUtf8With1 OnDecodeError
onErr String
msg ByteString
bs = Int -> ByteString -> (Int -> Maybe Utf8State -> Text) -> Text
forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8ChunkFrom Int
0 ByteString
bs ((Int -> Maybe Utf8State -> Text) -> Text)
-> (Int -> Maybe Utf8State -> Text) -> Text
forall a b. (a -> b) -> a -> b
$ \Int
len Maybe Utf8State
ms -> case Maybe Utf8State
ms of
Just Utf8State
s
| Int
len Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString -> Int
B.length ByteString
bs ->
let !(SBS.SBS ByteArray#
arr) = ByteString -> ShortByteString
SBS.toShort ByteString
bs in
Array -> Int -> Int -> Text
Text (ByteArray# -> Array
A.ByteArray ByteArray#
arr) Int
0 Int
len
| Bool
otherwise -> StrictBuilder -> Text
SB.toText (StrictBuilder -> Text) -> StrictBuilder -> Text
forall a b. (a -> b) -> a -> b
$
ByteString -> StrictBuilder
SB.unsafeFromByteString (Int -> ByteString -> ByteString
B.take Int
len ByteString
bs) StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<> OnDecodeError -> String -> Utf8State -> StrictBuilder
skipIncomplete OnDecodeError
onErr String
msg Utf8State
s
Maybe Utf8State
Nothing ->
let (StrictBuilder
builder, ByteString
_, Utf8State
s) = OnDecodeError
-> String
-> Utf8State
-> ByteString
-> (StrictBuilder, ByteString, Utf8State)
decodeUtf8With2 OnDecodeError
onErr String
msg Utf8State
startUtf8State (Int -> ByteString -> ByteString
B.drop (Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) ByteString
bs) in
StrictBuilder -> Text
SB.toText (StrictBuilder -> Text) -> StrictBuilder -> Text
forall a b. (a -> b) -> a -> b
$
ByteString -> StrictBuilder
SB.unsafeFromByteString (Int -> ByteString -> ByteString
B.take Int
len ByteString
bs) StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<>
OnDecodeError -> String -> Word8 -> StrictBuilder
handleUtf8Error OnDecodeError
onErr String
msg (HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs Int
len) StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<>
StrictBuilder
builder StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<>
OnDecodeError -> String -> Utf8State -> StrictBuilder
skipIncomplete OnDecodeError
onErr String
msg Utf8State
s
decodeUtf8With2 ::
#if defined(ASSERTS)
HasCallStack =>
#endif
OnDecodeError -> String -> Utf8State -> ByteString -> (StrictBuilder, ByteString, Utf8State)
decodeUtf8With2 :: OnDecodeError
-> String
-> Utf8State
-> ByteString
-> (StrictBuilder, ByteString, Utf8State)
decodeUtf8With2 OnDecodeError
onErr String
msg Utf8State
s0 ByteString
bs = Utf8State
-> Int -> StrictBuilder -> (StrictBuilder, ByteString, Utf8State)
loop Utf8State
s0 Int
0 StrictBuilder
forall a. Monoid a => a
mempty
where
loop :: Utf8State
-> Int -> StrictBuilder -> (StrictBuilder, ByteString, Utf8State)
loop Utf8State
s Int
i !StrictBuilder
builder =
let nonEmptyPrefix :: Int -> StrictBuilder
nonEmptyPrefix Int
len = StrictBuilder
builder
StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<> Utf8State -> StrictBuilder
utf8StateToStrictBuilder Utf8State
s
StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<> ByteString -> StrictBuilder
SB.unsafeFromByteString (Int -> ByteString -> ByteString
B.take Int
len (Int -> ByteString -> ByteString
B.drop Int
i ByteString
bs))
in Utf8State
-> ByteString
-> (Int
-> Maybe Utf8State -> (StrictBuilder, ByteString, Utf8State))
-> (StrictBuilder, ByteString, Utf8State)
forall r.
Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8MoreCont Utf8State
s (Int -> ByteString -> ByteString
B.drop Int
i ByteString
bs) ((Int -> Maybe Utf8State -> (StrictBuilder, ByteString, Utf8State))
-> (StrictBuilder, ByteString, Utf8State))
-> (Int
-> Maybe Utf8State -> (StrictBuilder, ByteString, Utf8State))
-> (StrictBuilder, ByteString, Utf8State)
forall a b. (a -> b) -> a -> b
$ \Int
len Maybe Utf8State
ms -> case Maybe Utf8State
ms of
Maybe Utf8State
Nothing ->
if Int
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0
then
let builder' :: StrictBuilder
builder' = StrictBuilder
builder StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<> OnDecodeError -> String -> Utf8State -> StrictBuilder
skipIncomplete OnDecodeError
onErr String
msg Utf8State
s
in Utf8State
-> Int -> StrictBuilder -> (StrictBuilder, ByteString, Utf8State)
loop Utf8State
startUtf8State Int
i StrictBuilder
builder'
else
let builder' :: StrictBuilder
builder' = Int -> StrictBuilder
nonEmptyPrefix Int
len
StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<> OnDecodeError -> String -> Word8 -> StrictBuilder
handleUtf8Error OnDecodeError
onErr String
msg (HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len))
in Utf8State
-> Int -> StrictBuilder -> (StrictBuilder, ByteString, Utf8State)
loop Utf8State
startUtf8State (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) StrictBuilder
builder'
Just Utf8State
s' ->
let builder' :: StrictBuilder
builder' = if Int
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0 then StrictBuilder
builder else Int -> StrictBuilder
nonEmptyPrefix Int
len
undecoded :: ByteString
undecoded = if ByteString -> Int
B.length ByteString
bs Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= PartialUtf8CodePoint -> Int
partUtf8Len (Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint Utf8State
s')
then Int -> ByteString -> ByteString
B.drop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len) ByteString
bs
else PartialUtf8CodePoint -> ByteString
partUtf8ToByteString (Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint Utf8State
s')
in (StrictBuilder
builder', ByteString
undecoded, Utf8State
s')