{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
module System.IO.Streams.Text
(
decodeUtf8
, decodeUtf8With
, encodeUtf8
) where
import Control.Monad (when)
import Control.Monad.IO.Class (MonadIO (..))
import Data.ByteString (ByteString)
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
#if !MIN_VERSION_base(4,8,0)
import Data.Monoid (mappend)
#endif
import Data.Text (Text)
import qualified Data.Text.Encoding as T
import Data.Text.Encoding.Error (OnDecodeError)
import Data.Word (Word8)
import qualified System.IO.Streams.Combinators as Streams
import System.IO.Streams.Internal (InputStream, OutputStream)
import qualified System.IO.Streams.Internal as Streams
encodeUtf8 :: OutputStream ByteString -> IO (OutputStream Text)
encodeUtf8 = Streams.contramap T.encodeUtf8
decodeUtf8 :: InputStream ByteString -> IO (InputStream Text)
decodeUtf8 = decode T.decodeUtf8
{-# INLINE decodeUtf8 #-}
decodeUtf8With :: OnDecodeError
-> InputStream ByteString
-> IO (InputStream Text)
decodeUtf8With e = decode (T.decodeUtf8With e)
{-# INLINE decodeUtf8With #-}
decode :: (ByteString -> Text)
-> InputStream ByteString
-> IO (InputStream Text)
decode decodeFunc input = Streams.fromGenerator $ go Nothing
where
go !soFar = liftIO (Streams.read input) >>=
maybe (finish soFar) (chunk soFar)
finish Nothing = return $! ()
finish (Just x) = Streams.yield $! decodeFunc x
chunk Nothing s = process s
chunk (Just a) b = process $ a `mappend` b
process !s =
case findLastFullCode s of
LastCodeIsComplete x -> (Streams.yield $! decodeFunc x) >> go Nothing
Split a b -> do
when (not $ S.null a) $
Streams.yield $! decodeFunc a
go (Just b)
NoCodesAreComplete x -> go (Just x)
data ByteType = Regular
| Continuation
| Start !Int
between :: Word8 -> Word8 -> Word8 -> Bool
between x y z = x >= y && x <= z
{-# INLINE between #-}
characterizeByte :: Word8 -> ByteType
characterizeByte c | between c 0 0x7F = Regular
| between c 0x80 0xBF = Continuation
| between c 0xC0 0xDF = Start 1
| between c 0xE0 0xEF = Start 2
| otherwise = Start 3
data FindOutput = LastCodeIsComplete !ByteString
| Split !ByteString !ByteString
| NoCodesAreComplete !ByteString
findLastFullCode :: ByteString -> FindOutput
findLastFullCode b | len == 0 = LastCodeIsComplete b
| otherwise = go
where
len = S.length b
go = let !idx = len - 1
!c = S.unsafeIndex b idx
in case characterizeByte c of
Regular -> LastCodeIsComplete b
Continuation -> cont (len - 2)
_ -> Split (S.unsafeTake idx b) (S.unsafeDrop idx b)
cont !idx | idx < 0 = NoCodesAreComplete b
| otherwise =
let !c = S.unsafeIndex b idx
in case characterizeByte c of
Regular -> LastCodeIsComplete b
Continuation -> cont (idx - 1)
Start n -> if n + idx == len - 1
then LastCodeIsComplete b
else Split (S.unsafeTake idx b)
(S.unsafeDrop idx b)
{-# INLINE findLastFullCode #-}