module Data.Monoid.Instances.ByteString.UTF8 (
ByteStringUTF8(..), decode
)
where
import Prelude hiding (drop, dropWhile, foldl, foldl1, foldr, foldr1, scanl, scanr, scanl1, scanr1,
map, concatMap, break, span)
import Data.Bits ((.&.), (.|.), shiftL, shiftR)
import Data.Char (chr, ord)
import qualified Data.Foldable as Foldable
import qualified Data.List as List
import Data.Maybe (fromMaybe)
import Data.String (IsString(fromString))
import Data.Word (Word8)
import Data.ByteString (ByteString)
import qualified Data.ByteString as ByteString
import qualified Data.ByteString.Char8 as ByteString.Char8
import Data.ByteString.Unsafe (unsafeDrop, unsafeHead, unsafeTail, unsafeIndex)
import Data.Monoid (Monoid(mempty, mappend))
import Data.Monoid.Cancellative (LeftReductiveMonoid(..), LeftCancellativeMonoid, LeftGCDMonoid(..))
import Data.Monoid.Null (MonoidNull(null), PositiveMonoid)
import Data.Monoid.Factorial (FactorialMonoid(..))
import Data.Monoid.Textual (TextualMonoid(..))
import qualified Data.Monoid.Factorial as Factorial (FactorialMonoid(..))
import qualified Data.Monoid.Textual as Textual (TextualMonoid(..))
newtype ByteStringUTF8 = ByteStringUTF8 ByteString deriving (Eq, Ord)
decode :: ByteString -> (ByteStringUTF8, ByteString)
decode bs
| ByteString.null bs || l < 0x80 = (ByteStringUTF8 bs, mempty)
| l >= 0xC0 = (ByteStringUTF8 (ByteString.init bs), ByteString.singleton l)
| ByteString.null prefix = (mempty, bs)
| otherwise =
case toChar (ByteString.last prefix) suffix
of Nothing -> (ByteStringUTF8 (ByteString.init prefix), drop (ByteString.length prefix 1) bs)
Just{} -> (ByteStringUTF8 bs, mempty)
where (prefix, suffix) = ByteString.breakEnd byteStartsCharacter bs
l = ByteString.last bs
instance Monoid ByteStringUTF8 where
mempty = ByteStringUTF8 ByteString.empty
ByteStringUTF8 a `mappend` ByteStringUTF8 b = ByteStringUTF8 (a `mappend` b)
instance MonoidNull ByteStringUTF8 where
null (ByteStringUTF8 b) = ByteString.null b
instance LeftReductiveMonoid ByteStringUTF8 where
stripPrefix (ByteStringUTF8 a) (ByteStringUTF8 b) = fmap ByteStringUTF8 (stripPrefix a b)
ByteStringUTF8 a `isPrefixOf` ByteStringUTF8 b = a `isPrefixOf` b
instance LeftCancellativeMonoid ByteStringUTF8
instance LeftGCDMonoid ByteStringUTF8 where
commonPrefix (ByteStringUTF8 a) (ByteStringUTF8 b) = ByteStringUTF8 (commonPrefix a b)
stripCommonPrefix (ByteStringUTF8 a) (ByteStringUTF8 b) = wrapTriple (stripCommonPrefix a b)
instance Show ByteStringUTF8 where
showsPrec _ bs s = '"' : Textual.foldr showsBytes (:) ('"' : s) bs
where showsBytes (ByteStringUTF8 b) s = '\\' : shows (ByteString.unpack b) s
instance IsString ByteStringUTF8 where
fromString = ByteStringUTF8 . Foldable.foldMap fromChar
instance PositiveMonoid ByteStringUTF8
instance FactorialMonoid ByteStringUTF8 where
splitPrimePrefix utf8@(ByteStringUTF8 bs)
| ByteString.null bs = Nothing
| unsafeHead bs < 0x80 = Just (wrapPair $ ByteString.splitAt 1 bs)
| otherwise = case ByteString.findIndex byteStartsCharacter (unsafeTail bs)
of Just i -> Just (wrapPair $ ByteString.splitAt (succ i) bs)
Nothing -> Just (utf8, ByteStringUTF8 $ ByteString.empty)
splitPrimeSuffix utf8@(ByteStringUTF8 bs)
| ByteString.null bs = Nothing
| ByteString.null prefix = Just (wrapPair split)
| not (ByteString.null suffix) && ByteString.last prefix < 0x80 = Just (wrapPair split)
| otherwise = Just (wrapPair $ ByteString.splitAt (pred $ ByteString.length prefix) bs)
where split@(prefix, suffix) = ByteString.breakEnd byteStartsCharacter bs
primePrefix utf8@(ByteStringUTF8 bs)
| ByteString.null bs = utf8
| unsafeHead bs < 0x80 = ByteStringUTF8 (ByteString.take 1 bs)
| otherwise = case ByteString.findIndex byteStartsCharacter (unsafeTail bs)
of Just i -> ByteStringUTF8 (ByteString.take (succ i) bs)
Nothing -> utf8
factors (ByteStringUTF8 bs) = List.map ByteStringUTF8 $ ByteString.groupBy continued bs
where continued a b = a >= 0x80 && b >= 0x80 && b < 0xC0
length (ByteStringUTF8 bs) = fst (ByteString.foldl' count (0, False) bs)
where count (n, high) byte | byte < 0x80 = (succ n, False)
| byte < 0xC0 = (if high then n else succ n, True)
| otherwise = (succ n, True)
foldl f a0 (ByteStringUTF8 bs) = List.foldl f' a0 (groupASCII bs)
where f' a b | unsafeHead b < 0x80 = ByteString.foldl f'' a b
| otherwise = f a (ByteStringUTF8 b)
f'' a w = f a (ByteStringUTF8 $ ByteString.singleton w)
foldl' f a0 (ByteStringUTF8 bs) = List.foldl' f' a0 (groupASCII bs)
where f' a b | unsafeHead b < 0x80 = ByteString.foldl' f'' a b
| otherwise = f a (ByteStringUTF8 b)
f'' a w = f a (ByteStringUTF8 $ ByteString.singleton w)
foldr f a0 (ByteStringUTF8 bs) = List.foldr f' a0 (groupASCII bs)
where f' b a | unsafeHead b < 0x80 = ByteString.foldr f'' a b
| otherwise = f (ByteStringUTF8 b) a
f'' w a = f (ByteStringUTF8 $ ByteString.singleton w) a
splitAt n (ByteStringUTF8 bs) = wrapPair (ByteString.splitAt (charStartIndex n bs) bs)
take n (ByteStringUTF8 bs) = ByteStringUTF8 (ByteString.take (charStartIndex n bs) bs)
drop n (ByteStringUTF8 bs) = ByteStringUTF8 (ByteString.drop (charStartIndex n bs) bs)
dropWhile p (ByteStringUTF8 bs) = dropASCII bs
where dropASCII bs =
let suffix = ByteString.dropWhile (\w-> w < 0x80 && p (ByteStringUTF8 $ ByteString.singleton w)) bs
in if ByteString.null suffix || unsafeHead suffix < 0x80
then ByteStringUTF8 suffix
else dropMultiByte suffix
dropMultiByte bs =
let utf8 = ByteStringUTF8 bs
in case ByteString.findIndex byteStartsCharacter (unsafeTail bs)
of Nothing -> if p utf8 then ByteStringUTF8 ByteString.empty else utf8
Just i -> let (hd, tl) = ByteString.splitAt (succ i) bs
in if p (ByteStringUTF8 hd)
then dropASCII tl
else utf8
takeWhile p utf8@(ByteStringUTF8 bs) =
ByteStringUTF8 $ ByteString.take (ByteString.length bs ByteString.length s) bs
where suffix@(ByteStringUTF8 s) = Factorial.dropWhile p utf8
span p utf8@(ByteStringUTF8 bs) =
(ByteStringUTF8 $ ByteString.take (ByteString.length bs ByteString.length s) bs, suffix)
where suffix@(ByteStringUTF8 s) = Factorial.dropWhile p utf8
break p = Factorial.span (not . p)
reverse (ByteStringUTF8 bs) =
ByteStringUTF8 (ByteString.concat $ List.reverse $ List.map reverseASCII $ groupASCII bs)
where reverseASCII b | unsafeHead b < 0x80 = ByteString.reverse b
| otherwise = b
instance TextualMonoid ByteStringUTF8 where
singleton = ByteStringUTF8 . fromChar
splitCharacterPrefix (ByteStringUTF8 bs) = ByteString.uncons bs >>= uncurry toChar
foldl ft fc a0 (ByteStringUTF8 bs) = List.foldl f a0 (groupASCII bs)
where f a b = let hd = unsafeHead b
in if hd < 0x80
then ByteString.Char8.foldl fc a b
else maybe (ft a $ ByteStringUTF8 b) (fc a . fst) (toChar hd $ unsafeTail b)
foldl' ft fc a0 (ByteStringUTF8 bs) = List.foldl' f a0 (groupASCII bs)
where f a b = let hd = unsafeHead b
in if hd < 0x80
then ByteString.Char8.foldl' fc a b
else maybe (ft a $ ByteStringUTF8 b) (fc a . fst) (toChar hd $ unsafeTail b)
foldr ft fc a0 (ByteStringUTF8 bs) = List.foldr f a0 (groupASCII bs)
where f b a = let hd = unsafeHead b
in if hd < 0x80
then ByteString.Char8.foldr fc a b
else maybe (ft (ByteStringUTF8 b) a) (flip fc a . fst) (toChar hd $ unsafeTail b)
dropWhile pb pc (ByteStringUTF8 bs) = ByteStringUTF8 $ dropASCII bs
where dropASCII rest = case ByteString.Char8.findIndex (\c-> c > '\x7f' || not (pc c)) rest
of Nothing -> ByteString.empty
Just j -> let rest' = unsafeDrop j rest
in if unsafeHead rest' > 0x7f
then dropMultiByte rest'
else rest'
dropMultiByte rest = case splitCharacterPrefix (ByteStringUTF8 rest)
of Just (c, ByteStringUTF8 rest') | pc c -> dropASCII rest'
Nothing -> let j = succ (headIndex $ drop 1 rest)
in if pb (ByteStringUTF8 $ ByteString.take j rest)
then dropASCII (unsafeDrop j rest)
else rest
_ -> rest
takeWhile pb pc utf8@(ByteStringUTF8 bs) =
ByteStringUTF8 $ ByteString.take (ByteString.length bs ByteString.length suffix) bs
where ByteStringUTF8 suffix = Textual.dropWhile pb pc utf8
span pb pc utf8@(ByteStringUTF8 bs) =
wrapPair $ ByteString.splitAt (ByteString.length bs ByteString.length suffix) bs
where ByteStringUTF8 suffix = Textual.dropWhile pb pc utf8
break pb pc = Textual.span (not . pb) (not . pc)
wrapPair (bs1, bs2) = (ByteStringUTF8 bs1, ByteStringUTF8 bs2)
wrapTriple (bs1, bs2, bs3) = (ByteStringUTF8 bs1, ByteStringUTF8 bs2, ByteStringUTF8 bs3)
fromChar :: Char -> ByteString
fromChar c | c < '\x80' = ByteString.Char8.singleton c
| c < '\x800' = ByteString.pack [0xC0 + fromIntegral (shiftR n 6),
0x80 + fromIntegral (n .&. 0x3F)]
| c < '\x10000' = ByteString.pack [0xE0 + fromIntegral (shiftR n 12),
0x80 + fromIntegral (shiftR n 6 .&. 0x3F),
0x80 + fromIntegral (n .&. 0x3F)]
| n < 0x200000 = ByteString.pack [0xF0 + fromIntegral (shiftR n 18),
0x80 + fromIntegral (shiftR n 12 .&. 0x3F),
0x80 + fromIntegral (shiftR n 6 .&. 0x3F),
0x80 + fromIntegral (n .&. 0x3F)]
where n = ord c
toChar :: Word8 -> ByteString -> Maybe (Char, ByteStringUTF8)
toChar hd tl | hd < 0x80 = Just (chr $ fromIntegral hd, ByteStringUTF8 tl)
| hd < 0xC2 = Nothing
| hd < 0xE0 = do (b0, t0) <- ByteString.uncons tl
if headIndex tl == 1
then return (chr (shiftL (fromIntegral hd .&. 0x1F) 6
.|. fromIntegral b0 .&. 0x3F),
ByteStringUTF8 t0)
else Nothing
| hd < 0xF0 = do (b1, t1) <- ByteString.uncons tl
(b0, t0) <- ByteString.uncons t1
if (hd > 0xE0 || b1 >= 0xA0) && headIndex tl == 2
then return (chr (shiftL (fromIntegral hd .&. 0xF) 12
.|. shiftL (fromIntegral b1 .&. 0x3F) 6
.|. fromIntegral b0 .&. 0x3F),
ByteStringUTF8 t0)
else Nothing
| hd < 0xF4 = do (b2, t2) <- ByteString.uncons tl
(b1, t1) <- ByteString.uncons t2
(b0, t0) <- ByteString.uncons t1
if (hd > 0xF0 || b2 >= 0x90) && headIndex tl == 3
then return (chr (shiftL (fromIntegral hd .&. 0x7) 18
.|. shiftL (fromIntegral b2 .&. 0x3F) 12
.|. shiftL (fromIntegral b1 .&. 0x3F) 6
.|. fromIntegral b0 .&. 0x3F),
ByteStringUTF8 t0)
else Nothing
| otherwise = Nothing
groupASCII :: ByteString -> [ByteString]
groupASCII = ByteString.groupBy continued
where continued a b = (a < 0x80) == (b < 0x80) && b < 0xC0
headIndex bs = fromMaybe (ByteString.length bs) $ ByteString.findIndex byteStartsCharacter bs
byteStartsCharacter :: Word8 -> Bool
byteStartsCharacter b = b < 0x80 || b >= 0xC0
charStartIndex :: Int -> ByteString -> Int
charStartIndex n _ | n <= 0 = 0
charStartIndex n bs =
case List.drop (pred n) (ByteString.findIndices byteStartsCharacter $ ByteString.drop 1 bs)
of [] -> ByteString.length bs
k:_ -> succ k