Codec/Binary/UTF8/Generic.hs

{-# LANGUAGE CPP, MultiParamTypeClasses, FunctionalDependencies, FlexibleInstances #-}
#if __GLASGOW_HASKELL__ >= 701
{-# LANGUAGE Trustworthy #-}
#endif
--
-- |
-- Module      :  Codec.Binary.UTF8.Generic
-- Copyright   :  (c) Iavor S. Diatchki 2009
-- License     :  BSD3-style (see LICENSE)
--
-- Maintainer  :  emertens@galois.com
-- Stability   :  experimental
-- Portability :  portable
--
module Codec.Binary.UTF8.Generic
  ( UTF8Bytes(..)
  , decode
  , replacement_char
  , uncons
  , splitAt
  , take
  , drop
  , span
  , break
  , fromString
  , toString
  , foldl
  , foldr
  , length
  , lines
  , lines'
  ) where

import Data.Bits
import Data.Int
import Data.Word
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L
import qualified Data.List as List
import Prelude hiding (take,drop,splitAt,span,break,foldr,foldl,length,lines,null,tail)

import Codec.Binary.UTF8.String(encode)

#ifdef BYTESTRING_IN_BASE
import Data.ByteString.Base (unsafeHead, unsafeTail)
#endif

class (Num s, Ord s) => UTF8Bytes b s | b -> s where
  bsplit        :: s -> b -> (b,b)
  bdrop         :: s -> b -> b
  buncons       :: b -> Maybe (Word8,b)
  elemIndex     :: Word8 -> b -> Maybe s
  empty         :: b
  null          :: b -> Bool
  pack          :: [Word8] -> b
  tail          :: b -> b

instance UTF8Bytes B.ByteString Int where
  bsplit        = B.splitAt
  bdrop         = B.drop
  buncons       = unconsB
  elemIndex     = B.elemIndex
  empty         = B.empty
  null          = B.null
  pack          = B.pack
  tail          = B.tail

instance UTF8Bytes L.ByteString Int64 where
  bsplit        = L.splitAt
  bdrop         = L.drop
  buncons       = unconsL
  elemIndex     = L.elemIndex
  empty         = L.empty
  null          = L.null
  pack          = L.pack
  tail          = L.tail

instance UTF8Bytes [Word8] Int where
  bsplit          = List.splitAt
  bdrop           = List.drop
  buncons (x:xs)  = Just (x,xs)
  buncons []      = Nothing
  elemIndex x xs  = List.elemIndex (toEnum (fromEnum x)) xs
  empty           = []
  null            = List.null
  pack            = id
  tail            = List.tail

-- | Converts a Haskell string into a UTF8 encoded bytestring.
{-# SPECIALIZE fromString :: String -> B.ByteString  #-}
{-# SPECIALIZE fromString :: String -> L.ByteString  #-}
{-# SPECIALIZE fromString :: String -> [Word8] #-}
fromString :: UTF8Bytes b s => String -> b
fromString xs = pack (encode xs)

-- | Convert a UTF8 encoded bytestring into a Haskell string.
-- Invalid characters are replaced with '\xFFFD'.
{-# SPECIALIZE toString :: B.ByteString -> String #-}
{-# SPECIALIZE toString :: L.ByteString -> String #-}
{-# SPECIALIZE toString :: [Word8] -> String #-}
toString :: UTF8Bytes b s => b -> String
toString bs = foldr (:) [] bs

-- | This character is used to mark errors in a UTF8 encoded string.
replacement_char :: Char
replacement_char = '\xfffd'

-- | Try to extract a character from a byte string.
-- Returns 'Nothing' if there are no more bytes in the byte string.
-- Otherwise, it returns a decoded character and the number of
-- bytes used in its representation.
-- Errors are replaced by character '\0xFFFD'.

-- XXX: Should we combine sequences of errors into a single replacement
-- character?
{-# SPECIALIZE decode :: B.ByteString -> Maybe (Char,Int) #-}
{-# SPECIALIZE decode :: L.ByteString -> Maybe (Char,Int64) #-}
{-# SPECIALIZE decode :: [Word8]      -> Maybe (Char,Int) #-}
decode :: UTF8Bytes b s => b -> Maybe (Char,s)
decode bs = do (c,cs) <- buncons bs
               return (choose (fromEnum c) cs)
  where
  choose c cs
    | c < 0x80  = (toEnum $ fromEnum c, 1)
    | c < 0xc0  = (replacement_char, 1)
    | c < 0xe0  = bytes2 (mask c 0x1f) cs
    | c < 0xf0  = bytes3 (mask c 0x0f) cs
    | c < 0xf8  = bytes4 (mask c 0x07) cs
    | otherwise = (replacement_char, 1)

  mask c m = fromEnum (c .&. m)

  combine acc r = shiftL acc 6 .|. fromEnum (r .&. 0x3f)

  follower acc r | r .&. 0xc0 == 0x80 = Just (combine acc r)
  follower _ _                        = Nothing

  {-# INLINE get_follower #-}
  get_follower acc cs = do (x,xs) <- buncons cs
                           acc1 <- follower acc x
                           return (acc1,xs)

  bytes2 c cs = case get_follower c cs of
                  Just (d, _) | d >= 0x80  -> (toEnum d, 2)
                              | otherwise  -> (replacement_char, 1)
                  _ -> (replacement_char, 1)

  bytes3 c cs =
    case get_follower c cs of
      Just (d1, cs1) ->
        case get_follower d1 cs1 of
          Just (d, _) | (d >= 0x800 && d < 0xd800) ||
                        (d > 0xdfff && d < 0xfffe) -> (toEnum d, 3)
                      | otherwise -> (replacement_char, 3)
          _ -> (replacement_char, 2)
      _ -> (replacement_char, 1)

  bytes4 c cs =
    case get_follower c cs of
      Just (d1, cs1) ->
        case get_follower d1 cs1 of
          Just (d2, cs2) ->
            case get_follower d2 cs2 of
              Just (d,_) | d >= 0x10000 && d < 0x110000 -> (toEnum d, 4)
                         | otherwise                    -> (replacement_char, 4)
              _ -> (replacement_char, 3)
          _ -> (replacement_char, 2)
      _ -> (replacement_char, 1)


-- | Split after a given number of characters.
-- Negative values are treated as if they are 0.
{-# SPECIALIZE splitAt :: Int   -> B.ByteString -> (B.ByteString,B.ByteString) #-}
{-# SPECIALIZE splitAt :: Int64 -> L.ByteString -> (L.ByteString,L.ByteString) #-}
{-# SPECIALIZE splitAt :: Int   -> [Word8] -> ([Word8],[Word8])    #-}
splitAt :: UTF8Bytes b s => s -> b -> (b,b)
splitAt x bs = loop 0 x bs
  where loop a n _ | n <= 0 = bsplit a bs
        loop a n bs1 = case decode bs1 of
                         Just (_,y) -> loop (a+y) (n-1) (bdrop y bs1)
                         Nothing    -> (bs, empty)

-- | @take n s@ returns the first @n@ characters of @s@.
-- If @s@ has less than @n@ characters, then we return the whole of @s@.
{-# INLINE take #-}
take :: UTF8Bytes b s => s -> b -> b
take n bs = fst (splitAt n bs)

-- | @drop n s@ returns the @s@ without its first @n@ characters.
-- If @s@ has less than @n@ characters, then we return an empty string.
{-# INLINE drop #-}
drop :: UTF8Bytes b s => s -> b -> b
drop n bs = snd (splitAt n bs)

-- | Split a string into two parts:  the first is the longest prefix
-- that contains only characters that satisfy the predicate; the second
-- part is the rest of the string.
-- Invalid characters are passed as '\0xFFFD' to the predicate.
{-# SPECIALIZE span :: (Char -> Bool) -> B.ByteString -> (B.ByteString,B.ByteString) #-}
{-# SPECIALIZE span :: (Char -> Bool) -> L.ByteString -> (L.ByteString,L.ByteString) #-}
{-# SPECIALIZE span :: (Char -> Bool) -> [Word8] -> ([Word8],[Word8])    #-}
span :: UTF8Bytes b s => (Char -> Bool) -> b -> (b,b)
span p bs = loop 0 bs
  where loop a cs = case decode cs of
                      Just (c,n) | p c -> loop (a+n) (bdrop n cs)
                      _ -> bsplit a bs

-- | Split a string into two parts:  the first is the longest prefix
-- that contains only characters that do not satisfy the predicate; the second
-- part is the rest of the string.
-- Invalid characters are passed as '\0xFFFD' to the predicate.
{-# INLINE break #-}
break :: UTF8Bytes b s => (Char -> Bool) -> b -> (b,b)
break p bs = span (not . p) bs

-- | Get the first character of a byte string, if any.
-- Malformed characters are replaced by '\0xFFFD'.
{-# INLINE uncons #-}
uncons :: UTF8Bytes b s => b -> Maybe (Char,b)
uncons bs = do (c,n) <- decode bs
               return (c, bdrop n bs)

-- | Traverse a bytestring (right biased).
{-# SPECIALIZE foldr :: (Char -> a -> a) -> a -> B.ByteString -> a #-}
{-# SPECIALIZE foldr :: (Char -> a -> a) -> a -> L.ByteString -> a #-}
{-# SPECIALIZE foldr :: (Char -> a -> a) -> a -> [Word8]      -> a #-}
foldr :: UTF8Bytes b s => (Char -> a -> a) -> a -> b -> a
foldr cons nil cs = case uncons cs of
                      Just (a,as) -> cons a (foldr cons nil as)
                      Nothing     -> nil

-- | Traverse a bytestring (left biased).
-- This function is strict in the accumulator.
{-# SPECIALIZE foldl :: (a -> Char -> a) -> a -> B.ByteString -> a #-}
{-# SPECIALIZE foldl :: (a -> Char -> a) -> a -> L.ByteString -> a #-}
{-# SPECIALIZE foldl :: (a -> Char -> a) -> a -> [Word8]      -> a #-}
foldl :: UTF8Bytes b s => (a -> Char -> a) -> a -> b -> a
foldl add acc cs  = case uncons cs of
                      Just (a,as) -> let v = add acc a
                                     in seq v (foldl add v as)
                      Nothing     -> acc

-- | Counts the number of characters encoded in the bytestring.
-- Note that this includes replacement characters.
{-# SPECIALIZE length :: B.ByteString -> Int #-}
{-# SPECIALIZE length :: L.ByteString -> Int64 #-}
{-# SPECIALIZE length :: [Word8]      -> Int #-}
length :: UTF8Bytes b s => b -> s
length b = loop 0 b
  where loop n xs = case decode xs of
                      Just (_,m) -> loop (n+1) (bdrop m xs)
                      Nothing -> n

-- | Split a string into a list of lines.
-- Lines are terminated by '\n' or the end of the string.
-- Empty lines may not be terminated by the end of the string.
-- See also 'lines\''.
{-# SPECIALIZE lines :: B.ByteString -> [B.ByteString] #-}
{-# SPECIALIZE lines :: L.ByteString -> [L.ByteString] #-}
{-# SPECIALIZE lines :: [Word8]      -> [[Word8]]       #-}
lines :: UTF8Bytes b s => b -> [b]
lines bs | null bs  = []
lines bs = case elemIndex 10 bs of
             Just x -> let (xs,ys) = bsplit x bs
                       in xs : lines (tail ys)
             Nothing -> [bs]

-- | Split a string into a list of lines.
-- Lines are terminated by '\n' or the end of the string.
-- Empty lines may not be terminated by the end of the string.
-- This function preserves the terminators.
-- See also 'lines'.
{-# SPECIALIZE lines' :: B.ByteString -> [B.ByteString] #-}
{-# SPECIALIZE lines' :: L.ByteString -> [L.ByteString] #-}
{-# SPECIALIZE lines' :: [Word8]      -> [[Word8]]      #-}
lines' :: UTF8Bytes b s => b -> [b]
lines' bs | null bs  = []
lines' bs = case elemIndex 10 bs of
              Just x -> let (xs,ys) = bsplit (x+1) bs
                        in xs : lines' ys
              Nothing -> [bs]

-----------
-- Compatibility functions for base-2

unconsB :: B.ByteString -> Maybe (Word8,B.ByteString)
unconsL :: L.ByteString -> Maybe (Word8,L.ByteString)

#ifdef BYTESTRING_IN_BASE
unconsB bs | B.null bs = Nothing
           | otherwise = Just (unsafeHead bs, unsafeTail bs)

unconsL bs = case L.toChunks bs of
    (x:xs) | not (B.null x)     -> Just (unsafeHead x, L.fromChunks (unsafeTail x:xs))
    _                           -> Nothing
#else
unconsB = B.uncons
unconsL = L.uncons
#endif