Data/String/UTF8.hs

{-# LANGUAGE CPP #-}
#if __GLASGOW_HASKELL__ >= 701
{-# LANGUAGE Trustworthy #-}
#endif
--
-- |
-- Module      :  Data.String.UTF8
-- Copyright   :  (c) Iavor S. Diatchki 2009
-- License     :  BSD3-style (see LICENSE)
--
-- Maintainer  :  emertens@galois.com
-- Stability   :  experimental
-- Portability :  portable
--
{-# LANGUAGE MultiParamTypeClasses, UndecidableInstances #-}
module Data.String.UTF8
  ( -- * Representation
    UTF8
  , UTF8Bytes()
  , fromString
  , toString
  , fromRep
  , toRep
  , G.replacement_char

  -- * Character based operations
  , uncons
  , splitAt
  , take
  , drop
  , span
  , break
  , foldl
  , foldr
  , length
  , lines
  , lines'

  -- * Representation based operations
  , null
  , decode
  , byteSplitAt
  , byteTake
  , byteDrop
  ) where

import Prelude hiding (null,take,drop,span,break
                      ,foldl,foldr,length,lines,splitAt)
import qualified Codec.Binary.UTF8.Generic as G
import Codec.Binary.UTF8.Generic (UTF8Bytes)

-- | The type of strings that are represented using the UTF8 encoding.
-- The parameter is the type of the container for the representation.
newtype UTF8 string = Str string deriving (Eq,Ord)   -- XXX: Is this OK?

instance UTF8Bytes string index => Show (UTF8 string) where
  show x = show (toString x)

fromRep :: string -> UTF8 string
fromRep = Str

toRep :: UTF8 string -> string
toRep (Str x) = x

-- | Converts a Haskell string into a UTF8 encoded string.
-- Complexity: linear.
fromString :: UTF8Bytes string index => String -> UTF8 string
fromString xs = Str (G.fromString xs)

-- | Convert a UTF8 encoded string into a Haskell string.
-- Invalid characters are replaced by 'replacement_char'.
-- Complexity: linear.
toString :: UTF8Bytes string index => UTF8 string -> String
toString (Str xs) = G.toString xs

-- | Checks if there are no more bytes in the underlying representation.
null :: UTF8Bytes string index => UTF8 string -> Bool
null (Str x) = G.null x

-- | Split after a given number of characters.
-- Negative values are treated as if they are 0.
-- See also 'bytesSplitAt'.
splitAt :: UTF8Bytes string index
        => index -> UTF8 string -> (UTF8 string, UTF8 string)
splitAt x (Str bs)  = case G.splitAt x bs of
                        (s1,s2) -> (Str s1, Str s2)

-- | Split after a given number of bytes in the underlying representation.
-- See also 'splitAt'.
byteSplitAt :: UTF8Bytes string index
             => index -> UTF8 string -> (UTF8 string, UTF8 string)
byteSplitAt n (Str x) = case G.bsplit n x of
                          (as,bs) -> (Str as, Str bs)

-- | Take only the given number of bytes from the underlying representation.
-- See also 'take'.
byteTake :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string
byteTake n (Str x) = Str (fst (G.bsplit n x))

-- | Drop the given number of bytes from the underlying representation.
-- See also 'drop'.
byteDrop :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string
byteDrop n (Str x) = Str (G.bdrop n x)


-- | @take n s@ returns the first @n@ characters of @s@.
-- If @s@ has less than @n@ characters, then we return the whole of @s@.
take :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string
take n (Str bs) = Str (G.take n bs)

-- | @drop n s@ returns the @s@ without its first @n@ characters.
-- If @s@ has less than @n@ characters, then we return an empty string.
drop :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string
drop n (Str bs) = Str (G.drop n bs)

-- | Split a string into two parts:  the first is the longest prefix
-- that contains only characters that satisfy the predicate; the second
-- part is the rest of the string.
-- Invalid characters are passed as '\0xFFFD' to the predicate.
span :: UTF8Bytes string index
     => (Char -> Bool) -> UTF8 string -> (UTF8 string, UTF8 string)
span p (Str bs) = case G.span p bs of
                    (s1,s2) -> (Str s1, Str s2)

-- | Split a string into two parts:  the first is the longest prefix
-- that contains only characters that do not satisfy the predicate; the second
-- part is the rest of the string.
-- Invalid characters are passed as 'replacement_char' to the predicate.
break :: UTF8Bytes string index
      => (Char -> Bool) -> UTF8 string -> (UTF8 string, UTF8 string)
break p (Str bs)  = case G.break p bs of
                      (s1,s2) -> (Str s1, Str s2)

-- | Get the first character of a byte string, if any.
-- Invalid characters are replaced by 'replacement_char'.
uncons :: UTF8Bytes string index
       => UTF8 string -> Maybe (Char, UTF8 string)
uncons (Str x)  = do (c,y) <- G.uncons x
                     return (c, Str y)

-- | Extract the first character for the underlying representation,
-- if one is available.  It also returns the number of bytes used
-- in the representation of the character.
-- See also 'uncons', 'dropBytes'.
decode :: UTF8Bytes string index => UTF8 string -> Maybe (Char, index)
decode (Str x)  = G.decode x

-- | Traverse a bytestring (right biased).
foldr :: UTF8Bytes string index => (Char -> a -> a) -> a -> UTF8 string -> a
foldr cons nil (Str cs) = G.foldr cons nil cs

-- | Traverse a bytestring (left biased).
-- This function is strict in the accumulator.
foldl :: UTF8Bytes string index => (a -> Char -> a) -> a -> UTF8 string -> a
foldl add acc (Str cs)  = G.foldl add acc cs

-- | Counts the number of characters encoded in the bytestring.
-- Note that this includes replacement characters.
-- The function is linear in the number of bytes in the representation.
length :: UTF8Bytes string index => UTF8 string -> index
length (Str b) = G.length b

-- | Split a string into a list of lines.
-- Lines are terminated by '\n' or the end of the string.
-- Empty lines may not be terminated by the end of the string.
-- See also 'lines\''.
lines :: UTF8Bytes string index => UTF8 string -> [UTF8 string]
lines (Str b) = map Str (G.lines b)   -- XXX: unnecessary map

-- | Split a string into a list of lines.
-- Lines are terminated by '\n' or the end of the string.
-- Empty lines may not be terminated by the end of the string.
-- This function preserves the terminators.
-- See also 'lines'.
lines' :: UTF8Bytes string index => UTF8 string -> [UTF8 string]
lines' (Str x)  = map Str (G.lines' x)  -- XXX: unnecessary map