src/Darcs/Util/ByteString.hs

{-# LANGUAGE BangPatterns, ForeignFunctionInterface, CPP, ScopedTypeVariables #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  Darcs.Util.ByteString
-- Copyright   :  (c) The University of Glasgow 2001,
--                    David Roundy 2003-2005
-- License : GPL (I'm happy to also license this file BSD style but don't
--           want to bother distributing two license files with darcs.
--
-- Maintainer  :  droundy@abridgegame.org
-- Stability   :  experimental
-- Portability :  portable
--
-- GZIp and MMap IO for ByteStrings, encoding utilities, and miscellaneous
-- functions for Data.ByteString
--

module Darcs.Util.ByteString (

        unsafeWithInternals,
        unpackPSFromUTF8,
        packStringToUTF8,

        -- IO with mmap or gzip
        gzReadFilePS,
        mmapFilePS,
        gzWriteFilePS,
        gzWriteFilePSs,
        gzReadStdin,
        gzWriteHandle,
        FileSegment,
        readSegment,

        -- gzip handling
        isGZFile,
        gzDecompress,

        -- list utilities
        dropSpace,
        breakSpace,
        linesPS,
        unlinesPS,
        hashPS,
        breakFirstPS,
        breakLastPS,
        substrPS,
        readIntPS,
        isFunky,
        fromHex2PS,
        fromPS2Hex,
        betweenLinesPS,
        breakAfterNthNewline,
        breakBeforeNthNewline,
        intercalate,

        -- encoding and unicode utilities
        isAscii,
        decodeLocale,
        encodeLocale,
        decodeString
    ) where

import Prelude ()
import Darcs.Prelude

import qualified Data.ByteString            as B
import qualified Data.ByteString.Char8      as BC
import qualified Data.ByteString.Internal   as BI
import qualified Data.ByteString.Lazy.Char8 as BL8
import Data.ByteString (intercalate)
import Data.ByteString.Internal (fromForeignPtr)

import Control.Exception ( catch, SomeException )
import System.IO ( withFile, IOMode(ReadMode)
                 , hSeek, SeekMode(SeekFromEnd,AbsoluteSeek)
                 , openBinaryFile, hClose, Handle, hGetChar
                 , stdin)
import System.IO.Unsafe         ( unsafePerformIO )

import Foreign.Storable         ( peek )
import Foreign.Marshal.Array    ( advancePtr )
import Foreign.C.Types          ( CInt(..) )

import Data.Bits                ( rotateL )
import Data.Char                ( ord, isSpace )
import Data.Word                ( Word8 )
import Data.Int                 ( Int32, Int64 )
import qualified Data.Text as T ( pack, unpack )
import Data.Text.Encoding       ( encodeUtf8, decodeUtf8With )
import Data.Text.Encoding.Error ( lenientDecode )
import Control.Monad            ( when )
#if MIN_VERSION_zlib(0,6,0)
import Control.Monad.ST.Lazy    ( ST )
#endif

import Foreign.Ptr              ( plusPtr, Ptr )
import Foreign.ForeignPtr       ( withForeignPtr )

#ifdef DEBUG_PS
import Foreign.ForeignPtr       ( addForeignPtrFinalizer )
import Foreign.Ptr              ( FunPtr )
#endif

import qualified Data.ByteString.Lazy as BL
import qualified Codec.Compression.GZip as GZ
import qualified Codec.Compression.Zlib.Internal as ZI
import Darcs.Util.Encoding ( decode, encode )
import Darcs.Util.Global ( addCRCWarning )

#if mingw32_HOST_OS
#else
import System.IO.MMap( mmapFileByteString )
import System.Posix.Files( fileSize, getSymbolicLinkStatus )
#endif
import System.Mem( performGC )
import qualified Bundled.Posix as Bundled ( getFileStatus, fileSize )

-- -----------------------------------------------------------------------------
-- obsolete debugging code

-- -----------------------------------------------------------------------------
-- unsafeWithInternals

-- | Do something with the internals of a PackedString. Beware of
-- altering the contents!
unsafeWithInternals :: B.ByteString -> (Ptr Word8 -> Int -> IO a) -> IO a
unsafeWithInternals ps f
 = case BI.toForeignPtr ps of
   (fp,s,l) -> withForeignPtr fp $ \p -> f (p `plusPtr` s) l

-- | readIntPS skips any whitespace at the beginning of its argument, and
-- reads an Int from the beginning of the PackedString.  If there is no
-- integer at the beginning of the string, it returns Nothing, otherwise it
-- just returns the int read, along with a B.ByteString containing the
-- remainder of its input.

readIntPS :: B.ByteString -> Maybe (Int, B.ByteString)
readIntPS = BC.readInt . BC.dropWhile isSpace

-- -----------------------------------------------------------------------------
-- Destructor functions (taking PackedStrings apart)

-- | Decodes a 'ByteString' containing UTF-8 to a 'String'. Decoding errors are
--   flagged with the U+FFFD character.
unpackPSFromUTF8 :: B.ByteString -> String
unpackPSFromUTF8  = T.unpack . decodeUtf8With lenientDecode

packStringToUTF8 :: String -> B.ByteString
packStringToUTF8 = encodeUtf8 . T.pack

------------------------------------------------------------------------
-- A reimplementation of Data.ByteString.Char8.dropSpace, but
-- specialised to darcs' need for a 4 way isspace.
--
-- TODO: if it is safe to use the expanded definition of isSpaceWord8
-- provided by Data.ByteString.Char8, then all this can go.

-- A locale-independent isspace(3) so patches are interpreted the same everywhere.
-- ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r')
isSpaceWord8 :: Word8 -> Bool
isSpaceWord8 = (`elem` [0x20, 0x09, 0x0A, 0x0D])
{-# INLINE isSpaceWord8 #-}

dropSpace :: B.ByteString -> B.ByteString
dropSpace bs = B.dropWhile isSpaceWord8 bs

breakSpace :: B.ByteString -> (B.ByteString, B.ByteString)
breakSpace bs = B.break isSpaceWord8 bs

------------------------------------------------------------------------

{-# INLINE isFunky #-}
isFunky :: B.ByteString -> Bool
isFunky ps = case BI.toForeignPtr ps of
   (x,s,l) ->
    unsafePerformIO $ withForeignPtr x $ \p->
    (/=0) `fmap` has_funky_char (p `plusPtr` s) (fromIntegral l)

foreign import ccall unsafe "fpstring.h has_funky_char" has_funky_char
    :: Ptr Word8 -> CInt -> IO CInt

------------------------------------------------------------------------

-- ByteString rewrites break (=='x') to breakByte 'x'
--  break ((==) x) = breakChar x
--  break (==x) = breakChar x
--

{-
{-# INLINE breakOnPS #-}
breakOnPS :: Char -> B.ByteString -> (B.ByteString, B.ByteString)
breakOnPS c p = case BC.elemIndex c p of
                Nothing -> (p, BC.empty)
                Just n  -> (B.take n p, B.drop n p)
-}

{-# INLINE hashPS #-}
hashPS :: B.ByteString -> Int32
hashPS ps =
   case BI.toForeignPtr ps of
   (x,s,l) ->
    unsafePerformIO $ withForeignPtr x $ \p->
    hash (p `plusPtr` s) l

hash :: Ptr Word8 -> Int -> IO Int32
hash = f (0 :: Int32)
 where f h _ 0 = return h
       f h p n = do x <- peek p
                    let !h' =  fromIntegral x + rotateL h 8
                    f h' (p `advancePtr` 1) (n-1)

{-# INLINE substrPS #-}
substrPS :: B.ByteString -> B.ByteString -> Maybe Int
substrPS tok str
    | B.null tok = Just 0
    | B.length tok > B.length str = Nothing
    | otherwise = do n <- BC.elemIndex (BC.head tok) str
                     let ttok = B.tail tok
                         reststr = B.drop (n+1) str
                     if ttok == B.take (B.length ttok) reststr
                        then Just n
                        else ((n+1)+) `fmap` substrPS tok reststr

------------------------------------------------------------------------

-- TODO: replace breakFirstPS and breakLastPS with definitions based on
-- ByteString's break/breakEnd
{-# INLINE breakFirstPS #-}
breakFirstPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString)
breakFirstPS c p = case BC.elemIndex c p of
                   Nothing -> Nothing
                   Just n -> Just (B.take n p, B.drop (n+1) p)

{-# INLINE breakLastPS #-}
breakLastPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString)
breakLastPS c p = case BC.elemIndexEnd c p of
                  Nothing -> Nothing
                  Just n -> Just (B.take n p, B.drop (n+1) p)

-- TODO: rename
{-# INLINE linesPS #-}
linesPS :: B.ByteString -> [B.ByteString]
linesPS ps
     | B.null ps = [B.empty]
     | otherwise = BC.split '\n' ps

{- QuickCheck property:

import Test.QuickCheck
import qualified Data.ByteString.Char8 as BC
import Data.Char
instance Arbitrary BC.ByteString where
    arbitrary = fmap BC.pack arbitrary
instance Arbitrary Char where
  arbitrary = chr `fmap` choose (32,127)
deepCheck = check (defaultConfig { configMaxTest = 10000})
testLines =  deepCheck (\x -> (linesPS x == linesPSOld x))
linesPSOld ps = case  BC.elemIndex '\n' ps of
             Nothing -> [ps]
             Just n -> B.take n ps : linesPS (B.drop (n+1) ps) -}

{-| This function acts exactly like the "Prelude" unlines function, or like
"Data.ByteString.Char8" 'unlines', but with one important difference: it will
produce a string which may not end with a newline! That is:

> unlinesPS ["foo", "bar"]

evaluates to \"foo\\nbar\", not \"foo\\nbar\\n\"! This point should hold true for
'linesPS' as well.

TODO: rename this function. -}
unlinesPS :: [B.ByteString] -> B.ByteString
unlinesPS [] = BC.empty
unlinesPS x  = BC.init $ BC.unlines x
{-# INLINE unlinesPS #-}
{- QuickCheck property:

testUnlines = deepCheck (\x -> (unlinesPS x == unlinesPSOld x))
unlinesPSOld ss = BC.concat $ intersperse_newlines ss
    where intersperse_newlines (a:b:s) = a : newline : intersperse_newlines (b:s)
          intersperse_newlines s = s
          newline = BC.pack "\n" -}

-- -----------------------------------------------------------------------------
-- gzReadFilePS

-- |Decompress the given bytestring into a lazy list of chunks, along with a boolean
-- flag indicating (if True) that the CRC was corrupted.
-- Inspecting the flag will cause the entire list of chunks to be evaluated (but if
-- you throw away the list immediately this should run in constant space).
gzDecompress :: Maybe Int -> BL.ByteString -> ([B.ByteString], Bool)
gzDecompress mbufsize =
    -- This is what the code would be without the bad CRC recovery logic:
    -- return . BL.toChunks . GZ.decompressWith decompressParams
#if MIN_VERSION_zlib(0,6,0)
    decompressWarn (ZI.decompressST ZI.gzipFormat decompressParams)
#else
    toListWarn . ZI.decompressWithErrors ZI.gzipFormat decompressParams
#endif
  where
        decompressParams = case mbufsize of
                              Just bufsize -> GZ.defaultDecompressParams { GZ.decompressBufferSize = bufsize }
                              Nothing -> GZ.defaultDecompressParams

#if MIN_VERSION_zlib(0,6,0)
        decompressWarn :: (forall s . ZI.DecompressStream (ST s)) -> BL.ByteString -> ([B.ByteString], Bool)
        decompressWarn = ZI.foldDecompressStreamWithInput
                           (\x ~(xs, b) -> (x:xs, b))
                           (\xs -> if BL.null xs
                                      then ([], False)
                                      else error "trailing data at end of compressed stream"
                           )
                           handleBad
#else
        toListWarn :: ZI.DecompressStream -> ([B.ByteString], Bool)
        toListWarn = foldDecompressStream (\x ~(xs, b) -> (x:xs, b)) ([], False) handleBad

        -- cut and paste from Zlib since it's not currently exported (interface not yet certain)
        foldDecompressStream :: (B.ByteString -> a -> a) -> a
                             -> (ZI.DecompressError -> String -> a)
                             -> ZI.DecompressStream -> a
        foldDecompressStream chunk end err = fold
                   where
                       fold ZI.StreamEnd               = end
                       fold (ZI.StreamChunk bs stream) = chunk bs (fold stream)
                       fold (ZI.StreamError code msg)  = err code msg
#endif

        -- For a while a bug in darcs caused gzip files with good data but bad CRCs to be
        -- produced. Trap bad CRC messages, run the specified action to report that it happened,
        -- but continue on the assumption that the data is valid.
#if MIN_VERSION_zlib(0,6,0)
        handleBad (ZI.DataFormatError "incorrect data check") = ([], True)
        handleBad e = error (show e)
#else
        handleBad ZI.DataError "incorrect data check" = ([], True)
        handleBad _ msg = error msg
#endif

isGZFile :: FilePath -> IO (Maybe Int)
isGZFile f = do
    h <- openBinaryFile f ReadMode
    header <- B.hGet h 2
    if header /= BC.pack "\31\139"
       then do hClose h
               return Nothing
       else do hSeek h SeekFromEnd (-4)
               len <- hGetLittleEndInt h
               hClose h
               return (Just len)

-- | Read an entire file, which may or may not be gzip compressed, directly
-- into a 'B.ByteString'.
gzReadFilePS :: FilePath -> IO B.ByteString
gzReadFilePS f = do
    mlen <- isGZFile f
    case mlen of
       Nothing -> mmapFilePS f
       Just len ->
            do -- Passing the length to gzDecompress means that it produces produces one chunk,
               -- which in turn means that B.concat won't need to copy data.
               -- If the length is wrong this will just affect efficiency, not correctness
               let doDecompress buf = let (res, bad) = gzDecompress (Just len) buf
                                      in do when bad $ addCRCWarning f
                                            return res
               compressed <- (BL.fromChunks . return) `fmap` mmapFilePS f
               B.concat `fmap` doDecompress compressed

hGetLittleEndInt :: Handle -> IO Int
hGetLittleEndInt h = do
    b1 <- ord `fmap` hGetChar h
    b2 <- ord `fmap` hGetChar h
    b3 <- ord `fmap` hGetChar h
    b4 <- ord `fmap` hGetChar h
    return $ b1 + 256*b2 + 65536*b3 + 16777216*b4

gzWriteFilePS :: FilePath -> B.ByteString -> IO ()
gzWriteFilePS f ps = gzWriteFilePSs f [ps]

gzWriteFilePSs :: FilePath -> [B.ByteString] -> IO ()
gzWriteFilePSs f pss  =
    BL.writeFile f $ GZ.compress $ BL.fromChunks pss

gzWriteHandle :: Handle -> [B.ByteString] -> IO ()
gzWriteHandle h pss  =
    BL.hPut h $ GZ.compress $ BL.fromChunks pss

-- | Read standard input, which may or may not be gzip compressed, directly
-- into a 'B.ByteString'.
gzReadStdin :: IO B.ByteString
gzReadStdin = do
    header <- B.hGet stdin 2
    rest   <- B.hGetContents stdin
    let allStdin = B.concat [header,rest]
    return $
     if header /= BC.pack "\31\139"
      then allStdin
      else let decompress = fst . gzDecompress Nothing
               compressed = BL.fromChunks [allStdin]
           in
           B.concat $ decompress compressed

-- | Pointer to a filesystem, possibly with start/end offsets. Supposed to be
-- fed to (uncurry mmapFileByteString) or similar.
type FileSegment = (FilePath, Maybe (Int64, Int))

-- | Read in a FileSegment into a Lazy ByteString. Implemented using mmap.
readSegment :: FileSegment -> IO BL.ByteString
readSegment (f,range) = do
    bs <- tryToRead
       `catch` (\(_::SomeException) -> do
                     size <- Bundled.fileSize `fmap` Bundled.getFileStatus f
                     if size == 0
                        then return BC.empty
                        else performGC >> tryToRead)
    return $ BL8.fromChunks [bs]
  where
    tryToRead =
        case range of
            Nothing -> B.readFile f
            Just (off, size) -> withFile f ReadMode $ \h -> do
                hSeek h AbsoluteSeek $ fromIntegral off
                B.hGet h size
{-# INLINE readSegment #-}

-- -----------------------------------------------------------------------------
-- mmapFilePS

-- | Like readFilePS, this reads an entire file directly into a
-- 'B.ByteString', but it is even more efficient.  It involves directly
-- mapping the file to memory.  This has the advantage that the contents of
-- the file never need to be copied.  Also, under memory pressure the page
-- may simply be discarded, wile in the case of readFilePS it would need to
-- be written to swap.  If you read many small files, mmapFilePS will be
-- less memory-efficient than readFilePS, since each mmapFilePS takes up a
-- separate page of memory.  Also, you can run into bus errors if the file
-- is modified.  NOTE: as with 'readFilePS', the string representation in
-- the file is assumed to be ISO-8859-1.

mmapFilePS :: FilePath -> IO B.ByteString
#if mingw32_HOST_OS
mmapFilePS = B.readFile
#else
mmapFilePS f =
  mmapFileByteString f Nothing
   `catch` (\(_ :: SomeException) -> do
                     size <- fileSize `fmap` getSymbolicLinkStatus f
                     if size == 0
                        then return B.empty
                        else performGC >> mmapFileByteString f Nothing)
#endif

-- -------------------------------------------------------------------------
-- fromPS2Hex

foreign import ccall unsafe "static fpstring.h conv_to_hex" conv_to_hex
    :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()

fromPS2Hex :: B.ByteString -> B.ByteString
fromPS2Hex ps = case BI.toForeignPtr ps of
          (x,s,l) ->
           BI.unsafeCreate (2*l) $ \p -> withForeignPtr x $ \f ->
           conv_to_hex p (f `plusPtr` s) $ fromIntegral l

-- -------------------------------------------------------------------------
-- fromHex2PS

foreign import ccall unsafe "static fpstring.h conv_from_hex" conv_from_hex
    :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()

fromHex2PS :: B.ByteString -> B.ByteString
fromHex2PS ps = case BI.toForeignPtr ps of
          (x,s,l) ->
           BI.unsafeCreate (l `div` 2) $ \p -> withForeignPtr x $ \f ->
           conv_from_hex p (f `plusPtr` s) (fromIntegral $ l `div` 2)

-- -------------------------------------------------------------------------
-- betweenLinesPS

-- | betweenLinesPS returns the B.ByteString between the two lines given,
-- or Nothing if they do not appear.

betweenLinesPS :: B.ByteString -> B.ByteString -> B.ByteString
               -> Maybe B.ByteString
betweenLinesPS start end ps
 = case break (start ==) (linesPS ps) of
       (_, _:rest@(bs1:_)) ->
           case BI.toForeignPtr bs1 of
            (ps1,s1,_) ->
             case break (end ==) rest of
               (_, bs2:_) -> case BI.toForeignPtr bs2 of (_,s2,_) -> Just $ fromForeignPtr ps1 s1 (s2 - s1)
               _ -> Nothing
       _ -> Nothing

-- -------------------------------------------------------------------------
-- breakAfterNthNewline

breakAfterNthNewline :: Int -> B.ByteString
                        -> Maybe (B.ByteString, B.ByteString)
breakAfterNthNewline 0 the_ps | B.null the_ps = Just (B.empty, B.empty)
                              | otherwise     = Just (B.empty, the_ps)
breakAfterNthNewline n the_ps =
  go n (B.elemIndices (BI.c2w '\n') the_ps)
  where  go 0 []      = Just (the_ps, B.empty)
         go _ []      = Nothing
         go 1 (i:_)   = Just $ B.splitAt (i+1) the_ps
         go !m (_:is) = go (m-1) is

-- -------------------------------------------------------------------------
-- breakBeforeNthNewline

breakBeforeNthNewline :: Int -> B.ByteString -> (B.ByteString, B.ByteString)
breakBeforeNthNewline 0 the_ps
 | B.null the_ps = (B.empty, B.empty)
breakBeforeNthNewline n the_ps =
  go n (B.elemIndices (BI.c2w '\n') the_ps)
  where  go _ []      = (the_ps, B.empty)
         go 0 (i:_)   = B.splitAt i the_ps
         go !m (_:is) = go (m-1) is

-- | Test if a ByteString is made of ascii characters
isAscii :: B.ByteString -> Bool
isAscii = B.all (< 128)

-- | Decode a ByteString to a String according to the current locale
-- unsafePerformIO in the locale function is ratified by the fact that GHC 6.12
-- and above also supply locale conversion with functions with a pure type.
-- Unrecognized byte sequences in the input are skipped.
decodeLocale :: B.ByteString -> String
decodeLocale = unsafePerformIO . decode

-- | Encode a String to a ByteString with char8 encoding (i.e., the values of the
-- characters become the values of the bytes; if a character value is greater
-- than 255, its byte becomes the character value modulo 256)
encodeChar8 :: String -> B.ByteString
encodeChar8 = B.pack . map (fromIntegral . ord)

-- | Encode a String to a ByteString according to the current locale
encodeLocale :: String -> B.ByteString
encodeLocale = unsafePerformIO . encode

-- | Take a 'String' that represents byte values and re-decode it acording to
-- the current locale.
-- Note: we globally enforce char8 as the default encoding, see "Main" and
-- "Darcs.Utils". This means we get command line args and environment variables
-- as 'String's with char8 encoding, too. So we need this to convert such
-- strings back to the user's encoding.
decodeString :: String -> IO String
decodeString = decode . encodeChar8