{-# LANGUAGE BangPatterns #-} {-# LANGUAGE PackageImports #-} {-# OPTIONS_HADDOCK hide #-} ----------------------------------------------------------------------------- -- | -- Module : Codec.Archive.Tar.Read -- Copyright : (c) 2007 Bjorn Bringert, -- 2008 Andrea Vezzosi, -- 2008-2009 Duncan Coutts, -- 2011 Max Bolingbroke -- License : BSD3 -- -- Maintainer : duncan@community.haskell.org -- Portability : portable -- ----------------------------------------------------------------------------- module Codec.Archive.Tar.Read ( read , FormatError(..) ) where import Codec.Archive.Tar.PackAscii import Codec.Archive.Tar.Types import Data.Char (ord) import Data.Int (Int64) import Data.Bits (Bits(shiftL, (.&.), complement)) import Control.Exception (Exception(..)) import Data.Typeable (Typeable) import Control.Applicative import Control.Monad import Control.DeepSeq import Control.Monad.Trans.State.Lazy import qualified Data.ByteString as BS import qualified Data.ByteString.Char8 as BS.Char8 import qualified Data.ByteString.Unsafe as BS import qualified Data.ByteString.Lazy as LBS import System.IO.Unsafe (unsafePerformIO) import "os-string" System.OsString.Posix (PosixString, PosixChar) import qualified "os-string" System.OsString.Posix as PS import Prelude hiding (read) -- | Errors that can be encountered when parsing a Tar archive. data FormatError = TruncatedArchive | ShortTrailer | BadTrailer | TrailingJunk | ChecksumIncorrect | NotTarFormat | UnrecognisedTarFormat | HeaderBadNumericEncoding deriving (Eq, Show, Typeable) instance Exception FormatError where displayException TruncatedArchive = "truncated tar archive" displayException ShortTrailer = "short tar trailer" displayException BadTrailer = "bad tar trailer" displayException TrailingJunk = "tar file has trailing junk" displayException ChecksumIncorrect = "tar checksum error" displayException NotTarFormat = "data is not in tar format" displayException UnrecognisedTarFormat = "tar entry not in a recognised format" displayException HeaderBadNumericEncoding = "tar header is malformed (bad numeric encoding)" instance NFData FormatError where rnf !_ = () -- enumerations are fully strict by construction -- | Convert a data stream in the tar file format into an internal data -- structure. Decoding errors are reported by the 'Fail' constructor of the -- 'Entries' type. -- -- * The conversion is done lazily. -- read :: LBS.ByteString -> Entries FormatError read = evalState (readStreaming getN get) where getN :: Int64 -> State LBS.ByteString LBS.ByteString getN n = do (pref, st) <- LBS.splitAt n <$> get put st pure pref readStreaming :: Monad m => (Int64 -> m LBS.ByteString) -> m LBS.ByteString -> m (Entries FormatError) readStreaming = (unfoldEntriesM id .) . getEntryStreaming getEntryStreaming :: Monad m => (Int64 -> m LBS.ByteString) -> m LBS.ByteString -> m (Either FormatError (Maybe Entry)) getEntryStreaming getN getAll = do header <- getN 512 if LBS.length header < 512 then pure (Left TruncatedArchive) else do -- Tar files end with at least two blocks of all '0'. Checking this serves -- two purposes. It checks the format but also forces the tail of the data -- which is necessary to close the file if it came from a lazily read file. -- -- It's tempting to fall into trailer parsing as soon as LBS.head bs == '\0', -- because, if interpreted as an 'Entry', it means that 'entryTarPath' is an empty -- string. Yet it's not a concern of this function: parse it as an 'Entry' -- and let further pipeline such as 'checkEntrySecurity' deal with it. After all, -- it might be a format extension with unknown semantics. Such somewhat malformed -- archives do exist in the wild, see https://github.com/haskell/tar/issues/73. -- -- Only if an entire block is null, we assume that we are parsing a trailer. if LBS.all (== 0) header then do nextBlock <- getN 512 if LBS.length nextBlock < 512 then pure (Left ShortTrailer) else if LBS.all (== 0) nextBlock then do remainder <- getAll pure $ if LBS.all (== 0) remainder then Right Nothing else Left TrailingJunk else pure (Left BadTrailer) else case parseHeader header of Left err -> pure $ Left err Right (name, mode, uid, gid, size, mtime, typecode, linkname, format, uname, gname, devmajor, devminor, prefix) -> do -- It is crucial to get (size + padding) in one monadic operation -- and drop padding in a pure computation. If you get size bytes first, -- then skip padding, unpacking in constant memory will become impossible. let paddedSize = (size + 511) .&. complement 511 paddedContent <- getN paddedSize let content = LBS.take size paddedContent pure $ Right $ Just $ Entry { entryTarPath = TarPath (byteToPosixString name) (byteToPosixString prefix), entryContent = case typecode of '\0' -> NormalFile content size '0' -> NormalFile content size '1' -> HardLink (LinkTarget $ byteToPosixString linkname) '2' -> SymbolicLink (LinkTarget $ byteToPosixString linkname) _ | format == V7Format -> OtherEntryType typecode content size '3' -> CharacterDevice devmajor devminor '4' -> BlockDevice devmajor devminor '5' -> Directory '6' -> NamedPipe '7' -> NormalFile content size _ -> OtherEntryType typecode content size, entryPermissions = mode, entryOwnership = Ownership (BS.Char8.unpack uname) (BS.Char8.unpack gname) uid gid, entryTime = mtime, entryFormat = format } parseHeader :: LBS.ByteString -> Either FormatError (BS.ByteString, Permissions, Int, Int, Int64, EpochTime, Char, BS.ByteString, Format, BS.ByteString, BS.ByteString, DevMajor, DevMinor, BS.ByteString) parseHeader header' = do case (chksum_, format_ magic) of (Right chksum, _ ) | correctChecksum header chksum -> return () (Right _, Right _) -> Left ChecksumIncorrect _ -> Left NotTarFormat mode <- mode_ uid <- uid_ gid <- gid_ size <- size_ mtime <- mtime_ format <- format_ magic devmajor <- devmajor_ devminor <- devminor_ pure (name, mode, uid, gid, size, mtime, typecode, linkname, format, uname, gname, devmajor, devminor, prefix) where header = LBS.toStrict header' name = getString 0 100 header mode_ = getOct 100 8 header uid_ = getOct 108 8 header gid_ = getOct 116 8 header size_ = getOct 124 12 header mtime_ = getOct 136 12 header chksum_ = getOct 148 8 header typecode = getByte 156 header linkname = getString 157 100 header magic = getChars 257 8 header uname = getString 265 32 header gname = getString 297 32 header devmajor_ = getOct 329 8 header devminor_ = getOct 337 8 header prefix = getString 345 155 header -- trailing = getBytes 500 12 header format_ :: BS.ByteString -> Either FormatError Format format_ magic | magic == ustarMagic = return UstarFormat | magic == gnuMagic = return GnuFormat | magic == v7Magic = return V7Format | otherwise = Left UnrecognisedTarFormat v7Magic, ustarMagic, gnuMagic :: BS.ByteString v7Magic = BS.Char8.pack "\0\0\0\0\0\0\0\0" ustarMagic = BS.Char8.pack "ustar\NUL00" gnuMagic = BS.Char8.pack "ustar \NUL" correctChecksum :: BS.ByteString -> Int -> Bool correctChecksum header checksum = checksum == checksum' where -- sum of all 512 bytes in the header block, -- treating each byte as an 8-bit unsigned value sumchars = BS.foldl' (\x y -> x + fromIntegral y) 0 -- treating the 8 bytes of chksum as blank characters. checksum' = sumchars (BS.take 148 header) + 256 -- 256 = sumchars (BS.Char8.replicate 8 ' ') + sumchars (BS.drop 156 header) -- * TAR format primitive input {-# SPECIALISE getOct :: Int -> Int -> BS.ByteString -> Either FormatError Int #-} {-# SPECIALISE getOct :: Int -> Int -> BS.ByteString -> Either FormatError Int64 #-} getOct :: (Integral a, Bits a) => Int -> Int -> BS.ByteString -> Either FormatError a getOct off len = parseOct . getBytes off len where -- As a star extension, octal fields can hold a base-256 value if the high -- bit of the initial character is set. The initial character can be: -- 0x80 ==> trailing characters hold a positive base-256 value -- 0xFF ==> trailing characters hold a negative base-256 value -- -- In both cases, there won't be a trailing NUL/space. -- -- GNU tar seems to contain a half-implementation of code that deals with -- extra bits in the first character, but I don't think it works and the -- docs I can find on star seem to suggest that these will always be 0, -- which is what I will assume. parseOct s | BS.head s == 128 = return (readBytes (BS.tail s)) | BS.head s == 255 = return (negate (readBytes (BS.tail s))) parseOct s | BS.null stripped = return 0 | otherwise = case readOct stripped of Just x -> return x Nothing -> Left HeaderBadNumericEncoding where stripped = BS.Char8.takeWhile (\c -> c /= '\NUL' && c /= ' ') $ BS.Char8.dropWhile (== ' ') s readBytes :: (Integral a, Bits a) => BS.ByteString -> a readBytes = BS.foldl' (\acc x -> acc `shiftL` 8 + fromIntegral x) 0 getBytes :: Int -> Int -> BS.ByteString -> BS.ByteString getBytes off len = BS.take len . BS.drop off getByte :: Int -> BS.ByteString -> Char getByte off bs = BS.Char8.index bs off getChars :: Int -> Int -> BS.ByteString -> BS.ByteString getChars = getBytes getString :: Int -> Int -> BS.ByteString -> BS.ByteString getString off len = BS.copy . BS.Char8.takeWhile (/='\0') . getBytes off len {-# SPECIALISE readOct :: BS.ByteString -> Maybe Int #-} {-# SPECIALISE readOct :: BS.ByteString -> Maybe Int64 #-} readOct :: Integral n => BS.ByteString -> Maybe n readOct = go 0 0 where go :: Integral n => Int -> n -> BS.ByteString -> Maybe n go !i !n !bs = case BS.uncons bs of Nothing -> if i == 0 then Nothing else Just n Just (w, tl) | w >= 0x30 && w <= 0x39 -> go (i+1) (n * 8 + (fromIntegral w - 0x30)) tl | otherwise -> Nothing