{-# LANGUAGE NumericUnderscores #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TupleSections #-} module Streamly.External.Archive.Tests (tests) where import qualified Codec.Archive.Tar as Tar import Codec.Compression.GZip import Control.Concurrent.Async import Control.Monad import Crypto.Random.Entropy import Data.Bifunctor import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as BC import qualified Data.ByteString.Lazy as LB import Data.Char import Data.Function import Data.Functor import Data.List import Data.List.Split import Data.Maybe import qualified Data.Set as Set import Data.Word import qualified Streamly.Data.Fold as F import qualified Streamly.Data.Stream.Prelude as S import Streamly.External.Archive import Streamly.External.Archive.Internal.Foreign import System.Directory import System.FilePath import System.IO.Temp import Test.QuickCheck import Test.QuickCheck.Monadic import Test.Tasty import Test.Tasty.HUnit import Test.Tasty.QuickCheck tests :: [TestTree] tests = [ testTar False, testTar True, testSparse, testChunkOnAndChunkOnFold, testEitherByLeft ] -- | Use other libraries to create a tar (or tar.gz) file containing random data, read the file back -- using our library, and check if the results are as expected. testTar :: Bool -> TestTree testTar gz = testProperty ("tar (" ++ (if gz then "gz" else "no gz") ++ ")") $ monadicIO $ do -- Generate a random file system hierarchy for writing to disk. hierarchyToWrite <- pick $ randomHierarchy "" 4 4 5 numThreads <- pick $ chooseInt (1, 4) -- Of the hierarchy we wrote, sometimes we only read some of them back. readSome :: Bool <- pick arbitrary readSomePaths <- pick $ sublistOf hierarchyToWrite <&> Set.fromList . map (("files/" ++) . fst) -- Make comparable to what our library reads back. -- Create a new temporary directory, write our hierarchy into a "files" subdirectory of the -- temporary directory, use other libraries to create files.tar (or files.tar.gz), read the file -- back using our library, and check if the results are as expected. run . withSystemTempDirectory "archive-streaming-testZip" $ \tmpDir -> do let filesDir = joinPath [tmpDir, "files"] createDirectoryIfMissing True filesDir writePathsAndByteStrings <- writeHierarchy filesDir hierarchyToWrite let archFile = joinPath [tmpDir, "files.tar" ++ (if gz then ".gz" else "")] LB.writeFile archFile . (if gz then compress else id) . Tar.write =<< Tar.pack tmpDir ["files"] let fileFold = F.foldlM' ( \(mfp, mtyp, msz, mbs) e -> case e of Left h -> do mfp_ <- headerPathName h mtyp_ <- headerFileType h msz_ <- headerSize h return (BC.unpack <$> mfp_, mtyp_, msz_, mbs) Right bs -> return ( mfp, mtyp, msz, case mbs of Nothing -> Just bs Just bs' -> Just $ bs' `BC.append` bs ) ) (return (Nothing, Nothing, Nothing, Nothing)) readPathsFileTypesSizesAndByteStringss <- replicateConcurrently numThreads $ S.unfold readArchive ( if readSome then mapHeaderMaybe ( \h -> do p <- fromJust <$> headerPathName h return $ if BC.unpack p `Set.member` readSomePaths then Just h else Nothing ) else id, archFile ) & groupByLeft fileFold & fmap (\(mfp, mtyp, msz, mbs) -> (fromJust mfp, fromJust mtyp, msz, mbs)) & S.toList threadResults <- forM readPathsFileTypesSizesAndByteStringss $ \readPathsFileTypesSizesAndByteStrings -> do let readPathAndByteStrings = sort . map (\(x, _, _, y) -> (x, y)) $ readPathsFileTypesSizesAndByteStrings let writePathsAndByteStrings2 = sort . (if readSome then filter (\(x, _) -> x `Set.member` readSomePaths) else id) . map (first ("files/" ++)) -- Make comparable to what our library reads back. $ ("", Nothing) : writePathsAndByteStrings let samePathsAndByteStrings = writePathsAndByteStrings2 == readPathAndByteStrings -- Check FileType. let fileTypesCorrect = all ( \(fp, typ, _, _) -> if hasTrailingPathSeparator fp then typ == FileTypeDirectory else typ == FileTypeRegular ) readPathsFileTypesSizesAndByteStrings -- Check header file size. let fileSizeCorrect = all ( \(_, _, msz, mbs) -> case (msz, mbs) of (Nothing, _) -> False -- The size is always available. (Just sz, Nothing) -> sz == 0 -- File or directory. (Just sz, Just bs) -> fromIntegral sz == B.length bs ) readPathsFileTypesSizesAndByteStrings return $ samePathsAndByteStrings && fileTypesCorrect && fileSizeCorrect return $ and threadResults -- | Read a fixed sparse file (sparse.tar) using our library and make sure the results are as -- expected. (The file was created manually on Linux with "cp --sparse=always" to create the sparse -- files and "tar -Scvf" to create the archive. We were unable to do the equivalent thing on macOS -- Mojave / APFS.) testSparse :: TestTree testSparse = testProperty "sparse" $ monadicIO $ do let fileFold = F.foldlM' ( \(mfp, mbs) e -> case e of Left h -> do mfp_ <- headerPathName h return (BC.unpack <$> mfp_, mbs) Right bs -> return ( mfp, case mbs of Nothing -> Just bs Just bs' -> Just $ bs' `BC.append` bs ) ) (return (Nothing, Nothing)) numThreads <- pick $ chooseInt (1, 4) archives <- run $ replicateConcurrently numThreads $ S.unfold readArchive (id, "test/data/sparse.tar") & groupByLeft fileFold & fmap (\(mfp, mbs) -> (fromJust mfp, fromJust mbs)) & S.toList threadResults <- forM archives $ \archive -> do let validPaths = map fst archive == ["zero", "zeroZero", "zeroAsdf", "asdfZero"] let tenMb = 10_000_000 let zero = B.replicate tenMb 0 let asdf = "asdf" let validByteString1 = snd (head archive) == zero let validByteString2 = snd (archive !! 1) == zero `B.append` zero let validByteString3 = snd (archive !! 2) == zero `B.append` asdf let validByteString4 = snd (archive !! 3) == asdf `B.append` zero return $ and [ validPaths, validByteString1, validByteString2, validByteString3, validByteString4 ] return $ and threadResults testChunkOnAndChunkOnFold :: TestTree testChunkOnAndChunkOnFold = testProperty "chunkOn/chunkOnFold" $ monadicIO $ do -- Although we say “lines,” our splitWd is an arbitrary non-printable ASCII character. splitWd :: Word8 <- pick $ elements [0 .. 31] let maxLineLen = 50 maxChunkSz = 100 genLine = do lineLen <- choose (0, maxLineLen) B.pack <$> -- For easier debug visualization, lines have only a to z. replicateM lineLen (elements [97 .. 122]) genLines = do numLines <- pick $ choose (0, 10) if numLines <= 2 then replicateM numLines $ pick genLine else do lines' <- replicateM (numLines - 2) $ pick genLine -- Make it a bit more likely we begin/end with an empty line begLine <- pick $ frequency [(5, return ""), (95, genLine)] endLine <- pick $ frequency [(5, return ""), (95, genLine)] return $ [begLine] ++ lines' ++ [endLine] chunkSz <- pick $ choose (1, maxChunkSz) let linesToChunks [""] = [""] -- Special case. linesToChunks lns = map B.pack . chunksOf chunkSz . B.unpack -- ["line1", "\n" , "line2"] -> "line1\nline2" . B.concat -- ["line1", "line2"] -> ["line1", "\n" , "line2"] . intersperse (B.singleton splitWd) $ lns -- Most users of streamly-archive will probably have none of these, but we test this case -- nonetheless. linesBeforeFirstFile <- genLines numFiles <- pick $ choose (0, 5) files <- replicateM numFiles $ do fileLines <- genLines fileName :: Int <- pick arbitrary return (fileName, fileLines) let chunks = map Right (linesToChunks linesBeforeFirstFile) ++ concatMap (\(fileName, fileLines) -> Left fileName : map Right (linesToChunks fileLines)) files expectedChunkOnResult = map Right linesBeforeFirstFile ++ concatMap (\(fileName, fileLines) -> Left fileName : map Right fileLines) files chunkOnResult <- S.fromList chunks & chunkOn splitWd & S.fold F.toList chunkOnFoldResult <- S.fromList chunks & S.fold (chunkOnFold splitWd F.toList) return $ chunkOnResult == expectedChunkOnResult && chunkOnFoldResult == expectedChunkOnResult testEitherByLeft :: TestTree testEitherByLeft = testCase "eitherByLeft" $ do let getRes = S.fold F.toList . eitherByLeft . S.fromList res1 <- getRes [Right 10, Left "a", Right 1, Right 2, Left "b", Left "c", Right 20] res1 @?= ([("a", 1), ("a", 2), ("c", 20)] :: [(String, Int)]) res2 <- getRes [] res2 @?= ([] :: [(String, Int)]) -- | Writes a given hierarchy of relative paths (created with 'randomHierarchy') to disk in the -- specified directory and returns the same hierarchy except with actual ByteStrings instead of -- lengths. Note: The original relative paths are returned back unaltered. writeHierarchy :: FilePath -> [(FilePath, Maybe Int)] -> IO [(FilePath, Maybe ByteString)] writeHierarchy writeDir = mapM $ \(p, mBsLen) -> let fullp = joinPath [writeDir, p] in case mBsLen of Just bsLen -> do createDirectoryIfMissing True (takeDirectory fullp) bs <- getEntropy (fromIntegral bsLen) B.writeFile fullp bs return ( p, if bsLen == 0 then Nothing -- Our library yields no ByteString at all for empty files. else Just bs ) Nothing -> createDirectoryIfMissing True fullp >> return (p, Nothing) -- | Recursively generates a random hierarchy of relative paths to files and directories. (Nothing -- is written to disk; only the paths are returned.) The initial dirPath should be "". A random -- bytestring length is provided in case of a file; 'Nothing' in the case of a directory. randomHierarchy :: FilePath -> Int -> Int -> Int -> Gen [(FilePath, Maybe Int)] randomHierarchy dirPath maxFiles maxDirs maxDepth = do numFiles <- choose (0, maxFiles) fileComps <- nub <$> vectorOf numFiles pathComponent let filePaths = map (\c -> joinPath [dirPath, c]) fileComps bsLengths <- map Just <$> vectorOf (length filePaths) ( frequency [ (1, choose (0, 5)), (1, choose (blockSize - 5, blockSize + 5)), (1, choose (0, 3 * blockSize)) ] ) numDirs <- choose (0, maxDirs) dirComps <- nub . filter (not . (`elem` fileComps)) <$> vectorOf (if maxDepth <= 0 then 0 else numDirs) pathComponent -- libarchive reads back directory paths with a trailing separator. let dirPaths = map (\c -> addTrailingPathSeparator $ joinPath [dirPath, c]) dirComps recursion <- concat <$> forM dirPaths ( \dirPath' -> randomHierarchy dirPath' (maxFiles `div` 2) (maxDirs `div` 2) (maxDepth - 1) ) return $ zip filePaths bsLengths ++ map (,Nothing) dirPaths ++ recursion -- | Generates a random path component of length between 1 and 10, e.g., "HO53UVKQ". For -- compatibility with case-insensitive file systems, uses only one case. pathComponent :: Gen String pathComponent = do len <- choose (1, 10) vectorOf len $ chr <$> frequency [ (1, choose (ord 'A', ord 'Z')), (1, choose (ord '0', ord '9')) ]