module Holumbus.Crawler.PdfToText
where
import Control.Concurrent.MVar
import qualified Control.Exception as CE
import qualified Data.ByteString.Lazy as BS
import Data.String.Unicode ( utf8ToUnicode )
import System.Directory ( getTemporaryDirectory
, removeFile
)
import System.FilePath ( (</>) )
import System.Process ( rawSystem )
import System.Posix.Process ( getProcessID )
import System.IO.Unsafe ( unsafePerformIO )
import Text.XML.HXT.Core
pdfToText :: String -> IO String
pdfToText = pdfToTextBS . BS.pack . map (toEnum . fromEnum)
pdfToTextBS :: BS.ByteString -> IO String
pdfToTextBS inp = ( do
fns@(fn1, fn2) <- requestPdf
BS.writeFile fn1 inp
_ <- rawSystem "pdftotext" ["-q", "-enc", "UTF-8", fn1, fn2]
removeFile fn1
res <- BS.readFile fn2
BS.length res `seq`
removeFile fn2
releasePdf fns
return ( fst . utf8ToUnicode . map (toEnum . fromEnum) . BS.unpack $ res )
) `mycatch` ( const $ return "" )
where
mycatch :: IO a -> (CE.SomeException -> IO a) -> IO a
mycatch = CE.catch
pdfToTextA :: IOSArrow String String
pdfToTextA = perform ( traceString 2 (("pdfToTextA input:\n" ++) . take 128 . show) )
>>>
arrIO pdfToText
>>>
perform ( traceString 2 (( "pdfToText result:\n" ++ ) . take 128 . show) )
pdfResource :: MVar (FilePath, FilePath)
pdfResource
= unsafePerformIO $ tmpFiles >>= newMVar
where
tmpFiles
= do td <- getTemporaryDirectory
pid <- getProcessID
let fn1 = fn td pid "pdfToText.pdf"
let fn2 = fn td pid "pdfToText.txt"
return (fn1, fn2)
fn d p f
= d </> (show p ++ "-" ++ f)
requestPdf :: IO (FilePath, FilePath)
requestPdf = takeMVar pdfResource
releasePdf :: (FilePath, FilePath) -> IO ()
releasePdf = putMVar pdfResource