module Network.Scraper.State (
get,
runScraper,
InpFilter (..),
FormAttr(..),
Scraper,
getCurrentCursor,
toCursor,
isDisplayed,
hasDisplayNone,
getFormBy,
getCurrentHtml,
fillForm,
hasHide,
getInputs,
postToForm,
printFormNames
) where
import Control.Applicative
import Control.Arrow ((***))
import Control.Lens ((^.))
import Control.Monad
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Trans.Class
import qualified Control.Monad.Trans.State.Strict as ST
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Map as M
import Data.Maybe (fromJust, fromMaybe, isJust,
listToMaybe)
import Data.Monoid
import qualified Data.Text as T
import Data.Text.Encoding (encodeUtf8)
import qualified Data.Text.IO as TIO
import Network.URL
import Network.Wreq (FormParam (..))
import qualified Network.Wreq as Wreq
import Network.Wreq.Session (Session (..), withSession)
import qualified Network.Wreq.Session as Sesh
import Network.Wreq.Types
import Safe
import Text.HTML.DOM (parseLBS)
import Text.XML.Cursor
import qualified Text.XML.Cursor.Generic as CG
data ScraperState =
PS { currentOptions :: Wreq.Options
, currentHtml :: LBS.ByteString
, currentCursor :: Maybe Cursor
, currentSession :: Session
, currentURL :: Maybe URL
} deriving (Show)
type Scraper = ST.StateT ScraperState IO
toCursor = fromDocument . parseLBS
withInitialState :: (ScraperState -> IO a) -> IO a
withInitialState callback = withSession $ \s -> do
let initialState = PS { currentOptions = Wreq.defaults
, currentHtml = ("" :: LBS.ByteString)
, currentCursor = Nothing
, currentSession = s
, currentURL = Nothing
}
callback initialState
runScraper :: Scraper a -> IO a
runScraper k = withInitialState (evalScraperWith k)
evalScraperWith :: Scraper a -> ScraperState -> IO a
evalScraperWith k s = ST.evalStateT k s
setCurrentOptions :: Wreq.Options -> Scraper ()
setCurrentOptions o = do
scraper <- ST.get
ST.put $ scraper { currentOptions = o }
setCurrentURL :: Maybe URL -> Scraper ()
setCurrentURL u = ST.get >>= (\s -> ST.put $ s { currentURL = u })
getCurrentURL :: Scraper(Maybe URL)
getCurrentURL = ST.get >>= return . currentURL
getCurrentHtml :: Scraper(LBS.ByteString)
getCurrentHtml = ST.get >>= return . currentHtml
getCurrentCursor :: Scraper (Maybe Cursor)
getCurrentCursor = do
scraper <- ST.get
return $ currentCursor scraper
getCurrentSession :: Scraper (Session)
getCurrentSession = do
scraper <- ST.get
return $ currentSession scraper
setCurrentSession :: Session -> Scraper ()
setCurrentSession s = do
scraper <- ST.get
ST.put $ scraper { currentSession = s}
setCurrentCursor :: Cursor -> Scraper ( )
setCurrentCursor c = do
scraper <- ST.get
ST.put $ scraper { currentCursor = Just c }
setCurrentHtml :: LBS.ByteString -> Scraper ()
setCurrentHtml html = do
scraper <- ST.get
ST.put $ scraper { currentHtml = html }
formShortInfo' f = formInfo'
where
go Nothing = "N/A"
go (Just x) = x
formInfo = (headMay . attribute "name" $ f, headMay . attribute "action" $ f)
formInfo' = (\(x,y) -> (go x, go y)) formInfo
ppTuple :: (T.Text, T.Text) -> T.Text
ppTuple = \(x,y) -> "[" <> x <> "]" <> ": " <> y
printFormNames :: Scraper ()
printFormNames = do
c <- getCurrentCursor
let c' = fromMaybe (error "No cursor set") c
forms = c' $// element "form"
formInfo = map (ppTuple . formShortInfo') forms
liftIO $ mapM_ (TIO.putStrLn) formInfo
get :: String -> Scraper (LBS.ByteString)
get urlStr = do
let url = fromMaybe (error ("invalid urlStr: " ++ urlStr)) (importURL urlStr)
urlStr' = exportURL url
opts <- ST.gets currentOptions
sesh <- ST.gets currentSession
r <- liftIO $ Sesh.getWith opts sesh urlStr'
let html = r ^. Wreq.responseBody
setCurrentURL (Just url)
setCurrentHtml html
setCurrentCursor (toCursor html)
return html
post :: Postable a => String -> a -> Scraper (LBS.ByteString)
post urlStr params = do
opts <- ST.gets currentOptions
sesh <- ST.gets currentSession
let url = fromMaybe (error ("invalid urlStr: " ++ urlStr)) (importURL urlStr)
absURL <- toAbsUrl url
let url' = exportURL absURL
r <- liftIO $ Sesh.postWith opts sesh url' params
let html = r ^. Wreq.responseBody
setCurrentHtml html
setCurrentCursor (toCursor html)
return html
toAbsUrl :: URL -> Scraper(URL)
toAbsUrl u@(URL (Absolute _) _ _) = return u
toAbsUrl u@(URL _ _ _) = do
hostUrl <- getCurrentURL
let hostUrl' = fromMaybe (error errMsg) hostUrl
absUrl = u { url_type = url_type hostUrl' }
return absUrl
where errMsg = "You must 'get' or 'post' to something before making urls absolute"
testToAbsUrl :: Scraper()
testToAbsUrl = do
setCurrentURL (importURL "http://www.google.com")
aUrl <- toAbsUrl (fromJust . importURL $ "blah.php")
liftIO . print . exportURL $ aUrl
hasDisplayNone el = fromMaybe False . fmap (== "display: none;") . headMay $ (attribute "style" el)
hasHide el = fromMaybe False . fmap (T.isInfixOf "hide") . headMay $ (attribute "class" el)
anyParentIsHidden el = isJust . listToMaybe . join $ map (\c -> (c $/ check (hasHide))) cs
where cs = ancestor el
isDisplayed :: Cursor -> Bool
isDisplayed el = all (== False) $
[ hasDisplayNone el
, hasHide el
, anyParentIsHidden el
]
getVisibleInputs :: Cursor -> M.Map T.Text T.Text
getVisibleInputs c = do
let inputs' = filter isDisplayed inputs
mayPairs = map (\e -> (listToMaybe $ attribute "name" e, listToMaybe $ attribute "value" e)) inputs'
pairs = map (fromMaybe "" *** fromMaybe "") mayPairs
M.fromList $ filter ((/= "") . fst) pairs
where inputs = c $// element "input"
data InpFilter a = Custom ([T.Text]) | AllVisible | AllInps deriving (Show)
getInputs :: InpFilter a -> Cursor -> M.Map T.Text T.Text
getInputs (Custom paramFilterList) c = do
let mayPairs = map (\e -> (listToMaybe $ attribute "name" e, listToMaybe $ attribute "value" e)) inputs
pairs = map (fromMaybe "" *** fromMaybe "") mayPairs
m = M.fromList $ filter ((/= "") . fst) pairs
M.filterWithKey (\k _ -> not . any (== k) $ paramFilterList) m
where inputs = c $// element "input"
getInputs AllVisible c = getVisibleInputs c
getInputs AllInps c = getAllInputs c
getAllInputs :: Cursor -> M.Map T.Text T.Text
getAllInputs c = do
let mayPairs = map (\e -> (listToMaybe $ attribute "name" e, listToMaybe $ attribute "value" e)) inputs
pairs = map (fromMaybe "" *** fromMaybe "") mayPairs
M.fromList $ filter ((/= "") . fst) pairs
where inputs = c $// element "input"
tgi = do
LBS.readFile "mismatchedinputkeyvalsform.html" >>= return . getAllInputs . toCursor
getLoginForm url = get url >>= return . getAllInputs . toCursor
toWreqFormParams :: [(T.Text, T.Text)] -> [FormParam]
toWreqFormParams params = map (\(k,v) -> k := v) (map (encodeUtf8 *** encodeUtf8) params)
linkWithText :: T.Text -> Cursor -> Maybe Cursor
linkWithText t cursor = listToMaybe $ filter (\c -> (any (T.isInfixOf t)) (c $/ content)) (cursor $// element "a")
addToMap pairs m = foldl (\m ->(\(k,v) -> M.insert k v m)) m pairs
getFormByName :: T.Text -> Scraper (Maybe Cursor)
getFormByName name = do
c <- getCurrentCursor
let c' = fromMaybe (error "Nocursor set") c
formList = c' $// element "form" >=> attributeIs "name" name
return . listToMaybe $ formList
data FormAttr = Name T.Text | ActionUrl T.Text | FormId T.Text deriving Show
getFormBy :: FormAttr -> Scraper (Maybe Cursor)
getFormBy formAttr = do
c <- getCurrentCursor
let c' = fromMaybe (error "Nocursor set") c
return . listToMaybe $ formList formAttr c'
where formList (Name val) c =
c $// element "form" >=> attributeIs "name" val
formList (ActionUrl val) c =
c $// element "form" >=> attributeIs "action" val
formList (FormId val) c =
c $// element "form" >=> attributeIs "id" val
fillForm :: Maybe Cursor -> Maybe [(T.Text, T.Text)] -> InpFilter a -> [FormParam]
fillForm form Nothing paramFilterList = do
let formParams = fromMaybe (error "no params in form") (getInputs paramFilterList <$> form)
toWreqFormParams . M.toList $ formParams
fillForm form (Just params) paramFilterList = do
let formParams = fromMaybe (error "no params in form") (getInputs paramFilterList <$> form)
formParams' = addToMap params formParams
toWreqFormParams . M.toList $ formParams'
postToForm :: FormAttr -> Maybe [(T.Text,T.Text)] -> InpFilter a -> Scraper (LBS.ByteString)
postToForm formAttr params paramFilterList = do
form <- getFormBy formAttr
c <- getCurrentCursor
case form of
Just _ -> do
return ()
Nothing -> do
error ("Couldn't find form: " ++ show formAttr)
let formParams = fillForm form params paramFilterList
mActionUrl = T.strip <$> (join $ listToMaybe <$> attribute "action" <$> form)
actionUrl = fromMaybe (error "Couldn't find action url in form") mActionUrl
html <- post (T.unpack actionUrl) formParams
return html