{-# Language OverloadedStrings #-}
{-# Language ScopedTypeVariables #-}

--------------------------------------------------------------------------------
-- |
-- Module      :  Text.HTML.Scalpel.Search
-- Copyright   :  © 2020 Francesco Ariis
-- License     :  GPLv3 (see LICENSE file)
--
-- Maintainer  :  Francesco Ariis <fa-ml@ariis.it>
-- Stability   :  provisional
-- Portability :  portable
--
-- "Text.HTML.Scalpel.Core" scrapers for DuckDuckGo and Google search
-- engines.
--
-- Example:
--
-- > import Text.HTML.Scalpel.Search
-- > import Text.HTML.Scalpel          -- from package `scalpel`
-- >
-- > url = buildGoogleURL "Gugliemo Oberdan"
-- > sa = scrapeURL (show url) (googleScraper :: Scraper String SearchResult)
-- > main = sa >>= print
--
--------------------------------------------------------------------------------


module Text.HTML.Scalpel.Search
                ( -- * URL building
                  -- | Check 'U.URI' to modify standard queries.
                  Query,
                  buildDuckduckgoURL,
                  buildGoogleURL,
                  url2Text,

                  -- * Scrapers
                  SearchResult (..),
                  duckduckgoScraper,
                  googleScraper
                )
                where

import Text.HTML.Scalpel.Core

-- import qualified Data.Bifunctor as B
import qualified Data.Maybe as M
import qualified Data.Text as T
import qualified Text.StringLike as SL
import qualified Text.URI as U


-- from https://api.duckduckgo.com/api
-- | Build a DuckDuckGo URL from a 'Query' string.
buildDuckduckgoURL :: Query -> U.URI
buildDuckduckgoURL :: Query -> URI
buildDuckduckgoURL Query
q =
        URI
baseURI { uriRegName :: Maybe String
U.uriRegName = forall a. a -> Maybe a
Just String
"html.duckduckgo.com",
                  uriPath :: String
U.uriPath = String
"html",
                  uriQuery :: Maybe String
U.uriQuery = forall a. a -> Maybe a
Just String
query }
    where
          query :: String
query = [(String, String)] -> String
U.pairsToQuery [(String
"q", Query -> String
T.unpack Query
q)]

-- from https://api.duckduckgo.com/api
-- | Build a Googlr URL from a 'Query' string.
buildGoogleURL :: Query -> U.URI
buildGoogleURL :: Query -> URI
buildGoogleURL Query
q =
        URI
baseURI { uriRegName :: Maybe String
U.uriRegName = forall a. a -> Maybe a
Just String
"www.google.com",
                  uriPath :: String
U.uriPath = String
"search",
                  uriQuery :: Maybe String
U.uriQuery = forall a. a -> Maybe a
Just String
query }
    where
          query :: String
query = [(String, String)] -> String
U.pairsToQuery [(String
"q", Query -> String
T.unpack Query
q),
                                  (String
"no_redirect", String
"1"),
                                  (String
"skip_disambig", String
"1"),
                                  (String
"s", String
"0")]

-- | Convert 'U.URI' to 'T.Text'. If you prefer 'String', just use `show`.
url2Text :: U.URI -> T.Text
url2Text :: URI -> Query
url2Text URI
u = String -> Query
T.pack forall a b. (a -> b) -> a -> b
$ forall a. Show a => a -> String
show URI
u

-- | Scrape a Google search page for result titles and links.
googleScraper :: (Show str, SL.StringLike str, Monad m) =>
                 ScraperT str m SearchResult
googleScraper :: forall str (m :: * -> *).
(Show str, StringLike str, Monad m) =>
ScraperT str m SearchResult
googleScraper = forall str (m :: * -> *).
(StringLike str, Monad m) =>
(TagName, String)
-> (TagName, String)
-> ScraperT str m str
-> (URI -> Maybe URI)
-> ScraperT str m SearchResult
searchScraper (TagName
"div", String
"ZINbbc") (TagName
"h3", String
"zBAuLc")
                              (forall str (m :: * -> *).
(Show str, StringLike str, Monad m) =>
String -> Selector -> ScraperT str m str
attr String
"href" Selector
"a") URI -> Maybe URI
extractGoogleUrl

-- | Scrape a DuckDuckGo search page for result titles and links.
-- DuckDuckGo is not fond of bots downloading their pages and will display
-- an error if searches are too frequent (≳ 1 search per second).
duckduckgoScraper :: (SL.StringLike str, Monad m) =>
              ScraperT str m SearchResult
duckduckgoScraper :: forall str (m :: * -> *).
(StringLike str, Monad m) =>
ScraperT str m SearchResult
duckduckgoScraper =
            forall str (m :: * -> *).
(StringLike str, Monad m) =>
(TagName, String)
-> (TagName, String)
-> ScraperT str m str
-> (URI -> Maybe URI)
-> ScraperT str m SearchResult
searchScraper (TagName
"div", String
"web-result") (TagName
"h2", String
"result__title")
                          ScraperT str m str
us forall a. a -> Maybe a
Just
    where
          us :: ScraperT str m str
us = forall a b. (StringLike a, StringLike b) => a -> b
SL.castString forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall str (m :: * -> *).
(StringLike str, Monad m) =>
TagName -> String -> ScraperT str m Query
tagClassText TagName
"h2" String
"result__title"


-----------
-- TYPES --
-----------

type Query = T.Text

type Title = T.Text
newtype SearchResult = SearchResult { SearchResult -> [(Query, URI)]
unwrapSR :: [(Title, U.URI)] }
    deriving (Int -> SearchResult -> ShowS
[SearchResult] -> ShowS
SearchResult -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [SearchResult] -> ShowS
$cshowList :: [SearchResult] -> ShowS
show :: SearchResult -> String
$cshow :: SearchResult -> String
showsPrec :: Int -> SearchResult -> ShowS
$cshowsPrec :: Int -> SearchResult -> ShowS
Show, SearchResult -> SearchResult -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: SearchResult -> SearchResult -> Bool
$c/= :: SearchResult -> SearchResult -> Bool
== :: SearchResult -> SearchResult -> Bool
$c== :: SearchResult -> SearchResult -> Bool
Eq)


-----------------
-- ANCILLARIES --
-----------------

-- uri --

baseURI :: U.URI
baseURI :: URI
baseURI = U.URI {
        uriScheme :: Maybe String
U.uriScheme = forall a. a -> Maybe a
Just String
"https",
        uriUserInfo :: Maybe String
U.uriUserInfo = forall a. Maybe a
Nothing,
        uriRegName :: Maybe String
U.uriRegName = forall a. Maybe a
Nothing,
        uriPort :: Maybe Integer
U.uriPort = forall a. Maybe a
Nothing,
        uriPath :: String
U.uriPath = String
"",
        uriQuery :: Maybe String
U.uriQuery = forall a. Maybe a
Nothing,
        uriFragment :: Maybe String
U.uriFragment = forall a. Maybe a
Nothing }

-- scrape --

-- a generic scraper: tag/class of the single entry, tag/class of the title,
-- scraper/extractor for the URL
searchScraper :: (SL.StringLike str, Monad m) =>
                 (TagName, String) -> (TagName, String) ->
                 ScraperT str m str -> (U.URI -> Maybe U.URI) ->
                 ScraperT str m SearchResult
searchScraper :: forall str (m :: * -> *).
(StringLike str, Monad m) =>
(TagName, String)
-> (TagName, String)
-> ScraperT str m str
-> (URI -> Maybe URI)
-> ScraperT str m SearchResult
searchScraper (TagName
et, String
ec) (TagName
tt, String
tc) ScraperT str m str
ts URI -> Maybe URI
cf = do
            [(Query, Maybe URI)]
es <- forall str (m :: * -> *) a.
(StringLike str, Monad m) =>
Selector -> ScraperT str m a -> ScraperT str m [a]
chroots (TagName
et TagName -> [AttributePredicate] -> Selector
@: [String -> AttributePredicate
hasClass String
ec]) ScraperT str m (Query, Maybe URI)
entry
            let es' :: [(Query, URI)]
es' = forall a b. (a -> Maybe b) -> [a] -> [b]
M.mapMaybe forall {m :: * -> *} {a} {a} {b}.
(Monad m, StringLike a, StringLike a) =>
(a, m b) -> m (a, b)
filFun [(Query, Maybe URI)]
es
            forall (m :: * -> *) a. Monad m => a -> m a
return ([(Query, URI)] -> SearchResult
SearchResult [(Query, URI)]
es')
    where
          entry :: ScraperT str m (Query, Maybe URI)
entry = (,) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall str (m :: * -> *).
(StringLike str, Monad m) =>
TagName -> String -> ScraperT str m Query
tagClassText TagName
tt String
tc
                      forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> forall str (m :: * -> *).
(StringLike str, Monad m) =>
ScraperT str m str
-> (URI -> Maybe URI) -> ScraperT str m (Maybe URI)
url ScraperT str m str
ts URI -> Maybe URI
cf
          filFun :: (a, m b) -> m (a, b)
filFun (a
a, m b
mb) = m b
mb forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \b
b -> forall (m :: * -> *) a. Monad m => a -> m a
return (forall a b. (StringLike a, StringLike b) => a -> b
SL.castString a
a, b
b)

tagClassText :: (SL.StringLike str, Monad m) =>
                TagName -> String -> ScraperT str m T.Text
tagClassText :: forall str (m :: * -> *).
(StringLike str, Monad m) =>
TagName -> String -> ScraperT str m Query
tagClassText TagName
tt String
tc = forall a b. (StringLike a, StringLike b) => a -> b
SL.castString forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall str. StringLike str => str -> String
normString forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall str (m :: * -> *).
(StringLike str, Monad m) =>
Selector -> ScraperT str m str
text (TagName
tt TagName -> [AttributePredicate] -> Selector
@: [String -> AttributePredicate
hasClass String
tc])

-- text scraper, some URI extractor if needed
url :: (SL.StringLike str, Monad m) =>
       ScraperT str m str -> (U.URI -> Maybe U.URI) ->
       ScraperT str m (Maybe U.URI)
url :: forall str (m :: * -> *).
(StringLike str, Monad m) =>
ScraperT str m str
-> (URI -> Maybe URI) -> ScraperT str m (Maybe URI)
url ScraperT str m str
ts URI -> Maybe URI
cf = do
        String
surl <- forall str. StringLike str => str -> String
normString forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ScraperT str m str
ts
        let r :: Maybe URI
r = String -> Maybe URI
U.parseURI String
surl forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= URI -> Maybe URI
cf
        forall (m :: * -> *) a. Monad m => a -> m a
return Maybe URI
r

-- Nothing: not a link but a «search this instead», embedded, etc.
extractGoogleUrl :: U.URI -> Maybe U.URI
extractGoogleUrl :: URI -> Maybe URI
extractGoogleUrl URI
u
            | String
up forall a. Eq a => a -> a -> Bool
/= String
"/url" Bool -> Bool -> Bool
||
              forall a. Maybe a -> Bool
M.isNothing Maybe String
q   = forall a. Maybe a
Nothing
            | Just String
u2 <- Maybe String
q    = String -> Maybe URI
U.parseURI String
u2
            | Bool
otherwise       = forall a. Maybe a
Nothing
                -- `otherwise` to quiet GHC -Wall
    where
          up :: String
up = URI -> String
U.uriPath URI
u
          q :: Maybe String
q = (String -> [(String, String)]
U.queryToPairs forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> URI -> Maybe String
U.uriQuery URI
u) forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup String
"q"

normString :: SL.StringLike str => str -> String
normString :: forall str. StringLike str => str -> String
normString str
cs = [String] -> String
unwords forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> [String]
words forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (StringLike a, StringLike b) => a -> b
SL.castString forall a b. (a -> b) -> a -> b
$ str
cs