module Text.Html.Consolidate (
consolidate
,extract
,TArr
,consolidateArr
,extractJSArr
,initialConsState
,insertJSArr
,parseHTML
,renderHTML
) where
import Text.XML.HXT.Core hiding (swap)
import Text.XML.HXT.Arrow.XmlState.RunIOStateArrow
import Text.XML.HXT.Arrow.XmlArrow
import Text.XML.HXT.TagSoup
import Language.ECMAScript3.Syntax
import Language.ECMAScript3.Syntax.Annotations
import Language.ECMAScript3.PrettyPrint
import Language.ECMAScript3.Parser
import Data.List (isInfixOf)
import Data.Default.Class
import Network.HTTP
import Network.Browser (Cookie)
import Network.Browser.Simple
import Network.URI
import Network.HTTP.Encoding
import Data.ByteString.Lazy (ByteString)
import System.Random
import Data.Char
import Data.Maybe (isJust, fromJust, maybeToList)
import Control.Monad hiding (when)
data ConsState = ConsState Bool
(Maybe URI)
[Cookie]
[Statement ()]
initialConsState :: Bool
-> Maybe URI
-> [Cookie]
-> ConsState
initialConsState grace base cookies = ConsState grace base cookies []
type TArr a b = IOStateArrow ConsState a b
parseHTML :: String -> Maybe URI -> TArr a XmlTree
parseHTML s mbase_uri =
let config = map (withDefaultBaseURI . show) (maybeToList mbase_uri)
++[withParseHTML yes
,withTagSoup
,withValidate no
,withSubstDTDEntities no
,withSubstHTMLEntities yes
,withCanonicalize no
,withOutputHTML]
in readString config s
renderHTML :: ConsState -> TArr XmlTree XmlTree -> IO String
renderHTML ns a =
let state = initialState ns
in liftM head $ runXIOState state ((single a) >>> writeDocumentToString [withOutputHTML, withOutputEncoding utf8])
consolidate :: String -> Maybe URI -> IO String
consolidate s mbase_uri =
renderHTML (initialConsState True mbase_uri []) $
parseHTML s mbase_uri >>> consolidateArr
consolidateArr :: TArr XmlTree XmlTree
consolidateArr = extractJSArr >>> insertJSArr
extract :: String -> Maybe URI -> IO String
extract s mbase_uri =
let state = initialState $ initialConsState True mbase_uri [] in
do [(_, js)] <- runXIOState state $ single $
parseHTML s mbase_uri >>> extractJSArr
return js
extractJSArr :: TArr XmlTree (XmlTree, String)
extractJSArr =
((choiceA [isAJavaScript :-> ifA (hasAttr "src") extractExternalScript
extractInlineScript
,(isElem >>> hasOneOfNames src_tags >>> hasOneOfAttrs src_attrs) :-> extractURLProp
,(isElem >>> hasOneOfAttrs event_handlers) :-> extractEventHandler
,this :-> this
])
`processTopDownUntilAndWhenMatches` isFrame)
>>>returnScript
where returnScript = (returnA &&& getUserState)
>>> second (arr (\s -> let ConsState _ _ _ stmts = s
in show $ prettyPrint stmts))
isFrame = isElem >>> (hasName "frame" <+> hasName "iframe")
hasOneOfNames tagNames = (getName >>> isA (`elem` tagNames)) `guards` this
hasOneOfAttrs attrNames = (getAttrl >>> hasOneOfNames attrNames) `guards` this
src_tags = ["img", "a", "form", "frame", "iframe", "link"]
src_attrs = ["src", "href", "action"]
event_handlers = ["onabort", "onblur", "onclick", "oncompositionstart"
,"oncompositionupdate", "oncompositionend", "ondblclick"
,"onerror", "onfocus", "onfocusin", "onfocusout"
,"onkeydown", "onkeypress", "onkeyup", "onload"
,"onmousedown", "onmouseenter", "onmouseleave"
,"onmousemove", "onmouseout", "onmouseover"
,"onmouseup", "onreset", "onresize","onscroll"
,"onselect", "onsubmit", "ontextinput", "onunload"
, "onwheel"]
processTopDownUntilAndWhenMatches :: (ArrowTree a, Tree t)
=> a (t b) (t b)
-> a (t b) (t b)
-> a (t b) (t b)
processTopDownUntilAndWhenMatches t p =
t >>> (processChildren (processTopDownUntilAndWhenMatches t p) `whenNot` p)
insertJSArr :: TArr (XmlTree, String) XmlTree
insertJSArr = (swap ^<< (second scriptElement)) >>>
arr2A (\scr -> processTopDown $ changeChildren (++ [scr]) `when`
hasName "body")
extractInlineScript :: TArr XmlTree XmlTree
extractInlineScript =
(firstChild >>>
getText >>>
parseJS >>>
arr removeAnnotations >>>
appendScript >>>
cmt "Removed Inline Script")
extractExternalScript :: TArr XmlTree XmlTree
extractExternalScript =
((getAttrValue "src" >>>
(downloadArr >>>
parseJS >>>
arr removeAnnotations >>>
appendScript)
&&&
(arr ("Removed External Script: " ++) >>>
mkCmt)) >>>
arr snd)
downloadArr :: TArr String String
downloadArr =
(returnA &&& arr parseURIReference) >>>
arrIO (\(url, muri) ->
failIfNothing ("download: error parsing a URI: " ++ url) muri) >>>
consolidateURI >>>
(getUserState &&& returnA) >>>
arrIO (\(ConsState _ _ cookies _, uri) -> liftM fst (download uri cookies))
where failIfNothing :: String -> Maybe a -> IO a
failIfNothing message Nothing = fail message
failIfNothing _ (Just x) = return x
isURIRelative :: URI -> Bool
isURIRelative = null . uriScheme
consolidateURI :: TArr URI URI
consolidateURI =
(getUserState &&& returnA) >>>
arrIO (\(ConsState _ mbaseURI _ _, uri) ->
return $ if isURIRelative uri && isJust mbaseURI
then uri `relativeTo` fromJust mbaseURI
else uri)
extractURLProp :: TArr XmlTree XmlTree
extractURLProp =
addIdIfNotPresent >>>
(((selectAttrValues ["src", "href", "action"]
&&& selectId) >>>
arr (\((url, attrName), id) ->
Script () [ExprStmt () $ AssignExpr () OpAssign
(LDot () (CallExpr ()
(DotRef () (VarRef () (Id () "document")) (Id () "getElementById"))
[StringLit () id]) attrName) (StringLit () url)]) >>>
appendScript) &&&
removeAttributes ["src", "href", "action"]) >>>
arr snd
extractEventHandler :: TArr XmlTree XmlTree
extractEventHandler =
let attrNames = ["onabort", "onblur", "onclick", "oncompositionstart",
"oncompositionupdate", "oncompositionend", "ondblclick",
"onerror", "onfocus", "onfocusin", "onfocusout", "onkeydown",
"onkeypress", "onkeyup", "onload", "onmousedown",
"onmouseenter", "onmouseleave", "onmousemove", "onmouseout",
"onmouseover", "onmouseup", "onreset", "onresize","onscroll",
"onselect", "onsubmit", "ontextinput", "onunload", "onwheel"]
in addIdIfNotPresent >>>
(((selectAttrValues attrNames &&& selectId) >>>
arr (\((handler, attrName), id) ->
Script () [ExprStmt () $ AssignExpr () OpAssign
(LDot () (CallExpr ()
(DotRef () (VarRef () (Id () "document")) (Id () "getElementById"))
[StringLit () id]) attrName) (StringLit () handler)])
>>> appendScript) &&&
removeAttributes attrNames) >>>
arr snd
parseJS :: TArr String (JavaScript SourcePos)
parseJS = arr (parse program "") >>> eitherToFailure
arrowFail :: ArrowIO ar => String -> ar b c
arrowFail = arrIO . fail
isStrict :: TArr a Bool
isStrict = getUserState >>> arr (\s -> let ConsState strict _ _ _ = s in strict)
failIfStrict :: String -> TArr a a
failIfStrict msg =
proc a -> do is <- isStrict -< ()
if is then arrowFail msg -< () else returnA -< a
eitherToFailure :: (Show err, Default a) => TArr (Either err a) a
eitherToFailure = (isStrict &&& returnA) >>> arrIO f
where f (False, Left err) = return def
f (True, Left err) = fail $ show err
f (_ , Right x) = return x
maybeToFailure :: (Default a) => Maybe String -> TArr (Maybe a) a
maybeToFailure message = (isStrict &&& returnA) >>> arrIO f
where f (False, Nothing) = return def
f (True , Nothing) =
case message of
Nothing -> fail "Unexpected maybe in strict mode"
Just msg -> fail msg
f (_ , Just x) = return x
appendStatements :: TArr [Statement ()] ()
appendStatements = (getUserState &&& returnA) >>>
arr (\(state, addScript) ->
let ConsState grace baseURI cookies script = state in
ConsState grace baseURI cookies (script++addScript)) >>>
setUserState >>> arr (const ())
appendScript :: TArr (JavaScript ()) ()
appendScript = arr (\s -> let Script _ stmts = s in stmts) >>> appendStatements
scriptElement :: ArrowXml ar => ar String XmlTree
scriptElement = (mkElement (mkName "script") (sattr "type" "text/javascript")) $< arr txt
isAJavaScript :: ArrowXml ar => ar XmlTree XmlTree
isAJavaScript =
isElem >>> hasName "script" >>>
(((hasAttr "language" >>> hasAttrValue "language" (isInfixOf "javascript")) <+>
(hasAttr "type" >>> hasAttrValue "type" (isInfixOf "javascript"))) `orElse`
returnA)
firstChild :: (ArrowTree a, Tree t) => a (t b) (t b)
firstChild = single getChildren
lastChild :: (ArrowTree a, Tree t) => a (t b) (t b)
lastChild = getChildren >>. (take 1 . reverse)
html :: ArrowXml a => a XmlTree XmlTree
html = deep $ hasName "html"
body :: ArrowXml a => a XmlTree XmlTree
body = html /> hasName "body"
selectTags :: ArrowXml a => [String] -> a XmlTree XmlTree
selectTags = foldl (\arr tag -> arr <+> hasName tag) zeroArrow
selectAttrValues :: ArrowXml a => [String] -> a XmlTree (String, String)
selectAttrValues = foldl f zeroArrow
where f :: ArrowXml a =>
a XmlTree (String, String) -> String -> a XmlTree (String, String)
f a attr = a <+> (hasAttr attr >>>
(getAttrValue attr &&& arr (const attr)))
hasAnyAttrs :: ArrowXml a => [String] -> a XmlTree XmlTree
hasAnyAttrs = foldl f zeroArrow
where f :: ArrowXml a => a XmlTree XmlTree -> String -> a XmlTree XmlTree
f a attr = a <+> hasAttr attr
removeAttributes :: ArrowXml a => [String] -> a XmlTree XmlTree
removeAttributes = foldl f zeroArrow
where f :: ArrowXml a => a XmlTree XmlTree -> String -> a XmlTree XmlTree
f a attr = a >>> removeAttr attr
addIdIfNotPresent :: TArr XmlTree XmlTree
addIdIfNotPresent = proc node -> do
idval <- getAttrValue "id" -< node
if null idval
then replaceChildren repFun -< node
else returnA -< node
where repFun = replaceChildren (genIdA >>> mkText)
`when` (isAttr >>> hasName "id")
selectId :: TArr XmlTree String
selectId = isElem >>> getAttrValue "id"
genIdA :: ArrowIO ar => ar a String
genIdA = arrIO $ const genId
genId :: IO String
genId = do firstLetter <- genLetter
return [firstLetter]
length <- getStdRandom $ randomR (minIdLength1, maxIdLength1)
restId <- mapM (const genLetter) [1..length]
return $ firstLetter:restId
where minIdLength :: Int
minIdLength = 16
maxIdLength = 32
genLetter :: IO Char
genLetter = do letter <- getStdRandom $ randomR (capitalACode, capitalZCode)
lettercase <- getStdRandom $ randomR (0,1)
return $ chr $ letter + lettercase * (lowercaseACodecapitalACode)
where capitalACode = 65
capitalZCode = 90
lowercaseACode = 97
swap (a,b) = (b,a)