module Test.Mangrove.Html5Lib.Tokenizer
( tests
) where
import qualified Data.ByteString.Short as BS.S
import qualified Data.List as L
import qualified Data.Text as T
import qualified Test.HUnit as U
import Web.Mangrove.Parse.Common.Error
import Web.Mangrove.Parse.Tokenize
import Web.Willow.DOM
import Test.Mangrove.Html5Lib.Tokenizer.JSON
import Test.HUnit ( (~:), (@?=) )
tests :: IO U.Test
tests = U.TestLabel "tokenizer" . U.TestList <$> mapM runTestFile
[ "test1"
, "test2"
, "test3"
, "test4"
, "entities"
, "namedEntities"
, "numericEntities"
, "unicodeChars"
-- , "unicodeCharsProblematic" -- Seems to contradict the standard in allowing surrogates to be decoded
, "contentModelFlags"
, "domjs"
, "escapeFlag"
, "pendingSpecChanges"
-- , "xmlViolation"
]
runTestFile :: FilePath -> IO U.Test
runTestFile p = U.TestLabel (p ++ ".test") . U.TestList . map run <$> parseTestFile p
run :: JsonTest -> U.Test
run test = T.unpack (description test) ~: U.TestList $ map (runState test)
[ (show state, tokenizerMode state tokState)
| state <- initialStates test
]
where tokState = case lastStartTag test of
Just n -> tokenizerStartTag Nothing n $ defaultTokenizerState
Nothing -> defaultTokenizerState
runState :: JsonTest -> (String, TokenizerState) -> U.Test
runState test (mode, state) = mode ~: U.TestCase $ do
let (out, state') = tokenize state $ input test
final = finalizeTokenizer state'
out' = out ++ final
(L.sortOn show . concat $ map mapError out', trimEof $ map mapToken out') @?=
(map errorCode $ errors test, concat $ output test)
where normalizeToken (EndTag d) = EndTag $ emptyTagParams
{ tagName = tagName d
}
normalizeToken tok = tok
mapToken = normalizeToken . snd
normalizeError (CharacterReferenceOutsideUnicodeRange _) =
CharacterReferenceOutsideUnicodeRange 0
normalizeError (DuplicateAttribute _) =
DuplicateAttribute (T.empty, T.empty)
normalizeError (DuplicateSingletonElement _) =
DuplicateSingletonElement emptyElementParams
normalizeError (FramesetInBody _) =
FramesetInBody emptyElementParams
normalizeError (MalformedTableStructure _) =
MalformedTableStructure emptyElementParams
normalizeError (InvalidByteSequence _) =
InvalidByteSequence $ BS.S.pack []
normalizeError (NoncharacterCharacterReference _) =
NoncharacterCharacterReference '\NUL'
normalizeError (ObsoleteTagName _) =
ObsoleteTagName T.empty
normalizeError (SurrogateCharacterReference _) =
SurrogateCharacterReference '\NUL'
normalizeError (UnexpectedCharacterAfterDoctypeSystemIdentifier _) =
UnexpectedCharacterAfterDoctypeSystemIdentifier '\NUL'
normalizeError (UnexpectedDoctype _) =
UnexpectedDoctype emptyDocumentTypeParams
normalizeError (UnexpectedDescendantElement _) =
UnexpectedDescendantElement emptyElementParams
normalizeError (UnexpectedEndTag _) =
UnexpectedEndTag emptyElementParams
normalizeError (UnmatchedEndTag _) =
UnmatchedEndTag emptyElementParams
normalizeError err = err
mapError = map normalizeError . fst
trimEof ts = case reverse ts of
(EndOfStream:ts') -> reverse ts'
_ -> ts