{-# LANGUAGE ApplicativeDo #-}
{-# LANGUAGE OverloadedStrings #-}

{-|
Description:    An algorithm for guessing character encoding from file contents.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable

In an ideal internet, every server would declare the binary encoding with which
it is transmitting a file (actually, the /true/ ideal would be for it to always
be 'Utf8', but there are still a lot of legacy documents out there).  However,
that's not always the case.

A good fallback would be for every document to declare itself what encoding it
has been saved in.  However, not every one does, and the ones that do may still
get it wrong (take, for instance, the case of a server which /does/ translate
everything it sends to 'Utf8').

And so, the [HTML standard](https://html.spec.whatwg.org/) describes an
algorithm for guessing the proper bytes-to-text translation to use in
'Web.Willow.Common.Encoding.decode'.  While this does therefore assume some
HTML syntax and specific tags, none of the semantics should cause an issue for
other filetypes.
-}
module Web.Willow.Common.Encoding.Sniffer
    ( -- * Types
      Encoding ( .. )
    , Confidence ( .. )
    , ReparseData ( .. )
    , emptyReparseData
      -- * The Algorithm
    , sniff
    , SnifferEnvironment ( .. )
    , emptySnifferEnvironment
    , sniffDecoderState
      -- ** Auxiliary
    , decoderConfidence
    , confidenceEncoding
    , extractEncoding
    ) where


import qualified Control.Applicative as A

import qualified Data.ByteString as BS
import qualified Data.Maybe as Y
import qualified Data.Text.Encoding as T
import qualified Data.Word as W

import Data.Functor ( ($>) )

import Web.Willow.Common.Encoding
import Web.Willow.Common.Encoding.Common
import Web.Willow.Common.Encoding.Labels
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Util

import qualified Web.Willow.Common.Encoding.Utf8 as Utf8
import qualified Web.Willow.Common.Encoding.Utf16 as Utf16


-- | A parser specialized for recovering a single potential encoding from a
-- binary stream.
type Sniffer = ParserT BS.ByteString Maybe


-- | Guess what encoding may be in use by the binary stream, and generate a
-- collection of data based on that which results in the behaviour described by
-- the decoding algorithm at the start of the stream.
sniffDecoderState :: SnifferEnvironment -> BS.ByteString -> DecoderState
sniffDecoderState :: SnifferEnvironment -> ByteString -> DecoderState
sniffDecoderState SnifferEnvironment
env ByteString
stream = (Encoding -> DecoderState
initialDecoderState (Encoding -> DecoderState) -> Encoding -> DecoderState
forall a b. (a -> b) -> a -> b
$ Confidence -> Encoding
confidenceEncoding Confidence
conf)
    { decoderConfidence_ :: Confidence
decoderConfidence_ = Confidence
conf
    }
  where conf :: Confidence
conf = SnifferEnvironment -> ByteString -> Confidence
sniff SnifferEnvironment
env ByteString
stream


-- | __HTML:__
--      @[encoding sniffing algorithm]
--      (https://html.spec.whatwg.org/multipage/parsing.html#encoding-sniffing-algorithm)@
-- 
-- Given a stream and related metadata, try to determine what encoding may have
-- been used to write it.
-- 
-- Will resolve and/or wait for the number of bytes requested by 'prescanDepth'
-- to be available in the stream (or, if it comes sooner, the end of the
-- stream), if they have not yet been produced.
sniff :: SnifferEnvironment -> BS.ByteString -> Confidence
sniff :: SnifferEnvironment -> ByteString -> Confidence
sniff SnifferEnvironment
opt ByteString
bs = Confidence
-> ((Confidence, ByteString) -> Confidence)
-> Maybe (Confidence, ByteString)
-> Confidence
forall b a. b -> (a -> b) -> Maybe a -> b
maybe Confidence
defaultSniff (Confidence, ByteString) -> Confidence
forall a b. (a, b) -> a
fst (Maybe (Confidence, ByteString) -> Confidence)
-> Maybe (Confidence, ByteString) -> Confidence
forall a b. (a -> b) -> a -> b
$ Parser ByteString Confidence
-> ByteString -> Maybe (Confidence, ByteString)
forall stream out.
Parser stream out -> stream -> Maybe (out, stream)
runParser (SnifferEnvironment -> Parser ByteString Confidence
sniff' SnifferEnvironment
opt) ByteString
bs

-- | __HTML:__
--      @[encoding sniffing algorithm]
--      (https://html.spec.whatwg.org/multipage/parsing.html#encoding-sniffing-algorithm)@
-- 
-- Dispatcher to fold the various options and parameters given by the
-- environment into a single output 'Encoding' for the stream, which may or may
-- not wind up being correct, but is still the best guess.
sniff' :: SnifferEnvironment -> Sniffer Confidence
sniff' :: SnifferEnvironment -> Parser ByteString Confidence
sniff' SnifferEnvironment
opt = [Parser ByteString Confidence] -> Parser ByteString Confidence
forall (m :: * -> *) a. Alternative m => [m a] -> m a
choice
    [ ParserT ByteString Maybe Encoding
-> ParserT ByteString Maybe Encoding
forall (m :: * -> *) stream token out.
MonadParser m stream token =>
m out -> m out
lookAhead ParserT ByteString Maybe Encoding
bom ParserT ByteString Maybe Encoding
-> (Encoding -> Parser ByteString Confidence)
-> Parser ByteString Confidence
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Encoding -> Confidence)
-> Encoding -> Parser ByteString Confidence
forall (f :: * -> *) t a. Applicative f => (t -> a) -> t -> f a
sniffAlways Encoding -> Confidence
Certain
    , (Encoding -> Confidence)
-> Maybe Encoding -> Parser ByteString Confidence
forall (f :: * -> *) t a.
Alternative f =>
(t -> a) -> Maybe t -> f a
sniffMaybe Encoding -> Confidence
Certain (Maybe Encoding -> Parser ByteString Confidence)
-> Maybe Encoding -> Parser ByteString Confidence
forall a b. (a -> b) -> a -> b
$ SnifferEnvironment -> Maybe Encoding
userOverride SnifferEnvironment
opt
    , (Encoding -> Confidence)
-> Maybe Encoding -> Parser ByteString Confidence
forall (f :: * -> *) t a.
Alternative f =>
(t -> a) -> Maybe t -> f a
sniffMaybe Encoding -> Confidence
Certain (Maybe Encoding -> Parser ByteString Confidence)
-> Maybe Encoding -> Parser ByteString Confidence
forall a b. (a -> b) -> a -> b
$ SnifferEnvironment -> Maybe Encoding
transportHeader SnifferEnvironment
opt
    , Word -> ParserT ByteString Maybe Encoding
prescan (SnifferEnvironment -> Word
prescanDepth SnifferEnvironment
opt) ParserT ByteString Maybe Encoding
-> (Encoding -> Parser ByteString Confidence)
-> Parser ByteString Confidence
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Encoding -> Confidence)
-> Encoding -> Parser ByteString Confidence
forall (f :: * -> *) t a. Applicative f => (t -> a) -> t -> f a
sniffAlways Encoding -> Confidence
tentative
    , (Encoding -> Confidence)
-> Maybe Encoding -> Parser ByteString Confidence
forall (f :: * -> *) t a.
Alternative f =>
(t -> a) -> Maybe t -> f a
sniffMaybe Encoding -> Confidence
tentative (Maybe Encoding -> Parser ByteString Confidence)
-> Maybe Encoding -> Parser ByteString Confidence
forall a b. (a -> b) -> a -> b
$ SnifferEnvironment -> Maybe Encoding
parentEncoding SnifferEnvironment
opt
    -- Try any implementation-defined autodetection ('Tentative').
    , (Encoding -> Confidence)
-> Maybe Encoding -> Parser ByteString Confidence
forall (f :: * -> *) t a.
Alternative f =>
(t -> a) -> Maybe t -> f a
sniffMaybe Encoding -> Confidence
tentative (Maybe Encoding -> Parser ByteString Confidence)
-> Maybe Encoding -> Parser ByteString Confidence
forall a b. (a -> b) -> a -> b
$ SnifferEnvironment -> Maybe Encoding
cachedInfo SnifferEnvironment
opt
    , (Encoding -> Confidence)
-> Maybe Encoding -> Parser ByteString Confidence
forall (f :: * -> *) t a.
Alternative f =>
(t -> a) -> Maybe t -> f a
sniffMaybe Encoding -> Confidence
tentative (Maybe Encoding -> Parser ByteString Confidence)
-> Maybe Encoding -> Parser ByteString Confidence
forall a b. (a -> b) -> a -> b
$ SnifferEnvironment -> Maybe Encoding
userDefault SnifferEnvironment
opt
    , (Encoding -> Confidence)
-> Maybe Encoding -> Parser ByteString Confidence
forall (f :: * -> *) t a.
Alternative f =>
(t -> a) -> Maybe t -> f a
sniffMaybe Encoding -> Confidence
tentative (Maybe Encoding -> Parser ByteString Confidence)
-> Maybe Encoding -> Parser ByteString Confidence
forall a b. (a -> b) -> a -> b
$ SnifferEnvironment -> Maybe Encoding
localeEncoding SnifferEnvironment
opt
    ]
  where sniffMaybe :: (t -> a) -> Maybe t -> f a
sniffMaybe t -> a
conf (Just t
enc) = a -> f a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (a -> f a) -> a -> f a
forall a b. (a -> b) -> a -> b
$ t -> a
conf t
enc
        sniffMaybe t -> a
_ Maybe t
Nothing = f a
forall (f :: * -> *) a. Alternative f => f a
A.empty
        sniffAlways :: (t -> a) -> t -> f a
sniffAlways t -> a
conf t
enc = a -> f a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (a -> f a) -> a -> f a
forall a b. (a -> b) -> a -> b
$ t -> a
conf t
enc
        tentative :: Encoding -> Confidence
tentative = (Encoding -> ReparseData -> Confidence)
-> ReparseData -> Encoding -> Confidence
forall a b c. (a -> b -> c) -> b -> a -> c
flip Encoding -> ReparseData -> Confidence
Tentative ReparseData
emptyReparseData
        bom :: ParserT ByteString Maybe Encoding
bom = [ParserT ByteString Maybe Encoding]
-> ParserT ByteString Maybe Encoding
forall (m :: * -> *) a. Alternative m => [m a] -> m a
choice
            [ ParserT ByteString Maybe Encoding
forall (gather :: * -> *).
(Alternative gather, Monad gather) =>
ParserT ByteString gather Encoding
Utf8.byteOrderMark
            , ParserT ByteString Maybe Encoding
forall (gather :: * -> *).
(Alternative gather, Monad gather) =>
ParserT ByteString gather Encoding
Utf16.byteOrderMarkBigEndian
            , ParserT ByteString Maybe Encoding
forall (gather :: * -> *).
(Alternative gather, Monad gather) =>
ParserT ByteString gather Encoding
Utf16.byteOrderMarkLittleEndian
            ]

-- | The fallback 'Encoding' to guess when nothing better is available, as
-- determined by the body of pre-existing content.  If nothing else, this is a
-- single-byte encoding with minimal control characters, so can generally do a
-- half-decent job of representing the underlying binary structure.
defaultSniff :: Confidence
defaultSniff :: Confidence
defaultSniff = Encoding -> ReparseData -> Confidence
Tentative Encoding
Windows1252 ReparseData
emptyReparseData


-- | Various datapoints which may indicate a document's binary encoding, to be
-- fed into the 'sniff' algorithm.  Values may be easily instantiated as
-- updates to 'emptySnifferEnvironment'.
data SnifferEnvironment = SnifferEnvironment
    { SnifferEnvironment -> Maybe Encoding
userOverride :: Maybe Encoding
        -- ^ The encoding the end user has specified should be used.  Note that
        -- even this can still be overridden by the presence of a byte-order
        -- mark at the head of the stream.
    , SnifferEnvironment -> Maybe Encoding
transportHeader :: Maybe Encoding
        -- ^ The encoding given by the transport layer (e.g. through an HTTP
        -- @Content-Type@ header).
    , SnifferEnvironment -> Word
prescanDepth :: Word
        -- ^ The number of bytes which should be skimmed for @<meta>@
        -- attributes specifying an encoding.
    , SnifferEnvironment -> Maybe Encoding
parentEncoding :: Maybe Encoding
        -- ^ The encoding used for the enclosing document (e.g., if this
        -- document is loaded via an @\<iframe\>@).
    , SnifferEnvironment -> Maybe Encoding
cachedInfo :: Maybe Encoding
        -- ^ The encoding from the last time this page was loaded, other pages
        -- on the site, or other cached data.
    , SnifferEnvironment -> Maybe Encoding
userDefault :: Maybe Encoding
        -- ^ The encoding the end user has specified as being their preferred
        -- default, if no better encoding can be determined.
    , SnifferEnvironment -> Maybe Encoding
localeEncoding :: Maybe Encoding
        -- ^ The encoding recommended as a reasonable guess based on the
        -- current language of the user's system.
    }
  deriving ( SnifferEnvironment -> SnifferEnvironment -> Bool
(SnifferEnvironment -> SnifferEnvironment -> Bool)
-> (SnifferEnvironment -> SnifferEnvironment -> Bool)
-> Eq SnifferEnvironment
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: SnifferEnvironment -> SnifferEnvironment -> Bool
$c/= :: SnifferEnvironment -> SnifferEnvironment -> Bool
== :: SnifferEnvironment -> SnifferEnvironment -> Bool
$c== :: SnifferEnvironment -> SnifferEnvironment -> Bool
Eq, Int -> SnifferEnvironment -> ShowS
[SnifferEnvironment] -> ShowS
SnifferEnvironment -> String
(Int -> SnifferEnvironment -> ShowS)
-> (SnifferEnvironment -> String)
-> ([SnifferEnvironment] -> ShowS)
-> Show SnifferEnvironment
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [SnifferEnvironment] -> ShowS
$cshowList :: [SnifferEnvironment] -> ShowS
show :: SnifferEnvironment -> String
$cshow :: SnifferEnvironment -> String
showsPrec :: Int -> SnifferEnvironment -> ShowS
$cshowsPrec :: Int -> SnifferEnvironment -> ShowS
Show, ReadPrec [SnifferEnvironment]
ReadPrec SnifferEnvironment
Int -> ReadS SnifferEnvironment
ReadS [SnifferEnvironment]
(Int -> ReadS SnifferEnvironment)
-> ReadS [SnifferEnvironment]
-> ReadPrec SnifferEnvironment
-> ReadPrec [SnifferEnvironment]
-> Read SnifferEnvironment
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [SnifferEnvironment]
$creadListPrec :: ReadPrec [SnifferEnvironment]
readPrec :: ReadPrec SnifferEnvironment
$creadPrec :: ReadPrec SnifferEnvironment
readList :: ReadS [SnifferEnvironment]
$creadList :: ReadS [SnifferEnvironment]
readsPrec :: Int -> ReadS SnifferEnvironment
$creadsPrec :: Int -> ReadS SnifferEnvironment
Read )
{-# WARNING localeEncoding "The type of this argument will be changed in a future release" #-}

-- | A neutral set of parameters to pass to the 'sniff' algorithm: no accessory
-- data, and a 'prescanDepth' limit of 1024 bytes.
emptySnifferEnvironment :: SnifferEnvironment
emptySnifferEnvironment :: SnifferEnvironment
emptySnifferEnvironment = SnifferEnvironment :: Maybe Encoding
-> Maybe Encoding
-> Word
-> Maybe Encoding
-> Maybe Encoding
-> Maybe Encoding
-> Maybe Encoding
-> SnifferEnvironment
SnifferEnvironment
    { userOverride :: Maybe Encoding
userOverride = Maybe Encoding
forall a. Maybe a
Nothing
    , transportHeader :: Maybe Encoding
transportHeader = Maybe Encoding
forall a. Maybe a
Nothing
    , prescanDepth :: Word
prescanDepth = Word
1024
    , parentEncoding :: Maybe Encoding
parentEncoding = Maybe Encoding
forall a. Maybe a
Nothing
    , cachedInfo :: Maybe Encoding
cachedInfo = Maybe Encoding
forall a. Maybe a
Nothing
    , userDefault :: Maybe Encoding
userDefault = Maybe Encoding
forall a. Maybe a
Nothing
    , localeEncoding :: Maybe Encoding
localeEncoding = Maybe Encoding
forall a. Maybe a
Nothing
    }


-- | __HTML:__
--      @[prescan a byte stream to determine its encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@
--      with an explicit length.
-- 
-- Guess what encoding a stream may have been written in based on suspected
-- byte sequences in its content; in practice, whether the document has any
-- @<meta>@ tags which specify one.
-- 
-- Fails if no encoding can be guessed and never consumes any input, but will
-- resolve and/or wait for @maxDepth@ bytes to be available in the stream (or,
-- if it comes sooner, the end of the stream), if they have not yet been
-- produced.
prescan :: Word -> Sniffer Encoding
prescan :: Word -> ParserT ByteString Maybe Encoding
prescan Word
maxDepth = do
    ByteString
toScan <- ParserT ByteString Maybe ByteString
-> ParserT ByteString Maybe ByteString
forall (m :: * -> *) stream token out.
MonadParser m stream token =>
m out -> m out
lookAhead (ParserT ByteString Maybe ByteString
 -> ParserT ByteString Maybe ByteString)
-> ParserT ByteString Maybe ByteString
-> ParserT ByteString Maybe ByteString
forall a b. (a -> b) -> a -> b
$ Word -> ParserT ByteString Maybe ByteString
forall (m :: * -> *) stream token.
MonadParser m stream token =>
Word -> m stream
nextChunk Word
maxDepth
    ParserT ByteString Maybe Encoding
-> ((Encoding, ByteString) -> ParserT ByteString Maybe Encoding)
-> Maybe (Encoding, ByteString)
-> ParserT ByteString Maybe Encoding
forall b a. b -> (a -> b) -> Maybe a -> b
maybe ParserT ByteString Maybe Encoding
forall (f :: * -> *) a. Alternative f => f a
A.empty (Encoding -> ParserT ByteString Maybe Encoding
forall (m :: * -> *) a. Monad m => a -> m a
return (Encoding -> ParserT ByteString Maybe Encoding)
-> ((Encoding, ByteString) -> Encoding)
-> (Encoding, ByteString)
-> ParserT ByteString Maybe Encoding
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Encoding, ByteString) -> Encoding
forall a b. (a, b) -> a
fst) (Maybe (Encoding, ByteString) -> ParserT ByteString Maybe Encoding)
-> Maybe (Encoding, ByteString)
-> ParserT ByteString Maybe Encoding
forall a b. (a -> b) -> a -> b
$ ParserT ByteString Maybe Encoding
-> ByteString -> Maybe (Encoding, ByteString)
forall stream out.
Parser stream out -> stream -> Maybe (out, stream)
runParser ParserT ByteString Maybe Encoding
prescan' ByteString
toScan

-- | __HTML:__
--      @[prescan a byte stream to determine its encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@
-- 
-- The main loop of the @prescan@ algorithm, dispatching the various methods
-- of parsing input to guess its encoding based on content.
-- 
-- Note that this consumes input even if it fails, and will attempt to scan the
-- entire input stream if not fed a smaller section.
prescan' :: Sniffer Encoding
prescan' :: ParserT ByteString Maybe Encoding
prescan' = do
    [()]
_ <- ParserT ByteString Maybe () -> ParserT ByteString Maybe [()]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.many ParserT ByteString Maybe ()
prescanSkip
    Maybe Encoding
enc' <- Sniffer (Maybe Encoding)
prescanMeta
    case Maybe Encoding
enc' of
        Just Encoding
enc -> Encoding -> ParserT ByteString Maybe Encoding
forall (m :: * -> *) a. Monad m => a -> m a
return Encoding
enc
        Maybe Encoding
Nothing -> ParserT ByteString Maybe Encoding
prescan'
  where prescanMarkup :: ParserT ByteString Maybe ()
prescanMarkup = do
            Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token (Word8 -> ParserT ByteString Maybe Word8)
-> Word8 -> ParserT ByteString Maybe Word8
forall a b. (a -> b) -> a -> b
$ Char -> Word8
toByte Char
'<'
            ParserT ByteString Maybe Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next ParserT ByteString Maybe Word8
-> (Word8 -> ParserT ByteString Maybe Word8)
-> ParserT ByteString Maybe Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool) -> Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (Word8 -> ByteString -> Bool
`BS.elem` ByteString
"!/?")
            ParserT ByteString Maybe Word8 -> ParserT ByteString Maybe [Word8]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.many (ParserT ByteString Maybe Word8
 -> ParserT ByteString Maybe [Word8])
-> ParserT ByteString Maybe Word8
-> ParserT ByteString Maybe [Word8]
forall a b. (a -> b) -> a -> b
$ ParserT ByteString Maybe Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next ParserT ByteString Maybe Word8
-> (Word8 -> ParserT ByteString Maybe Word8)
-> ParserT ByteString Maybe Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool) -> Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Char -> Word8
toByte Char
'>')
            pure ()
        prescanUnmatched :: ParserT ByteString Maybe ()
prescanUnmatched = do
            Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token (Word8 -> ParserT ByteString Maybe Word8)
-> Word8 -> ParserT ByteString Maybe Word8
forall a b. (a -> b) -> a -> b
$ Char -> Word8
toByte Char
'<'
            -- Need to check for this as 'prescanMeta' is not run yet.
            ParserT ByteString Maybe () -> ParserT ByteString Maybe ()
forall (m :: * -> *) stream token out.
MonadParser m stream token =>
m out -> m ()
avoiding ParserT ByteString Maybe ()
metaName
            pure ()
        prescanSkip :: ParserT ByteString Maybe ()
prescanSkip = [ParserT ByteString Maybe ()] -> ParserT ByteString Maybe ()
forall (m :: * -> *) a. Alternative m => [m a] -> m a
choice
            [ ParserT ByteString Maybe ()
prescanComment
            , ParserT ByteString Maybe ()
prescanTag
            , ParserT ByteString Maybe ()
prescanMarkup
            , ParserT ByteString Maybe ()
prescanUnmatched
            , ParserT ByteString Maybe Word8 -> ParserT ByteString Maybe [Word8]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.some (ParserT ByteString Maybe Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next ParserT ByteString Maybe Word8
-> (Word8 -> ParserT ByteString Maybe Word8)
-> ParserT ByteString Maybe Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool) -> Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Char -> Word8
toByte Char
'<')) ParserT ByteString Maybe [Word8]
-> () -> ParserT ByteString Maybe ()
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> ()
            ]


-- | __HTML:__
--      @[prescan a byte stream to determine its encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@
--      step 2, case 1
-- 
-- Consume an HTML comment @<!-- (...) -->@.
-- 
-- Fails if the stream doesn't start with an ASCII-encoded @<!--@.  Doesn't
-- fail at the end of the stream, so be sure the calling scanner does.
prescanComment :: Sniffer ()
prescanComment :: ParserT ByteString Maybe ()
prescanComment = do
    ByteString -> ParserT ByteString Maybe ByteString
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq stream) =>
stream -> trans stream
chunk ByteString
"<!--"
    ParserT ByteString Maybe Word8 -> ParserT ByteString Maybe [Word8]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.many (ParserT ByteString Maybe Word8
 -> ParserT ByteString Maybe [Word8])
-> (Word8 -> ParserT ByteString Maybe Word8)
-> Word8
-> ParserT ByteString Maybe [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token (Word8 -> ParserT ByteString Maybe [Word8])
-> Word8 -> ParserT ByteString Maybe [Word8]
forall a b. (a -> b) -> a -> b
$ Char -> Word8
toByte Char
'-'
    [ParserT ByteString Maybe ()] -> ParserT ByteString Maybe ()
forall (m :: * -> *) a. Alternative m => [m a] -> m a
choice
        [ Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token (Char -> Word8
toByte Char
'>') ParserT ByteString Maybe Word8 -> () -> ParserT ByteString Maybe ()
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> ()
        , ParserT ByteString Maybe ()
prescanCommentEnd
        ]

-- | __HTML:__
--      @[prescan a byte stream to determine its encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@
--      step 2, case 1
-- 
-- Consume all bytes until an end @-->@, or the end of the stream.
-- 
-- Never fails, even if at the end of the stream.
prescanCommentEnd :: Sniffer ()
prescanCommentEnd :: ParserT ByteString Maybe ()
prescanCommentEnd = ParserT ByteString Maybe Word8 -> ParserT ByteString Maybe [Word8]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.many (ParserT ByteString Maybe Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next ParserT ByteString Maybe Word8
-> (Word8 -> ParserT ByteString Maybe Word8)
-> ParserT ByteString Maybe Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool) -> Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Char -> Word8
toByte Char
'-')) ParserT ByteString Maybe [Word8]
-> ParserT ByteString Maybe () -> ParserT ByteString Maybe ()
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> [ParserT ByteString Maybe ()] -> ParserT ByteString Maybe ()
forall (m :: * -> *) a. Alternative m => [m a] -> m a
choice
    [ ParserT ByteString Maybe ()
endToken
    , ParserT ByteString Maybe Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next ParserT ByteString Maybe Word8
-> ParserT ByteString Maybe () -> ParserT ByteString Maybe ()
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> ParserT ByteString Maybe ()
prescanCommentEnd
    , ParserT ByteString Maybe ()
forall (trans :: * -> *) stream token.
MonadParser trans stream token =>
trans ()
end
    ]
  where endToken :: ParserT ByteString Maybe ()
endToken = do
            ByteString -> ParserT ByteString Maybe ByteString
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq stream) =>
stream -> trans stream
chunk ByteString
"--"
            ParserT ByteString Maybe Word8 -> ParserT ByteString Maybe [Word8]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.many (ParserT ByteString Maybe Word8
 -> ParserT ByteString Maybe [Word8])
-> (Word8 -> ParserT ByteString Maybe Word8)
-> Word8
-> ParserT ByteString Maybe [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token (Word8 -> ParserT ByteString Maybe [Word8])
-> Word8 -> ParserT ByteString Maybe [Word8]
forall a b. (a -> b) -> a -> b
$ Char -> Word8
toByte Char
'-'
            Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token (Word8 -> ParserT ByteString Maybe Word8)
-> Word8 -> ParserT ByteString Maybe Word8
forall a b. (a -> b) -> a -> b
$ Char -> Word8
toByte Char
'>'
            pure ()


-- | The unambiguous start of a @<meta>@ tag.
-- 
-- Fails if the stream doesn't start with an ASCII-encoded @<meta@, case
-- insensitive, followed by either a @/@ or whitespace.
metaName :: Sniffer ()
metaName :: ParserT ByteString Maybe ()
metaName = (Word -> ParserT ByteString Maybe ByteString
forall (m :: * -> *) stream token.
MonadParser m stream token =>
Word -> m stream
nextChunk Word
4 ParserT ByteString Maybe ByteString
-> (ByteString -> ParserT ByteString Maybe ByteString)
-> ParserT ByteString Maybe ByteString
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (ByteString -> Bool)
-> ByteString -> ParserT ByteString Maybe ByteString
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
(==) ByteString
"meta" (ByteString -> Bool)
-> (ByteString -> ByteString) -> ByteString -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> Word8) -> ByteString -> ByteString
BS.map Word8 -> Word8
toAsciiLowerB)) ParserT ByteString Maybe ByteString
-> ParserT ByteString Maybe Word8 -> ParserT ByteString Maybe Word8
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*>
    (ParserT ByteString Maybe Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next ParserT ByteString Maybe Word8
-> (Word8 -> ParserT ByteString Maybe Word8)
-> ParserT ByteString Maybe Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool) -> Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (Word8 -> [Word8] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` Char -> Word8
toByte Char
'/' Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: [Word8]
asciiWhitespaceB)) ParserT ByteString Maybe Word8 -> () -> ParserT ByteString Maybe ()
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> ()

-- | Consume any number of HTML-defined ASCII whitespace (does /not/ include
-- 0x11 VT).
-- 
-- Never fails, but may return 'BS.empty'.
metaWhitespace :: Sniffer BS.ByteString
metaWhitespace :: ParserT ByteString Maybe ByteString
metaWhitespace = (Maybe ByteString -> ByteString)
-> ParserT ByteString Maybe (Maybe ByteString)
-> ParserT ByteString Maybe ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (ByteString -> Maybe ByteString -> ByteString
forall a. a -> Maybe a -> a
Y.fromMaybe ByteString
BS.empty) (ParserT ByteString Maybe (Maybe ByteString)
 -> ParserT ByteString Maybe ByteString)
-> (ParserT ByteString Maybe Word8
    -> ParserT ByteString Maybe (Maybe ByteString))
-> ParserT ByteString Maybe Word8
-> ParserT ByteString Maybe ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ParserT ByteString Maybe ByteString
-> ParserT ByteString Maybe (Maybe ByteString)
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
A.optional (ParserT ByteString Maybe ByteString
 -> ParserT ByteString Maybe (Maybe ByteString))
-> (ParserT ByteString Maybe Word8
    -> ParserT ByteString Maybe ByteString)
-> ParserT ByteString Maybe Word8
-> ParserT ByteString Maybe (Maybe ByteString)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([Word8] -> ByteString)
-> ParserT ByteString Maybe [Word8]
-> ParserT ByteString Maybe ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap [Word8] -> ByteString
BS.pack (ParserT ByteString Maybe [Word8]
 -> ParserT ByteString Maybe ByteString)
-> (ParserT ByteString Maybe Word8
    -> ParserT ByteString Maybe [Word8])
-> ParserT ByteString Maybe Word8
-> ParserT ByteString Maybe ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ParserT ByteString Maybe Word8 -> ParserT ByteString Maybe [Word8]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.some (ParserT ByteString Maybe Word8
 -> ParserT ByteString Maybe ByteString)
-> ParserT ByteString Maybe Word8
-> ParserT ByteString Maybe ByteString
forall a b. (a -> b) -> a -> b
$
    ParserT ByteString Maybe Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next ParserT ByteString Maybe Word8
-> (Word8 -> ParserT ByteString Maybe Word8)
-> ParserT ByteString Maybe Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool) -> Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (Word8 -> [Word8] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Word8]
asciiWhitespaceB)


-- | __HTML:__
--      @[prescan a byte stream to determine its encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@
--      step 2, case 2
-- 
-- Consume an HTML meta declaration @<meta (...)>@, and return any likely
-- encoding it may name.
-- 
-- Fails if the stream doesn't start with an ASCII-encoded @<meta @ or
-- @<meta/@, case insensitive.  Returns 'Nothing' if the end of the tag is
-- reached without properly providing an encoding.
prescanMeta :: Sniffer (Maybe Encoding)
prescanMeta :: Sniffer (Maybe Encoding)
prescanMeta = do
    Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token (Word8 -> ParserT ByteString Maybe Word8)
-> Word8 -> ParserT ByteString Maybe Word8
forall a b. (a -> b) -> a -> b
$ Char -> Word8
toByte Char
'<'
    ParserT ByteString Maybe ()
metaName
    [(ByteString, ByteString)]
attrs <- ParserT ByteString Maybe (ByteString, ByteString)
-> ParserT ByteString Maybe [(ByteString, ByteString)]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.many ParserT ByteString Maybe (ByteString, ByteString)
prescanAttribute
    ParserT ByteString Maybe () -> ParserT ByteString Maybe ()
forall (m :: * -> *) stream token out.
MonadParser m stream token =>
m out -> m ()
avoiding ParserT ByteString Maybe ()
forall (trans :: * -> *) stream token.
MonadParser trans stream token =>
trans ()
end
    return $ case (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
-> (Maybe Bool, Bool, Maybe Encoding)
finalizeEncoding ((Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
 -> (Maybe Bool, Bool, Maybe Encoding))
-> (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
-> (Maybe Bool, Bool, Maybe Encoding)
forall a b. (a -> b) -> a -> b
$ ((ByteString, ByteString)
 -> (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
 -> (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding)))
-> (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
-> [(ByteString, ByteString)]
-> (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (ByteString, ByteString)
-> (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
-> (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
forall a.
(Eq a, IsString a) =>
(a, ByteString)
-> (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
-> (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
gatherAttrs (Maybe Bool
forall a. Maybe a
Nothing, Bool
False, Maybe Encoding -> Either (Maybe Encoding) (Maybe Encoding)
forall a b. a -> Either a b
Left Maybe Encoding
forall a. Maybe a
Nothing) [(ByteString, ByteString)]
attrs of
        (Maybe Bool
Nothing, Bool
_, Maybe Encoding
_) -> Maybe Encoding
forall a. Maybe a
Nothing
        (Just Bool
True, Bool
False, Maybe Encoding
_) -> Maybe Encoding
forall a. Maybe a
Nothing
        (Maybe Bool
_, Bool
_, Maybe Encoding
Nothing) -> Maybe Encoding
forall a. Maybe a
Nothing
        (Maybe Bool
_, Bool
_, Just Encoding
Utf16be) -> Encoding -> Maybe Encoding
forall (f :: * -> *) a. Applicative f => a -> f a
pure Encoding
Utf8
        (Maybe Bool
_, Bool
_, Just Encoding
Utf16le) -> Encoding -> Maybe Encoding
forall (f :: * -> *) a. Applicative f => a -> f a
pure Encoding
Utf8
        (Maybe Bool
_, Bool
_, Just Encoding
UserDefined) -> Encoding -> Maybe Encoding
forall (f :: * -> *) a. Applicative f => a -> f a
pure Encoding
Windows1252
        (Maybe Bool
_, Bool
_, Just Encoding
enc) -> Encoding -> Maybe Encoding
forall (f :: * -> *) a. Applicative f => a -> f a
pure Encoding
enc
  where gatherAttrs :: (a, ByteString)
-> (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
-> (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
gatherAttrs (a
"http-equiv", ByteString
"content-type") (Maybe Bool
need, Bool
_, Either (Maybe Encoding) (Maybe Encoding)
charset) =
            (Maybe Bool
need, Bool
True, Either (Maybe Encoding) (Maybe Encoding)
charset)
        -- Reset the "http-equiv" if an earlier attribute shadows the
        -- "=content-type" (folding right-to-left; nothing else sets this).
        gatherAttrs (a
"http-equiv", ByteString
_) (Maybe Bool
need, Bool
_, Either (Maybe Encoding) (Maybe Encoding)
charset) =
            (Maybe Bool
need, Bool
False, Either (Maybe Encoding) (Maybe Encoding)
charset)
        -- "content" is a weaker version of "charset" and so gets overwritten
        -- if the latter appears later and ignored if it appears earlier
        gatherAttrs (a
"content", ByteString
v) (Maybe Bool
need, Bool
got, Left Maybe Encoding
_) =
            case ByteString -> Maybe Encoding
extractEncoding ByteString
v of
                Just Encoding
enc -> (Bool -> Maybe Bool
forall a. a -> Maybe a
Just (Bool -> Maybe Bool) -> Bool -> Maybe Bool
forall a b. (a -> b) -> a -> b
$ Bool -> Maybe Bool -> Bool
forall a. a -> Maybe a -> a
Y.fromMaybe Bool
True Maybe Bool
need, Bool
got, Maybe Encoding -> Either (Maybe Encoding) (Maybe Encoding)
forall a b. a -> Either a b
Left (Maybe Encoding -> Either (Maybe Encoding) (Maybe Encoding))
-> Maybe Encoding -> Either (Maybe Encoding) (Maybe Encoding)
forall a b. (a -> b) -> a -> b
$ Encoding -> Maybe Encoding
forall a. a -> Maybe a
Just Encoding
enc)
                Maybe Encoding
Nothing -> (Maybe Bool
need, Bool
got, Maybe Encoding -> Either (Maybe Encoding) (Maybe Encoding)
forall a b. a -> Either a b
Left Maybe Encoding
forall a. Maybe a
Nothing)
        gatherAttrs (a
"charset", ByteString
v) (Maybe Bool
_, Bool
got, Either (Maybe Encoding) (Maybe Encoding)
_) =
            (Bool -> Maybe Bool
forall a. a -> Maybe a
Just Bool
False, Bool
got, Maybe Encoding -> Either (Maybe Encoding) (Maybe Encoding)
forall a b. b -> Either a b
Right (Maybe Encoding -> Either (Maybe Encoding) (Maybe Encoding))
-> (Text -> Maybe Encoding)
-> Text
-> Either (Maybe Encoding) (Maybe Encoding)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Maybe Encoding
lookupEncoding (Text -> Either (Maybe Encoding) (Maybe Encoding))
-> Text -> Either (Maybe Encoding) (Maybe Encoding)
forall a b. (a -> b) -> a -> b
$ ByteString -> Text
T.decodeLatin1 ByteString
v)
        gatherAttrs (a, ByteString)
_ (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
attrs = (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
attrs
        finalizeEncoding :: (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding)) -> (Maybe Bool, Bool, Maybe Encoding)
        finalizeEncoding :: (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding))
-> (Maybe Bool, Bool, Maybe Encoding)
finalizeEncoding (Maybe Bool
need, Bool
got, Left Maybe Encoding
enc) = (Maybe Bool
need, Bool
got, Maybe Encoding
enc)
        finalizeEncoding (Maybe Bool
need, Bool
got, Right Maybe Encoding
enc) = (Maybe Bool
need, Bool
got, Maybe Encoding
enc)


-- | __HTML:__
--      @[prescan a byte stream to determine its encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@
--      step 2, case 3
-- 
-- Consume any HTML tag fitting the pattern @</?[A-Za-z].*>?@ up to (but not
-- including) the closing @>@.
-- 
-- Fails if no such tag is found, or, for compatibility, if the tag name is
-- exactly @meta@.
prescanTag :: Sniffer ()
prescanTag :: ParserT ByteString Maybe ()
prescanTag = do
    Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token (Word8 -> ParserT ByteString Maybe Word8)
-> Word8 -> ParserT ByteString Maybe Word8
forall a b. (a -> b) -> a -> b
$ Char -> Word8
toByte Char
'<'
    -- Need to check for this as 'prescanMeta' might fail
    ParserT ByteString Maybe () -> ParserT ByteString Maybe ()
forall (m :: * -> *) stream token out.
MonadParser m stream token =>
m out -> m ()
avoiding ParserT ByteString Maybe ()
metaName
    ParserT ByteString Maybe ByteString
metaWhitespace
    ParserT ByteString Maybe Word8
-> ParserT ByteString Maybe (Maybe Word8)
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
A.optional (ParserT ByteString Maybe Word8
 -> ParserT ByteString Maybe (Maybe Word8))
-> (Word8 -> ParserT ByteString Maybe Word8)
-> Word8
-> ParserT ByteString Maybe (Maybe Word8)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token (Word8 -> ParserT ByteString Maybe (Maybe Word8))
-> Word8 -> ParserT ByteString Maybe (Maybe Word8)
forall a b. (a -> b) -> a -> b
$ Char -> Word8
toByte Char
'/'
    ParserT ByteString Maybe Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next ParserT ByteString Maybe Word8
-> (Word8 -> ParserT ByteString Maybe Word8)
-> ParserT ByteString Maybe Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool) -> Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying ((Word8 -> [Word8] -> Bool) -> [Word8] -> Word8 -> Bool
forall a b c. (a -> b -> c) -> b -> a -> c
flip Word8 -> [Word8] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
elem ([Word8] -> Word8 -> Bool)
-> (String -> [Word8]) -> String -> Word8 -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Word8) -> String -> [Word8]
forall a b. (a -> b) -> [a] -> [b]
map Char -> Word8
toByte (String -> Word8 -> Bool) -> String -> Word8 -> Bool
forall a b. (a -> b) -> a -> b
$ [Char
'A'..Char
'Z'] String -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char
'a'..Char
'z'])
    ParserT ByteString Maybe Word8 -> ParserT ByteString Maybe [Word8]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.many (ParserT ByteString Maybe Word8
 -> ParserT ByteString Maybe [Word8])
-> ParserT ByteString Maybe Word8
-> ParserT ByteString Maybe [Word8]
forall a b. (a -> b) -> a -> b
$ ParserT ByteString Maybe Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next ParserT ByteString Maybe Word8
-> (Word8 -> ParserT ByteString Maybe Word8)
-> ParserT ByteString Maybe Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool) -> Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (Bool -> Bool
not (Bool -> Bool) -> (Word8 -> Bool) -> Word8 -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> [Word8] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` Char -> Word8
toByte Char
'>' Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: [Word8]
asciiWhitespaceB))
    ParserT ByteString Maybe (ByteString, ByteString)
-> ParserT ByteString Maybe [(ByteString, ByteString)]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.many ParserT ByteString Maybe (ByteString, ByteString)
prescanAttribute
    pure ()


-- | __HTML:__
--      @[get an attribute]
--      (https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing)@
-- 
-- Common a tag attribute, with or without a value, and return its name and
-- either a value or 'BS.empty'.
-- 
-- Fails if the stream begins with an ASCII-encoded @>@.  Intended for use in a
-- 'A.many' loop, where it may return extra @('BS.empty', 'BS.empty')@
-- "attributes"---do not rely on 'length' being accurate.
prescanAttribute :: Sniffer (BS.ByteString, BS.ByteString)
prescanAttribute :: ParserT ByteString Maybe (ByteString, ByteString)
prescanAttribute = ParserT ByteString Maybe ByteString
metaWhitespace ParserT ByteString Maybe ByteString
-> ParserT ByteString Maybe (ByteString, ByteString)
-> ParserT ByteString Maybe (ByteString, ByteString)
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> [ParserT ByteString Maybe (ByteString, ByteString)]
-> ParserT ByteString Maybe (ByteString, ByteString)
forall (m :: * -> *) a. Alternative m => [m a] -> m a
choice
    -- A null return is the safest way to loop here.
    [ Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token (Char -> Word8
toByte Char
'/') ParserT ByteString Maybe Word8
-> (ByteString, ByteString)
-> ParserT ByteString Maybe (ByteString, ByteString)
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> (ByteString
BS.empty, ByteString
BS.empty)
    , do
        Word8
nc <- ParserT ByteString Maybe Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next ParserT ByteString Maybe Word8
-> (Word8 -> ParserT ByteString Maybe Word8)
-> ParserT ByteString Maybe Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool) -> Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Char -> Word8
toByte Char
'>')
        [Word8]
ncs <- ParserT ByteString Maybe Word8 -> ParserT ByteString Maybe [Word8]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.many (ParserT ByteString Maybe Word8
 -> ParserT ByteString Maybe [Word8])
-> ParserT ByteString Maybe Word8
-> ParserT ByteString Maybe [Word8]
forall a b. (a -> b) -> a -> b
$ ParserT ByteString Maybe Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next ParserT ByteString Maybe Word8
-> (Word8 -> ParserT ByteString Maybe Word8)
-> ParserT ByteString Maybe Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool) -> Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying Word8 -> Bool
isNameChar
        ParserT ByteString Maybe ByteString
metaWhitespace
        ByteString
value <- [ParserT ByteString Maybe ByteString]
-> ParserT ByteString Maybe ByteString
forall (m :: * -> *) a. Alternative m => [m a] -> m a
choice
            [ Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token (Char -> Word8
toByte Char
'=') ParserT ByteString Maybe Word8
-> ParserT ByteString Maybe ByteString
-> ParserT ByteString Maybe ByteString
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> ParserT ByteString Maybe ByteString
metaWhitespace ParserT ByteString Maybe ByteString
-> ParserT ByteString Maybe ByteString
-> ParserT ByteString Maybe ByteString
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> [ParserT ByteString Maybe ByteString]
-> ParserT ByteString Maybe ByteString
forall (m :: * -> *) a. Alternative m => [m a] -> m a
choice
                [ ParserT ByteString Maybe ByteString
quotedValue
                , Word8 -> ParserT ByteString Maybe ByteString
unquotedValue (Word8 -> ParserT ByteString Maybe ByteString)
-> Word8 -> ParserT ByteString Maybe ByteString
forall a b. (a -> b) -> a -> b
$ Char -> Word8
toByte Char
'>'
                ]
            , ByteString -> ParserT ByteString Maybe ByteString
forall (f :: * -> *) a. Applicative f => a -> f a
pure ByteString
BS.empty
            ]
        return ((Word8 -> Word8) -> ByteString -> ByteString
BS.map Word8 -> Word8
toAsciiLowerB (Word8 -> ByteString -> ByteString
BS.cons Word8
nc (ByteString -> ByteString) -> ByteString -> ByteString
forall a b. (a -> b) -> a -> b
$ [Word8] -> ByteString
BS.pack [Word8]
ncs), (Word8 -> Word8) -> ByteString -> ByteString
BS.map Word8 -> Word8
toAsciiLowerB ByteString
value)
    ]
  where isNameChar :: Word8 -> Bool
isNameChar Word8
c
            | Word8 -> ByteString -> Bool
BS.elem Word8
c ByteString
"/=>" = Bool
False
            | Word8 -> [Word8] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
elem Word8
c [Word8]
asciiWhitespaceB = Bool
False
            | Bool
otherwise = Bool
True

-- | __HTML:__
-- 
--      * @[get an attribute]
--        (https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing)@
--        step 10, case 1
--      * @[algorithm for extracting a character encoding from a meta element]
--        (https://html.spec.whatwg.org/multipage/parsing.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element)@
--        step 6, case 1
-- 
-- Retrieve a string contained between matched quotation marks (@"@ or @'@),
-- dropping that punctuation.
-- 
-- Fails if the stream doesn't start with one of the ASCII-encoded quotation
-- marks, or if the same mark does /not/ appear again before the end of the
-- stream.
quotedValue :: Sniffer BS.ByteString
quotedValue :: ParserT ByteString Maybe ByteString
quotedValue = [Word8] -> ByteString
BS.pack ([Word8] -> ByteString)
-> ParserT ByteString Maybe [Word8]
-> ParserT ByteString Maybe ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [ParserT ByteString Maybe [Word8]]
-> ParserT ByteString Maybe [Word8]
forall (m :: * -> *) a. Alternative m => [m a] -> m a
choice
    [ Word8 -> ParserT ByteString Maybe [Word8]
forall (f :: * -> *) stream b.
(MonadParser f stream b, Eq b) =>
b -> f [b]
parseValue (Word8 -> ParserT ByteString Maybe [Word8])
-> Word8 -> ParserT ByteString Maybe [Word8]
forall a b. (a -> b) -> a -> b
$ Char -> Word8
toByte Char
'"'
    , Word8 -> ParserT ByteString Maybe [Word8]
forall (f :: * -> *) stream b.
(MonadParser f stream b, Eq b) =>
b -> f [b]
parseValue (Word8 -> ParserT ByteString Maybe [Word8])
-> Word8 -> ParserT ByteString Maybe [Word8]
forall a b. (a -> b) -> a -> b
$ Char -> Word8
toByte Char
'\''
    ]
  where parseValue :: b -> f [b]
parseValue b
quote = b -> f b
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token b
quote f b -> f [b] -> f [b]
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> f b -> f [b]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.many (f b
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next f b -> (b -> f b) -> f b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (b -> Bool) -> b -> f b
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (b -> b -> Bool
forall a. Eq a => a -> a -> Bool
/= b
quote)) f [b] -> f b -> f [b]
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* b -> f b
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token b
quote

-- | __HTML:__
-- 
--      * @[get an attribute]
--        (https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing)@
--        step 10, cases 3 and 4; step 11
--      * @[algorithm for extracting a character encoding from a meta element]
--        (https://html.spec.whatwg.org/multipage/parsing.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element)@
--        step 6, case 3 (via the added generality)
-- 
-- Retrieve a string written as a single, unquoted and unbroken sequence (e.g.
-- @utf8@ or @iso-ir-109@ but /not/ @Windows 1252@ or @"SHIFT-JIS"@).  The
-- additional parameter is treated as a breakpoint as well:
-- 
-- >>> parseMaybe (unquotedValue 0x3B) <* takeRest) "content;x"
-- "content"
-- 
-- Only fails if the stream begins with an ASCII-encoded @"@ or @'@, but may
-- return 'BS.empty'.
unquotedValue :: W.Word8 -> Sniffer BS.ByteString
unquotedValue :: Word8 -> ParserT ByteString Maybe ByteString
unquotedValue Word8
terminal = do
    Word8
c <- ParserT ByteString Maybe Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next ParserT ByteString Maybe Word8
-> (Word8 -> ParserT ByteString Maybe Word8)
-> ParserT ByteString Maybe Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool) -> Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (Bool -> Bool
not (Bool -> Bool) -> (Word8 -> Bool) -> Word8 -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> [Word8] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` Word8
terminal Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: ByteString -> [Word8]
BS.unpack ByteString
"\"'" [Word8] -> [Word8] -> [Word8]
forall a. [a] -> [a] -> [a]
++ [Word8]
asciiWhitespaceB))
    [Word8]
cs <- ParserT ByteString Maybe Word8 -> ParserT ByteString Maybe [Word8]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
A.many (ParserT ByteString Maybe Word8
 -> ParserT ByteString Maybe [Word8])
-> ParserT ByteString Maybe Word8
-> ParserT ByteString Maybe [Word8]
forall a b. (a -> b) -> a -> b
$ ParserT ByteString Maybe Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next ParserT ByteString Maybe Word8
-> (Word8 -> ParserT ByteString Maybe Word8)
-> ParserT ByteString Maybe Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool) -> Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (Bool -> Bool
not (Bool -> Bool) -> (Word8 -> Bool) -> Word8 -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> [Word8] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` Word8
terminal Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: [Word8]
asciiWhitespaceB))
    ParserT ByteString Maybe ByteString
metaWhitespace
    ByteString -> ParserT ByteString Maybe ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return (ByteString -> ParserT ByteString Maybe ByteString)
-> ([Word8] -> ByteString)
-> [Word8]
-> ParserT ByteString Maybe ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Word8] -> ByteString
BS.pack ([Word8] -> ParserT ByteString Maybe ByteString)
-> [Word8] -> ParserT ByteString Maybe ByteString
forall a b. (a -> b) -> a -> b
$ Word8
c Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: [Word8]
cs


-- | The encoding scheme currently in use by the parser, along with how likely
-- that scheme actually represents the binary stream.
decoderConfidence :: DecoderState -> Confidence
-- Very simple indirection to prevent this being used as a record setter.
decoderConfidence :: DecoderState -> Confidence
decoderConfidence = DecoderState -> Confidence
decoderConfidence_
{-# INLINE decoderConfidence #-}


-- | __HTML:__
--      @[algorithm for extracting a character encoding from a meta element]
--      (https://html.spec.whatwg.org/multipage/parsing.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element)@
-- 
-- Find the first occurrence of an ASCII-encoded string @charset@ in the
-- stream, and try to parse its attribute-style value into an 'Encoding'.
-- 
-- Returns 'Nothing' if the stream does not contain @charset@ followed by @=@,
-- or if the value can not be successfully parsed as an encoding label.
extractEncoding :: BS.ByteString -> Maybe Encoding
extractEncoding :: ByteString -> Maybe Encoding
extractEncoding = ((Encoding, ByteString) -> Encoding)
-> Maybe (Encoding, ByteString) -> Maybe Encoding
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Encoding, ByteString) -> Encoding
forall a b. (a, b) -> a
fst (Maybe (Encoding, ByteString) -> Maybe Encoding)
-> (ByteString -> Maybe (Encoding, ByteString))
-> ByteString
-> Maybe Encoding
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ParserT ByteString Maybe Encoding
-> ByteString -> Maybe (Encoding, ByteString)
forall stream out.
Parser stream out -> stream -> Maybe (out, stream)
runParser ParserT ByteString Maybe Encoding
extractEncoding'

-- | __HTML:__
--      @[algorithm for extracting a character encoding from a meta element]
--      (https://html.spec.whatwg.org/multipage/parsing.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element)@
-- 
-- Find the first occurrence of an ASCII-encoded string @charset@ in the
-- stream, and try to parse its attribute-style value into an 'Encoding'.
-- 
-- Fails if the stream does not contain @charset@ followed by @=@, or if the
-- value can not be successfully parsed as an encoding label.
extractEncoding' :: Sniffer Encoding
extractEncoding' :: ParserT ByteString Maybe Encoding
extractEncoding' = do
    ParserT ByteString Maybe ByteString
-> ParserT ByteString Maybe ByteString
forall (parser :: * -> *) stream token out.
MonadParser parser stream token =>
parser out -> parser out
findNext (ParserT ByteString Maybe ByteString
 -> ParserT ByteString Maybe ByteString)
-> ParserT ByteString Maybe ByteString
-> ParserT ByteString Maybe ByteString
forall a b. (a -> b) -> a -> b
$ Word -> ParserT ByteString Maybe ByteString
forall (m :: * -> *) stream token.
MonadParser m stream token =>
Word -> m stream
nextChunk Word
7 ParserT ByteString Maybe ByteString
-> (ByteString -> ParserT ByteString Maybe ByteString)
-> ParserT ByteString Maybe ByteString
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (ByteString -> Bool)
-> ByteString -> ParserT ByteString Maybe ByteString
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
(==) ByteString
"charset" (ByteString -> Bool)
-> (ByteString -> ByteString) -> ByteString -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> Word8) -> ByteString -> ByteString
BS.map Word8 -> Word8
toAsciiLowerB)
    ParserT ByteString Maybe ByteString
metaWhitespace
    [ParserT ByteString Maybe Encoding]
-> ParserT ByteString Maybe Encoding
forall (m :: * -> *) a. Alternative m => [m a] -> m a
choice
        [ do
            Word8 -> ParserT ByteString Maybe Word8
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq token) =>
token -> trans token
token (Word8 -> ParserT ByteString Maybe Word8)
-> Word8 -> ParserT ByteString Maybe Word8
forall a b. (a -> b) -> a -> b
$ Char -> Word8
toByte Char
'='
            ParserT ByteString Maybe ByteString
metaWhitespace
            ByteString
value <- [ParserT ByteString Maybe ByteString]
-> ParserT ByteString Maybe ByteString
forall (m :: * -> *) a. Alternative m => [m a] -> m a
choice
                [ ParserT ByteString Maybe ByteString
quotedValue
                , Word8 -> ParserT ByteString Maybe ByteString
unquotedValue (Word8 -> ParserT ByteString Maybe ByteString)
-> Word8 -> ParserT ByteString Maybe ByteString
forall a b. (a -> b) -> a -> b
$ Char -> Word8
toByte Char
';'
                ]
            ParserT ByteString Maybe Encoding
-> (Encoding -> ParserT ByteString Maybe Encoding)
-> Maybe Encoding
-> ParserT ByteString Maybe Encoding
forall b a. b -> (a -> b) -> Maybe a -> b
maybe ParserT ByteString Maybe Encoding
forall (f :: * -> *) a. Alternative f => f a
A.empty Encoding -> ParserT ByteString Maybe Encoding
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe Encoding -> ParserT ByteString Maybe Encoding)
-> (Text -> Maybe Encoding)
-> Text
-> ParserT ByteString Maybe Encoding
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Maybe Encoding
lookupEncoding (Text -> ParserT ByteString Maybe Encoding)
-> Text -> ParserT ByteString Maybe Encoding
forall a b. (a -> b) -> a -> b
$ ByteString -> Text
T.decodeLatin1 ByteString
value
        , ParserT ByteString Maybe Encoding
extractEncoding'
        ]