{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RecordWildCards #-}
module Duckling.Types.Document
( Document
, fromText
, (!)
, length
, byteStringFromPos
, isAdjacent
, isRangeValid
) where
import Data.Array.Unboxed (UArray)
import Data.ByteString (ByteString)
import Data.List (scanl', foldl', foldr)
import Data.String
import Data.Text (Text)
import Prelude hiding (length)
import qualified Data.Array.Unboxed as Array
import qualified Data.ByteString as BS
import qualified Data.Char as Char
import qualified Data.Text.Unsafe as UText
import qualified Data.Text.Encoding as Text
import qualified Data.Text as Text
import qualified Data.Text.Internal.Unsafe.Char as UText
data Document = Document
{ rawInput :: !Text
, utf8Encoded :: ByteString
, indexable :: UArray Int Char
, firstNonAdjacent :: UArray Int Int
, tDropToBSDrop :: UArray Int Int
, bsDropToTDrop :: UArray Int Int
, tDropToUtf16Drop :: UArray Int Int
} deriving (Show)
instance IsString Document where
fromString = fromText . fromString
fromText :: Text -> Document
fromText rawInput = Document{..}
where
utf8Encoded = Text.encodeUtf8 rawInput
rawInputLength = Text.length rawInput
unpacked = Text.unpack rawInput
indexable = Array.listArray (0, rawInputLength - 1) unpacked
firstNonAdjacent = Array.listArray (0, rawInputLength - 1) $ snd $
foldr gen (rawInputLength, []) $ zip [0..] unpacked
gen (ix, elem) (best, !acc)
| isAdjacentSeparator elem = (best, best:acc)
| otherwise = (ix, ix:acc)
tDropToBSDropList = scanl' (\acc a -> acc + utf8CharWidth a) 0 unpacked
tDropToBSDrop = Array.listArray (0, rawInputLength) tDropToBSDropList
tDropToUtf16Drop = Array.listArray (0, rawInputLength) $
scanl' (\acc a -> acc + utf16CharWidth a) 0 unpacked
bsDropToTDrop = Array.listArray (0, BS.length utf8Encoded) $
reverse $ snd $ foldl' fun (-1, []) $ zip [0..] tDropToBSDropList
fun (lastPos, !acc) (ix, elem) = (elem, replicate (elem - lastPos) ix ++ acc)
utf8CharWidth c
| w <= 0x7F = 1
| w <= 0x7FF = 2
| w <= 0xFFFF = 3
| otherwise = 4
where
w = UText.ord c
utf16CharWidth c
| w < 0x10000 = 1
| otherwise = 2
where
w = UText.ord c
isRangeValid :: Document -> Int -> Int -> Bool
isRangeValid doc start end =
(start == 0 ||
isDifferent (doc ! (start - 1)) (doc ! start)) &&
(end == length doc ||
isDifferent (doc ! (end - 1)) (doc ! end))
where
charClass :: Char -> Char
charClass c
| Char.isLower c || Char.isUpper c = 'c'
| Char.isDigit c = 'd'
| otherwise = c
isDifferent :: Char -> Char -> Bool
isDifferent a b = charClass a /= charClass b
isAdjacent :: Document -> Int -> Int -> Bool
isAdjacent Document{..} a b =
b >= a && (firstNonAdjacent Array.! a >= b)
isAdjacentSeparator :: Char -> Bool
isAdjacentSeparator c = elem c [' ', '\t', '-']
(!) :: Document -> Int -> Char
(!) Document { indexable = s } ix = s Array.! ix
length :: Document -> Int
length Document { indexable = s } = Array.rangeSize $ Array.bounds s
{-# INLINE byteStringFromPos #-}
byteStringFromPos
:: Document
-> Int
-> ( ByteString
, (Int, Int) -> Text
, Int -> Int -> (Int, Int)
)
byteStringFromPos
Document { rawInput = rawInput
, utf8Encoded = utf8Encoded
, tDropToBSDrop = tDropToBSDrop
, bsDropToTDrop = bsDropToTDrop
, tDropToUtf16Drop = tDropToUtf16Drop
}
position = (substring, rangeToText, translateRange)
where
utf8Position = tDropToBSDrop Array.! position
substring :: ByteString
substring = BS.drop utf8Position utf8Encoded
rangeToText :: (Int, Int) -> Text
rangeToText (-1, _) = ""
rangeToText r = UText.takeWord16 (end16Pos - start16Pos) $
UText.dropWord16 start16Pos rawInput
where
start16Pos = tDropToUtf16Drop Array.! startPos
end16Pos = tDropToUtf16Drop Array.! endPos
(startPos, endPos) = uncurry translateRange r
translateRange :: Int -> Int -> (Int, Int)
translateRange !bsStart !bsLen = startPos `seq` endPos `seq` res
where
res = (startPos, endPos)
realBsStart = utf8Position + bsStart
realBsEnd = realBsStart + bsLen
startPos = bsDropToTDrop Array.! realBsStart
endPos = bsDropToTDrop Array.! realBsEnd