{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RecordWildCards #-}
module Duckling.Types.Document
( Document
, fromText
, (!)
, length
, byteStringFromPos
, isAdjacent
, isRangeValid
) where
import Data.Array.Unboxed (UArray)
import Data.ByteString (ByteString)
import Data.List (scanl', foldl', foldr)
import Data.String
import Data.Text (Text)
import Prelude hiding (length)
import qualified Data.Array.Unboxed as Array
import qualified Data.ByteString as BS
import qualified Data.Char as Char
import qualified Data.Text.Unsafe as UText
import qualified Data.Text.Encoding as Text
import qualified Data.Text as Text
import qualified Data.Text.Internal.Unsafe.Char as UText
data Document = Document
{ Document -> Text
rawInput :: !Text
, Document -> ByteString
utf8Encoded :: ByteString
, Document -> UArray Int Char
indexable :: UArray Int Char
, Document -> UArray Int Int
firstNonAdjacent :: UArray Int Int
, Document -> UArray Int Int
tDropToBSDrop :: UArray Int Int
, Document -> UArray Int Int
bsDropToTDrop :: UArray Int Int
, Document -> UArray Int Int
tDropToUtf16Drop :: UArray Int Int
} deriving (Int -> Document -> ShowS
[Document] -> ShowS
Document -> String
(Int -> Document -> ShowS)
-> (Document -> String) -> ([Document] -> ShowS) -> Show Document
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Document] -> ShowS
$cshowList :: [Document] -> ShowS
show :: Document -> String
$cshow :: Document -> String
showsPrec :: Int -> Document -> ShowS
$cshowsPrec :: Int -> Document -> ShowS
Show)
instance IsString Document where
fromString :: String -> Document
fromString = Text -> Document
fromText (Text -> Document) -> (String -> Text) -> String -> Document
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Text
forall a. IsString a => String -> a
fromString
fromText :: Text -> Document
fromText :: Text -> Document
fromText Text
rawInput = Document :: Text
-> ByteString
-> UArray Int Char
-> UArray Int Int
-> UArray Int Int
-> UArray Int Int
-> UArray Int Int
-> Document
Document{ByteString
Text
UArray Int Char
UArray Int Int
bsDropToTDrop :: UArray Int Int
tDropToUtf16Drop :: UArray Int Int
tDropToBSDrop :: UArray Int Int
firstNonAdjacent :: UArray Int Int
indexable :: UArray Int Char
utf8Encoded :: ByteString
rawInput :: Text
tDropToUtf16Drop :: UArray Int Int
bsDropToTDrop :: UArray Int Int
tDropToBSDrop :: UArray Int Int
firstNonAdjacent :: UArray Int Int
indexable :: UArray Int Char
utf8Encoded :: ByteString
rawInput :: Text
..}
where
utf8Encoded :: ByteString
utf8Encoded = Text -> ByteString
Text.encodeUtf8 Text
rawInput
rawInputLength :: Int
rawInputLength = Text -> Int
Text.length Text
rawInput
unpacked :: String
unpacked = Text -> String
Text.unpack Text
rawInput
indexable :: UArray Int Char
indexable = (Int, Int) -> String -> UArray Int Char
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [e] -> a i e
Array.listArray (Int
0, Int
rawInputLength Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) String
unpacked
firstNonAdjacent :: UArray Int Int
firstNonAdjacent = (Int, Int) -> [Int] -> UArray Int Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [e] -> a i e
Array.listArray (Int
0, Int
rawInputLength Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) ([Int] -> UArray Int Int) -> [Int] -> UArray Int Int
forall a b. (a -> b) -> a -> b
$ (Int, [Int]) -> [Int]
forall a b. (a, b) -> b
snd ((Int, [Int]) -> [Int]) -> (Int, [Int]) -> [Int]
forall a b. (a -> b) -> a -> b
$
((Int, Char) -> (Int, [Int]) -> (Int, [Int]))
-> (Int, [Int]) -> [(Int, Char)] -> (Int, [Int])
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (Int, Char) -> (Int, [Int]) -> (Int, [Int])
forall a. (a, Char) -> (a, [a]) -> (a, [a])
gen (Int
rawInputLength, []) ([(Int, Char)] -> (Int, [Int])) -> [(Int, Char)] -> (Int, [Int])
forall a b. (a -> b) -> a -> b
$ [Int] -> String -> [(Int, Char)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Int
0..] String
unpacked
gen :: (a, Char) -> (a, [a]) -> (a, [a])
gen (a
ix, Char
elem) (a
best, ![a]
acc)
| Char -> Bool
isAdjacentSeparator Char
elem = (a
best, a
besta -> [a] -> [a]
forall a. a -> [a] -> [a]
:[a]
acc)
| Bool
otherwise = (a
ix, a
ixa -> [a] -> [a]
forall a. a -> [a] -> [a]
:[a]
acc)
tDropToBSDropList :: [Int]
tDropToBSDropList = (Int -> Char -> Int) -> Int -> String -> [Int]
forall b a. (b -> a -> b) -> b -> [a] -> [b]
scanl' (\Int
acc Char
a -> Int
acc Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
forall p. Num p => Char -> p
utf8CharWidth Char
a) Int
0 String
unpacked
tDropToBSDrop :: UArray Int Int
tDropToBSDrop = (Int, Int) -> [Int] -> UArray Int Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [e] -> a i e
Array.listArray (Int
0, Int
rawInputLength) [Int]
tDropToBSDropList
tDropToUtf16Drop :: UArray Int Int
tDropToUtf16Drop = (Int, Int) -> [Int] -> UArray Int Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [e] -> a i e
Array.listArray (Int
0, Int
rawInputLength) ([Int] -> UArray Int Int) -> [Int] -> UArray Int Int
forall a b. (a -> b) -> a -> b
$
(Int -> Char -> Int) -> Int -> String -> [Int]
forall b a. (b -> a -> b) -> b -> [a] -> [b]
scanl' (\Int
acc Char
a -> Int
acc Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
forall p. Num p => Char -> p
utf16CharWidth Char
a) Int
0 String
unpacked
bsDropToTDrop :: UArray Int Int
bsDropToTDrop = (Int, Int) -> [Int] -> UArray Int Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [e] -> a i e
Array.listArray (Int
0, ByteString -> Int
BS.length ByteString
utf8Encoded) ([Int] -> UArray Int Int) -> [Int] -> UArray Int Int
forall a b. (a -> b) -> a -> b
$
[Int] -> [Int]
forall a. [a] -> [a]
reverse ([Int] -> [Int]) -> [Int] -> [Int]
forall a b. (a -> b) -> a -> b
$ (Int, [Int]) -> [Int]
forall a b. (a, b) -> b
snd ((Int, [Int]) -> [Int]) -> (Int, [Int]) -> [Int]
forall a b. (a -> b) -> a -> b
$ ((Int, [Int]) -> (Int, Int) -> (Int, [Int]))
-> (Int, [Int]) -> [(Int, Int)] -> (Int, [Int])
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (Int, [Int]) -> (Int, Int) -> (Int, [Int])
forall a. (Int, [a]) -> (a, Int) -> (Int, [a])
fun (-Int
1, []) ([(Int, Int)] -> (Int, [Int])) -> [(Int, Int)] -> (Int, [Int])
forall a b. (a -> b) -> a -> b
$ [Int] -> [Int] -> [(Int, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Int
0..] [Int]
tDropToBSDropList
fun :: (Int, [a]) -> (a, Int) -> (Int, [a])
fun (Int
lastPos, ![a]
acc) (a
ix, Int
elem) = (Int
elem, Int -> a -> [a]
forall a. Int -> a -> [a]
replicate (Int
elem Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
lastPos) a
ix [a] -> [a] -> [a]
forall a. [a] -> [a] -> [a]
++ [a]
acc)
utf8CharWidth :: Char -> p
utf8CharWidth Char
c
| Int
w Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x7F = p
1
| Int
w Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x7FF = p
2
| Int
w Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0xFFFF = p
3
| Bool
otherwise = p
4
where
w :: Int
w = Char -> Int
UText.ord Char
c
utf16CharWidth :: Char -> p
utf16CharWidth Char
c
| Int
w Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0x10000 = p
1
| Bool
otherwise = p
2
where
w :: Int
w = Char -> Int
UText.ord Char
c
isRangeValid :: Document -> Int -> Int -> Bool
isRangeValid :: Document -> Int -> Int -> Bool
isRangeValid Document
doc Int
start Int
end =
(Int
start Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 Bool -> Bool -> Bool
||
Char -> Char -> Bool
isDifferent (Document
doc Document -> Int -> Char
! (Int
start Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)) (Document
doc Document -> Int -> Char
! Int
start)) Bool -> Bool -> Bool
&&
(Int
end Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Document -> Int
length Document
doc Bool -> Bool -> Bool
||
Char -> Char -> Bool
isDifferent (Document
doc Document -> Int -> Char
! (Int
end Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)) (Document
doc Document -> Int -> Char
! Int
end))
where
charClass :: Char -> Char
charClass :: Char -> Char
charClass Char
c
| Char -> Bool
Char.isLower Char
c Bool -> Bool -> Bool
|| Char -> Bool
Char.isUpper Char
c = Char
'c'
| Char -> Bool
Char.isDigit Char
c = Char
'd'
| Bool
otherwise = Char
c
isDifferent :: Char -> Char -> Bool
isDifferent :: Char -> Char -> Bool
isDifferent Char
a Char
b = Char -> Char
charClass Char
a Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char -> Char
charClass Char
b
isAdjacent :: Document -> Int -> Int -> Bool
isAdjacent :: Document -> Int -> Int -> Bool
isAdjacent Document{ByteString
Text
UArray Int Char
UArray Int Int
tDropToUtf16Drop :: UArray Int Int
bsDropToTDrop :: UArray Int Int
tDropToBSDrop :: UArray Int Int
firstNonAdjacent :: UArray Int Int
indexable :: UArray Int Char
utf8Encoded :: ByteString
rawInput :: Text
tDropToUtf16Drop :: Document -> UArray Int Int
bsDropToTDrop :: Document -> UArray Int Int
tDropToBSDrop :: Document -> UArray Int Int
firstNonAdjacent :: Document -> UArray Int Int
indexable :: Document -> UArray Int Char
utf8Encoded :: Document -> ByteString
rawInput :: Document -> Text
..} Int
a Int
b =
Int
b Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
a Bool -> Bool -> Bool
&& (UArray Int Int
firstNonAdjacent UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
a Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
b)
isAdjacentSeparator :: Char -> Bool
isAdjacentSeparator :: Char -> Bool
isAdjacentSeparator Char
c = Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
elem Char
c [Char
' ', Char
'\t']
(!) :: Document -> Int -> Char
(!) Document { indexable :: Document -> UArray Int Char
indexable = UArray Int Char
s } Int
ix = UArray Int Char
s UArray Int Char -> Int -> Char
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
ix
length :: Document -> Int
length :: Document -> Int
length Document { indexable :: Document -> UArray Int Char
indexable = UArray Int Char
s } = (Int, Int) -> Int
forall a. Ix a => (a, a) -> Int
Array.rangeSize ((Int, Int) -> Int) -> (Int, Int) -> Int
forall a b. (a -> b) -> a -> b
$ UArray Int Char -> (Int, Int)
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> (i, i)
Array.bounds UArray Int Char
s
{-# INLINE byteStringFromPos #-}
byteStringFromPos
:: Document
-> Int
-> ( ByteString
, (Int, Int) -> Text
, Int -> Int -> (Int, Int)
)
byteStringFromPos :: Document
-> Int
-> (ByteString, (Int, Int) -> Text, Int -> Int -> (Int, Int))
byteStringFromPos
Document { rawInput :: Document -> Text
rawInput = Text
rawInput
, utf8Encoded :: Document -> ByteString
utf8Encoded = ByteString
utf8Encoded
, tDropToBSDrop :: Document -> UArray Int Int
tDropToBSDrop = UArray Int Int
tDropToBSDrop
, bsDropToTDrop :: Document -> UArray Int Int
bsDropToTDrop = UArray Int Int
bsDropToTDrop
, tDropToUtf16Drop :: Document -> UArray Int Int
tDropToUtf16Drop = UArray Int Int
tDropToUtf16Drop
}
Int
position = (ByteString
substring, (Int, Int) -> Text
rangeToText, Int -> Int -> (Int, Int)
translateRange)
where
utf8Position :: Int
utf8Position = UArray Int Int
tDropToBSDrop UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
position
substring :: ByteString
substring :: ByteString
substring = Int -> ByteString -> ByteString
BS.drop Int
utf8Position ByteString
utf8Encoded
rangeToText :: (Int, Int) -> Text
rangeToText :: (Int, Int) -> Text
rangeToText (-1, Int
_) = Text
""
rangeToText (Int, Int)
r = Int -> Text -> Text
UText.takeWord16 (Int
end16Pos Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
start16Pos) (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$
Int -> Text -> Text
UText.dropWord16 Int
start16Pos Text
rawInput
where
start16Pos :: Int
start16Pos = UArray Int Int
tDropToUtf16Drop UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
startPos
end16Pos :: Int
end16Pos = UArray Int Int
tDropToUtf16Drop UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
endPos
(Int
startPos, Int
endPos) = (Int -> Int -> (Int, Int)) -> (Int, Int) -> (Int, Int)
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry Int -> Int -> (Int, Int)
translateRange (Int, Int)
r
translateRange :: Int -> Int -> (Int, Int)
translateRange :: Int -> Int -> (Int, Int)
translateRange !Int
bsStart !Int
bsLen = Int
startPos Int -> (Int, Int) -> (Int, Int)
`seq` Int
endPos Int -> (Int, Int) -> (Int, Int)
`seq` (Int, Int)
res
where
res :: (Int, Int)
res = (Int
startPos, Int
endPos)
realBsStart :: Int
realBsStart = Int
utf8Position Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
bsStart
realBsEnd :: Int
realBsEnd = Int
realBsStart Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
bsLen
startPos :: Int
startPos = UArray Int Int
bsDropToTDrop UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
realBsStart
endPos :: Int
endPos = UArray Int Int
bsDropToTDrop UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
realBsEnd