-- Copyright (c) 2016-present, Facebook, Inc.
-- All rights reserved.
--
-- This source code is licensed under the BSD-style license found in the
-- LICENSE file in the root directory of this source tree.

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RecordWildCards #-}

module Duckling.Types.Document
  ( Document -- abstract
  , fromText
  , (!)
  , length
  , byteStringFromPos
  , isAdjacent
  , isRangeValid
  ) where

import Data.Array.Unboxed (UArray)
import Data.ByteString (ByteString)
import Data.List (scanl', foldl', foldr)
import Data.String
import Data.Text (Text)
import Prelude hiding (length)
import qualified Data.Array.Unboxed as Array
import qualified Data.ByteString as BS
import qualified Data.Char as Char
import qualified Data.Text.Unsafe as UText
import qualified Data.Text.Encoding as Text
import qualified Data.Text as Text
import qualified Data.Text.Internal.Unsafe.Char as UText


data Document = Document
  { Document -> Text
rawInput :: !Text
  , Document -> ByteString
utf8Encoded :: ByteString
  , Document -> UArray Int Char
indexable :: UArray Int Char -- for O(1) indexing pos -> Char
  , Document -> UArray Int Int
firstNonAdjacent :: UArray Int Int
    -- for a given index 'i' it keeps a first index 'j' greater or equal 'i'
    -- such that isAdjacentSeparator (indexable ! j) == False
    -- eg. " a document " :: Document
    --     firstNonAdjacent = [1,1,3,3,4,5,6,7,8,9,10,12]
    -- Note that in this case 12 is the length of the vector, hence not a
    -- valid index inside the array, this is intentional.
  , Document -> UArray Int Int
tDropToBSDrop :: UArray Int Int
    -- how many bytes to BS.drop from a utf8 encoded ByteString to
    -- reach the same position as Text.drop would
  , Document -> UArray Int Int
bsDropToTDrop :: UArray Int Int
    -- the inverse of tDropToBSDrop, rounds down for bytes that are
    -- not on character boundary
    -- for "żółty" :: Document
    --   tDropToBSDrop = [0,2,4,6,7,8]
    --   bsDropToTDrop = [0,1,1,2,2,3,3,4,5]
    --   tDropToUtf16Drop = [0,1,2,3,4,5]
  , Document -> UArray Int Int
tDropToUtf16Drop :: UArray Int Int
    -- translate Text.drop to Data.Text.Unsafe.dropWord16
  } deriving (Int -> Document -> ShowS
[Document] -> ShowS
Document -> String
(Int -> Document -> ShowS)
-> (Document -> String) -> ([Document] -> ShowS) -> Show Document
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Document] -> ShowS
$cshowList :: [Document] -> ShowS
show :: Document -> String
$cshow :: Document -> String
showsPrec :: Int -> Document -> ShowS
$cshowsPrec :: Int -> Document -> ShowS
Show)

{-
Note [Regular expressions and Text]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Text is UTF-16 encoded internally and PCRE operates on UTF-8 encoded
ByteStrings. Because we do a lot of regexp matching on the same Text,
it pays off to cache UTF-8 the encoded ByteString. That's the utf8Ecoded
field in Document.

Moreover we do regexp matching with capture, where the captured groups
are returned as ByteString and we want them as Text. But all of the
captured groups are just a substrings of the original Text.
Fortunately PCRE has an API that returns a MatchArray - a structure with
just the ByteString indices and ByteString lengths of the matched fragments.
If we play with indices right we can translate them to offsets into the
original Text, share the underlying Text buffer and avoid all of
UTF-8 and UTF-16 encoding and new ByteString and Text allocation.
-}

instance IsString Document where
  fromString :: String -> Document
fromString = Text -> Document
fromText (Text -> Document) -> (String -> Text) -> String -> Document
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Text
forall a. IsString a => String -> a
fromString

fromText :: Text -> Document
fromText :: Text -> Document
fromText Text
rawInput = Document :: Text
-> ByteString
-> UArray Int Char
-> UArray Int Int
-> UArray Int Int
-> UArray Int Int
-> UArray Int Int
-> Document
Document{ByteString
Text
UArray Int Char
UArray Int Int
bsDropToTDrop :: UArray Int Int
tDropToUtf16Drop :: UArray Int Int
tDropToBSDrop :: UArray Int Int
firstNonAdjacent :: UArray Int Int
indexable :: UArray Int Char
utf8Encoded :: ByteString
rawInput :: Text
tDropToUtf16Drop :: UArray Int Int
bsDropToTDrop :: UArray Int Int
tDropToBSDrop :: UArray Int Int
firstNonAdjacent :: UArray Int Int
indexable :: UArray Int Char
utf8Encoded :: ByteString
rawInput :: Text
..}
  where
  utf8Encoded :: ByteString
utf8Encoded = Text -> ByteString
Text.encodeUtf8 Text
rawInput
  rawInputLength :: Int
rawInputLength = Text -> Int
Text.length Text
rawInput
  unpacked :: String
unpacked = Text -> String
Text.unpack Text
rawInput
  indexable :: UArray Int Char
indexable = (Int, Int) -> String -> UArray Int Char
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [e] -> a i e
Array.listArray (Int
0, Int
rawInputLength Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) String
unpacked
  firstNonAdjacent :: UArray Int Int
firstNonAdjacent = (Int, Int) -> [Int] -> UArray Int Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [e] -> a i e
Array.listArray (Int
0, Int
rawInputLength Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) ([Int] -> UArray Int Int) -> [Int] -> UArray Int Int
forall a b. (a -> b) -> a -> b
$ (Int, [Int]) -> [Int]
forall a b. (a, b) -> b
snd ((Int, [Int]) -> [Int]) -> (Int, [Int]) -> [Int]
forall a b. (a -> b) -> a -> b
$
   ((Int, Char) -> (Int, [Int]) -> (Int, [Int]))
-> (Int, [Int]) -> [(Int, Char)] -> (Int, [Int])
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (Int, Char) -> (Int, [Int]) -> (Int, [Int])
forall a. (a, Char) -> (a, [a]) -> (a, [a])
gen (Int
rawInputLength, []) ([(Int, Char)] -> (Int, [Int])) -> [(Int, Char)] -> (Int, [Int])
forall a b. (a -> b) -> a -> b
$ [Int] -> String -> [(Int, Char)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Int
0..] String
unpacked
  -- go from the end keeping track of the first nonAdjacent (best)
  gen :: (a, Char) -> (a, [a]) -> (a, [a])
gen (a
ix, Char
elem) (a
best, ![a]
acc)
    | Char -> Bool
isAdjacentSeparator Char
elem = (a
best, a
besta -> [a] -> [a]
forall a. a -> [a] -> [a]
:[a]
acc)
    | Bool
otherwise = (a
ix, a
ixa -> [a] -> [a]
forall a. a -> [a] -> [a]
:[a]
acc)
  tDropToBSDropList :: [Int]
tDropToBSDropList = (Int -> Char -> Int) -> Int -> String -> [Int]
forall b a. (b -> a -> b) -> b -> [a] -> [b]
scanl' (\Int
acc Char
a -> Int
acc Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
forall p. Num p => Char -> p
utf8CharWidth Char
a) Int
0 String
unpacked
  tDropToBSDrop :: UArray Int Int
tDropToBSDrop = (Int, Int) -> [Int] -> UArray Int Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [e] -> a i e
Array.listArray (Int
0, Int
rawInputLength) [Int]
tDropToBSDropList
  tDropToUtf16Drop :: UArray Int Int
tDropToUtf16Drop = (Int, Int) -> [Int] -> UArray Int Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [e] -> a i e
Array.listArray (Int
0, Int
rawInputLength) ([Int] -> UArray Int Int) -> [Int] -> UArray Int Int
forall a b. (a -> b) -> a -> b
$
    (Int -> Char -> Int) -> Int -> String -> [Int]
forall b a. (b -> a -> b) -> b -> [a] -> [b]
scanl' (\Int
acc Char
a -> Int
acc Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
forall p. Num p => Char -> p
utf16CharWidth Char
a) Int
0 String
unpacked
  bsDropToTDrop :: UArray Int Int
bsDropToTDrop = (Int, Int) -> [Int] -> UArray Int Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [e] -> a i e
Array.listArray (Int
0, ByteString -> Int
BS.length ByteString
utf8Encoded) ([Int] -> UArray Int Int) -> [Int] -> UArray Int Int
forall a b. (a -> b) -> a -> b
$
    [Int] -> [Int]
forall a. [a] -> [a]
reverse ([Int] -> [Int]) -> [Int] -> [Int]
forall a b. (a -> b) -> a -> b
$ (Int, [Int]) -> [Int]
forall a b. (a, b) -> b
snd ((Int, [Int]) -> [Int]) -> (Int, [Int]) -> [Int]
forall a b. (a -> b) -> a -> b
$ ((Int, [Int]) -> (Int, Int) -> (Int, [Int]))
-> (Int, [Int]) -> [(Int, Int)] -> (Int, [Int])
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (Int, [Int]) -> (Int, Int) -> (Int, [Int])
forall a. (Int, [a]) -> (a, Int) -> (Int, [a])
fun (-Int
1, []) ([(Int, Int)] -> (Int, [Int])) -> [(Int, Int)] -> (Int, [Int])
forall a b. (a -> b) -> a -> b
$ [Int] -> [Int] -> [(Int, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Int
0..] [Int]
tDropToBSDropList
  fun :: (Int, [a]) -> (a, Int) -> (Int, [a])
fun (Int
lastPos, ![a]
acc) (a
ix, Int
elem) = (Int
elem, Int -> a -> [a]
forall a. Int -> a -> [a]
replicate (Int
elem Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
lastPos) a
ix [a] -> [a] -> [a]
forall a. [a] -> [a] -> [a]
++ [a]
acc)
  utf8CharWidth :: Char -> p
utf8CharWidth Char
c
    | Int
w Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x7F = p
1
    | Int
w Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x7FF = p
2
    | Int
w Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0xFFFF = p
3
    | Bool
otherwise = p
4
    where
    w :: Int
w = Char -> Int
UText.ord Char
c
  utf16CharWidth :: Char -> p
utf16CharWidth Char
c
    | Int
w Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0x10000 = p
1
    | Bool
otherwise = p
2
    where
    w :: Int
w = Char -> Int
UText.ord Char
c

-- As regexes are matched without whitespace delimitator, we need to check
-- the reasonability of the match to actually be a word.
isRangeValid :: Document -> Int -> Int -> Bool
isRangeValid :: Document -> Int -> Int -> Bool
isRangeValid Document
doc Int
start Int
end =
  (Int
start Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 Bool -> Bool -> Bool
||
      Char -> Char -> Bool
isDifferent (Document
doc Document -> Int -> Char
! (Int
start Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)) (Document
doc Document -> Int -> Char
! Int
start)) Bool -> Bool -> Bool
&&
  (Int
end Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Document -> Int
length Document
doc Bool -> Bool -> Bool
||
      Char -> Char -> Bool
isDifferent (Document
doc Document -> Int -> Char
! (Int
end Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)) (Document
doc Document -> Int -> Char
! Int
end))
  where
    charClass :: Char -> Char
    charClass :: Char -> Char
charClass Char
c
      | Char -> Bool
Char.isLower Char
c Bool -> Bool -> Bool
|| Char -> Bool
Char.isUpper Char
c = Char
'c'
      | Char -> Bool
Char.isDigit Char
c = Char
'd'
      | Bool
otherwise = Char
c
    isDifferent :: Char -> Char -> Bool
    isDifferent :: Char -> Char -> Bool
isDifferent Char
a Char
b = Char -> Char
charClass Char
a Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char -> Char
charClass Char
b

-- True iff a is followed by whitespaces and b.
isAdjacent :: Document -> Int -> Int -> Bool
isAdjacent :: Document -> Int -> Int -> Bool
isAdjacent Document{ByteString
Text
UArray Int Char
UArray Int Int
tDropToUtf16Drop :: UArray Int Int
bsDropToTDrop :: UArray Int Int
tDropToBSDrop :: UArray Int Int
firstNonAdjacent :: UArray Int Int
indexable :: UArray Int Char
utf8Encoded :: ByteString
rawInput :: Text
tDropToUtf16Drop :: Document -> UArray Int Int
bsDropToTDrop :: Document -> UArray Int Int
tDropToBSDrop :: Document -> UArray Int Int
firstNonAdjacent :: Document -> UArray Int Int
indexable :: Document -> UArray Int Char
utf8Encoded :: Document -> ByteString
rawInput :: Document -> Text
..} Int
a Int
b =
  Int
b Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
a Bool -> Bool -> Bool
&& (UArray Int Int
firstNonAdjacent UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
a Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
b)

isAdjacentSeparator :: Char -> Bool
isAdjacentSeparator :: Char -> Bool
isAdjacentSeparator Char
c = Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
elem Char
c [Char
' ', Char
'\t']

(!) :: Document -> Int -> Char
(!) Document { indexable :: Document -> UArray Int Char
indexable = UArray Int Char
s } Int
ix = UArray Int Char
s UArray Int Char -> Int -> Char
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
ix

length :: Document -> Int
length :: Document -> Int
length Document { indexable :: Document -> UArray Int Char
indexable = UArray Int Char
s } = (Int, Int) -> Int
forall a. Ix a => (a, a) -> Int
Array.rangeSize ((Int, Int) -> Int) -> (Int, Int) -> Int
forall a b. (a -> b) -> a -> b
$ UArray Int Char -> (Int, Int)
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> (i, i)
Array.bounds UArray Int Char
s

-- | Given a document and an offset (think Text.drop offset),
-- returns a utf8 encoded substring of Document at that offset
-- and 2 translation functions:
--   rangeToText - given a range in the returned ByteString, gives
--     a corresponding subrange of the Document as Text
--   translateRange - given a start and a length of a range in the returned
--     ByteString, gives a corresponding subrange in the Document as pair
--     of (start, end) of Text.drop offsets
{-# INLINE byteStringFromPos #-}
-- if we don't inline we seem to pay for the tuple, there might be
-- an easier way
byteStringFromPos
  :: Document
  -> Int
  -> ( ByteString
     , (Int, Int) -> Text
     , Int -> Int -> (Int, Int)
     )
byteStringFromPos :: Document
-> Int
-> (ByteString, (Int, Int) -> Text, Int -> Int -> (Int, Int))
byteStringFromPos
  Document { rawInput :: Document -> Text
rawInput = Text
rawInput
           , utf8Encoded :: Document -> ByteString
utf8Encoded = ByteString
utf8Encoded
           , tDropToBSDrop :: Document -> UArray Int Int
tDropToBSDrop = UArray Int Int
tDropToBSDrop
           , bsDropToTDrop :: Document -> UArray Int Int
bsDropToTDrop = UArray Int Int
bsDropToTDrop
           , tDropToUtf16Drop :: Document -> UArray Int Int
tDropToUtf16Drop = UArray Int Int
tDropToUtf16Drop
           }
  Int
position = (ByteString
substring, (Int, Int) -> Text
rangeToText, Int -> Int -> (Int, Int)
translateRange)
  where
  -- See Note [Regular expressions and Text] to understand what's going
  -- on here
  utf8Position :: Int
utf8Position = UArray Int Int
tDropToBSDrop UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
position
  substring :: ByteString
  substring :: ByteString
substring = Int -> ByteString -> ByteString
BS.drop Int
utf8Position ByteString
utf8Encoded
  -- get a subrange of Text reusing the underlying buffer using
  -- utf16 start and end positions
  rangeToText :: (Int, Int) -> Text
  rangeToText :: (Int, Int) -> Text
rangeToText (-1, Int
_) = Text
""
  -- this is what regexec from Text.Regex.PCRE.ByteString does
  rangeToText (Int, Int)
r = Int -> Text -> Text
UText.takeWord16 (Int
end16Pos Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
start16Pos) (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$
    Int -> Text -> Text
UText.dropWord16 Int
start16Pos Text
rawInput
    where
    start16Pos :: Int
start16Pos = UArray Int Int
tDropToUtf16Drop UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
startPos
    end16Pos :: Int
end16Pos = UArray Int Int
tDropToUtf16Drop UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
endPos
    (Int
startPos, Int
endPos) = (Int -> Int -> (Int, Int)) -> (Int, Int) -> (Int, Int)
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry Int -> Int -> (Int, Int)
translateRange (Int, Int)
r
  -- from utf8 offset and length to Text character start and end position
  translateRange :: Int -> Int -> (Int, Int)
  translateRange :: Int -> Int -> (Int, Int)
translateRange !Int
bsStart !Int
bsLen = Int
startPos Int -> (Int, Int) -> (Int, Int)
`seq` Int
endPos Int -> (Int, Int) -> (Int, Int)
`seq` (Int, Int)
res
    where
    res :: (Int, Int)
res = (Int
startPos, Int
endPos)
    realBsStart :: Int
realBsStart = Int
utf8Position Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
bsStart
    realBsEnd :: Int
realBsEnd = Int
realBsStart Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
bsLen
    startPos :: Int
startPos = UArray Int Int
bsDropToTDrop UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
realBsStart
    endPos :: Int
endPos = UArray Int Int
bsDropToTDrop UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
Array.! Int
realBsEnd