{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}
module NLP.Nerf.Tokenize
(
tokenize
, Word (..)
, sync
) where
import Prelude hiding (Word)
import Control.Arrow (second)
import Control.Monad ((>=>))
import qualified Data.Char as Char
import qualified Data.List as L
import qualified Data.Tree as T
import qualified Data.Traversable as Tr
import qualified Data.Text as Text
import qualified Data.Text.Lazy as LazyText
import qualified NLP.Tokenize as Tok
import qualified Data.IntervalMap.Strict as I
import Data.Named.Tree (NeForest, NeTree)
defaultTokenizer :: Tok.Tokenizer
defaultTokenizer
= Tok.whitespace
>=> Tok.uris
>=> Tok.punctuation
tokenize :: String -> [String]
tokenize = Tok.run defaultTokenizer
class Word a where
word :: a -> String
instance Word String where
word = id
instance Word Text.Text where
word = Text.unpack
instance Word LazyText.Text where
word = LazyText.unpack
essence :: Word a => a -> Int
essence = length . filter (not . Char.isSpace) . word
{-# INLINE essence #-}
unGroupLeaves :: NeForest a [b] -> NeForest a b
unGroupLeaves = concatMap unGroupLeavesT
unGroupLeavesT :: NeTree a [b] -> [NeTree a b]
unGroupLeavesT (T.Node (Left v) xs) =
[T.Node (Left v) (unGroupLeaves xs)]
unGroupLeavesT (T.Node (Right vs) _) =
[T.Node (Right v) [] | v <- vs]
type Range = I.Interval Int
ranged :: Word a => Int -> a -> (Int, (Range, a))
ranged p w =
(q, (i, w))
where
q = p + essence w
i = I.IntervalCO p q
rangedList :: Word a => [a] -> [(Range, a)]
rangedList = snd . L.mapAccumL ranged 0
rangedForest :: Word b => NeForest a b -> NeForest a (Range, b)
rangedForest =
snd . L.mapAccumL (Tr.mapAccumL f) 0
where
f acc (Left x) = (acc, Left x)
f acc (Right x) =
let (acc', y) = ranged acc x
in (acc', Right y)
replaceToks
:: I.IntervalMap Int c
-> NeForest a (Range, b)
-> ( I.IntervalMap Int c
, NeForest a (Range, c) )
replaceToks ivMap nes
= second unGroupLeaves
$ L.mapAccumL (Tr.mapAccumL replace) ivMap nes
where
replace im (Left x) = (im, Left x)
replace im (Right (ran, _)) =
let rsXs = I.assocs $ I.intersecting im ran
im' = L.foldl' (flip I.delete) im (map fst rsXs)
in (im', Right rsXs)
liftRange :: NeTree a (Range, b) -> (Range, NeTree a b)
liftRange (T.Node (Left v) xs) =
(ran, T.Node (Left v) (map snd ys))
where
ys = map liftRange xs
ran = maybeHead $ map fst ys
maybeHead (x:_) = x
maybeHead [] = error "liftRange: invalid NE tree"
liftRange (T.Node (Right (ran, v)) _) = (ran, T.Node (Right v) [])
sync
:: (Word b, Word c)
=> NeForest a b
-> [c]
-> NeForest a c
sync nes0 xs0
= map snd . I.toList . I.fromList
$ map (second mkLeaf) (I.toList ivMap')
++ map liftRange nes'
where
ivMap = I.fromList $ rangedList xs0
nes = filter internal $ rangedForest nes0
(ivMap', nes') = replaceToks ivMap nes
internal x = case T.rootLabel x of
Left _ -> True
Right _ -> False
mkLeaf x = T.Node (Right x) []