{-# LANGUAGE OverloadedStrings #-}
module NLP.Tokenize.Text
( EitherList(..)
, Tokenizer
, tokenize
, run
, defaultTokenizer
, whitespace
, uris
, punctuation
, finalPunctuation
, initialPunctuation
, allPunctuation
, contractions
, negatives
)
where
import qualified Data.Char as Char
import Data.Maybe
import Control.Applicative
import Control.Monad
import Data.Text (Text)
import qualified Data.Text as T
type Tokenizer = Text -> EitherList Text Text
newtype EitherList a b = E { forall a b. EitherList a b -> [Either a b]
unE :: [Either a b] }
tokenize :: Text -> [Text]
tokenize :: Text -> [Text]
tokenize = Tokenizer -> Text -> [Text]
run Tokenizer
defaultTokenizer
run :: Tokenizer -> (Text -> [Text])
run :: Tokenizer -> Text -> [Text]
run Tokenizer
f = \Text
txt -> (Text -> Text) -> [Text] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map Text -> Text
T.copy ([Text] -> [Text]) -> [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ ((Either Text Text -> Text) -> [Either Text Text] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map Either Text Text -> Text
forall a. Either a a -> a
unwrap ([Either Text Text] -> [Text])
-> (Text -> [Either Text Text]) -> Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. EitherList Text Text -> [Either Text Text]
forall a b. EitherList a b -> [Either a b]
unE (EitherList Text Text -> [Either Text Text])
-> Tokenizer -> Text -> [Either Text Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Tokenizer
f) Text
txt
defaultTokenizer :: Tokenizer
defaultTokenizer :: Tokenizer
defaultTokenizer = Tokenizer
whitespace
Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
uris
Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
punctuation
Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
contractions
Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
negatives
uris :: Tokenizer
uris :: Tokenizer
uris Text
x | Text -> Bool
isUri Text
x = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [Text -> Either Text Text
forall a b. a -> Either a b
Left Text
x]
| Bool
True = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [Text -> Either Text Text
forall a b. b -> Either a b
Right Text
x]
where isUri :: Text -> Bool
isUri Text
u = (Text -> Bool) -> [Text] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Text -> Text -> Bool
`T.isPrefixOf` Text
u) [Text
"http://",Text
"ftp://",Text
"mailto:"]
punctuation :: Tokenizer
punctuation :: Tokenizer
punctuation = Tokenizer
finalPunctuation Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
initialPunctuation
hyphens :: Tokenizer
hyphens :: Tokenizer
hyphens Text
xs = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [Text -> Either Text Text
forall a b. b -> Either a b
Right Text
w | Text
w <- (Char -> Bool) -> Text -> [Text]
T.split (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'-') Text
xs ]
finalPunctuation :: Tokenizer
finalPunctuation :: Tokenizer
finalPunctuation Text
x = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E ([Either Text Text] -> EitherList Text Text)
-> [Either Text Text] -> EitherList Text Text
forall a b. (a -> b) -> a -> b
$ (Either Text Text -> Bool)
-> [Either Text Text] -> [Either Text Text]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool)
-> (Either Text Text -> Bool) -> Either Text Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Bool
T.null (Text -> Bool)
-> (Either Text Text -> Text) -> Either Text Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Either Text Text -> Text
forall a. Either a a -> a
unwrap) [Either Text Text]
res
where
res :: [Either Text Text]
res :: [Either Text Text]
res = case (Char -> Bool) -> Text -> (Text, Text)
T.span Char -> Bool
Char.isPunctuation (Text -> Text
T.reverse Text
x) of
(Text
ps, Text
w) | Text -> Bool
T.null Text
ps -> [ Text -> Either Text Text
forall a b. b -> Either a b
Right (Text -> Either Text Text) -> Text -> Either Text Text
forall a b. (a -> b) -> a -> b
$ Text -> Text
T.reverse Text
w ]
| Bool
otherwise -> [ Text -> Either Text Text
forall a b. b -> Either a b
Right (Text -> Either Text Text) -> Text -> Either Text Text
forall a b. (a -> b) -> a -> b
$ Text -> Text
T.reverse Text
w
, Text -> Either Text Text
forall a b. b -> Either a b
Right (Text -> Either Text Text) -> Text -> Either Text Text
forall a b. (a -> b) -> a -> b
$ Text -> Text
T.reverse Text
ps]
initialPunctuation :: Tokenizer
initialPunctuation :: Tokenizer
initialPunctuation Text
x = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E ([Either Text Text] -> EitherList Text Text)
-> [Either Text Text] -> EitherList Text Text
forall a b. (a -> b) -> a -> b
$ (Either Text Text -> Bool)
-> [Either Text Text] -> [Either Text Text]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool)
-> (Either Text Text -> Bool) -> Either Text Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Bool
T.null (Text -> Bool)
-> (Either Text Text -> Text) -> Either Text Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Either Text Text -> Text
forall a. Either a a -> a
unwrap) ([Either Text Text] -> [Either Text Text])
-> [Either Text Text] -> [Either Text Text]
forall a b. (a -> b) -> a -> b
$
case (Char -> Bool) -> Text -> (Text, Text)
T.span Char -> Bool
Char.isPunctuation Text
x of
(Text
ps,Text
w) | Text -> Bool
T.null Text
ps -> [ Text -> Either Text Text
forall a b. b -> Either a b
Right Text
w ]
| Bool
otherwise -> [ Text -> Either Text Text
forall a b. b -> Either a b
Right Text
ps
, Text -> Either Text Text
forall a b. b -> Either a b
Right Text
w ]
allPunctuation :: Tokenizer
allPunctuation :: Tokenizer
allPunctuation = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E ([Either Text Text] -> EitherList Text Text)
-> (Text -> [Either Text Text]) -> Tokenizer
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Text -> Either Text Text) -> [Text] -> [Either Text Text]
forall a b. (a -> b) -> [a] -> [b]
map Text -> Either Text Text
forall a b. b -> Either a b
Right
([Text] -> [Either Text Text])
-> (Text -> [Text]) -> Text -> [Either Text Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char -> Bool) -> Text -> [Text]
T.groupBy (\Char
a Char
b -> Char -> Bool
Char.isPunctuation Char
a Bool -> Bool -> Bool
forall a. Eq a => a -> a -> Bool
== Char -> Bool
Char.isPunctuation Char
b)
negatives :: Tokenizer
negatives :: Tokenizer
negatives Text
x | Text
"n't" Text -> Text -> Bool
`T.isSuffixOf` Text
x = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [ Text -> Either Text Text
forall a b. b -> Either a b
Right (Text -> Either Text Text)
-> (Text -> Text) -> Text -> Either Text Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
T.reverse (Text -> Text) -> (Text -> Text) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Text -> Text
T.drop Int
3 (Text -> Text) -> (Text -> Text) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
T.reverse (Text -> Either Text Text) -> Text -> Either Text Text
forall a b. (a -> b) -> a -> b
$ Text
x
, Text -> Either Text Text
forall a b. a -> Either a b
Left Text
"n't" ]
| Bool
True = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [ Text -> Either Text Text
forall a b. b -> Either a b
Right Text
x ]
contractions :: Tokenizer
contractions :: Tokenizer
contractions Text
x = case [Maybe (Text, Text)] -> [(Text, Text)]
forall a. [Maybe a] -> [a]
catMaybes ([Maybe (Text, Text)] -> [(Text, Text)])
-> ([Text] -> [Maybe (Text, Text)]) -> [Text] -> [(Text, Text)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Text -> Maybe (Text, Text)) -> [Text] -> [Maybe (Text, Text)]
forall a b. (a -> b) -> [a] -> [b]
map (Text -> Text -> Maybe (Text, Text)
splitSuffix Text
x) ([Text] -> [(Text, Text)]) -> [Text] -> [(Text, Text)]
forall a b. (a -> b) -> a -> b
$ [Text]
cts of
[] -> Tokenizer
forall a. a -> EitherList Text a
forall (m :: * -> *) a. Monad m => a -> m a
return Text
x
((Text
w,Text
s):[(Text, Text)]
_) -> [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [ Text -> Either Text Text
forall a b. b -> Either a b
Right Text
w,Text -> Either Text Text
forall a b. a -> Either a b
Left Text
s]
where cts :: [Text]
cts = [Text
"'m",Text
"'s",Text
"'d",Text
"'ve",Text
"'ll"]
splitSuffix :: Text -> Text -> Maybe (Text, Text)
splitSuffix Text
w Text
sfx =
let w' :: Text
w' = Text -> Text
T.reverse Text
w
len :: Int
len = Text -> Int
T.length Text
sfx
in if Text
sfx Text -> Text -> Bool
`T.isSuffixOf` Text
w
then (Text, Text) -> Maybe (Text, Text)
forall a. a -> Maybe a
Just (Int -> Text -> Text
T.take (Text -> Int
T.length Text
w Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len) Text
w, Text -> Text
T.reverse (Text -> Text) -> (Text -> Text) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Text -> Text
T.take Int
len (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$ Text
w')
else Maybe (Text, Text)
forall a. Maybe a
Nothing
whitespace :: Tokenizer
whitespace :: Tokenizer
whitespace Text
xs = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [Text -> Either Text Text
forall a b. b -> Either a b
Right Text
w | Text
w <- Text -> [Text]
T.words Text
xs ]
instance Monad (EitherList a) where
E [Either a a]
xs >>= :: forall a b.
EitherList a a -> (a -> EitherList a b) -> EitherList a b
>>= a -> EitherList a b
f = [Either a b] -> EitherList a b
forall a b. [Either a b] -> EitherList a b
E ([Either a b] -> EitherList a b) -> [Either a b] -> EitherList a b
forall a b. (a -> b) -> a -> b
$ (Either a a -> [Either a b]) -> [Either a a] -> [Either a b]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ((a -> [Either a b])
-> (a -> [Either a b]) -> Either a a -> [Either a b]
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (Either a b -> [Either a b]
forall a. a -> [a]
forall (m :: * -> *) a. Monad m => a -> m a
return (Either a b -> [Either a b])
-> (a -> Either a b) -> a -> [Either a b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> Either a b
forall a b. a -> Either a b
Left) (EitherList a b -> [Either a b]
forall a b. EitherList a b -> [Either a b]
unE (EitherList a b -> [Either a b])
-> (a -> EitherList a b) -> a -> [Either a b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> EitherList a b
f)) [Either a a]
xs
instance Applicative (EitherList a) where
pure :: forall a. a -> EitherList a a
pure a
x = [Either a a] -> EitherList a a
forall a b. [Either a b] -> EitherList a b
E [a -> Either a a
forall a b. b -> Either a b
Right a
x]
EitherList a (a -> b)
f <*> :: forall a b.
EitherList a (a -> b) -> EitherList a a -> EitherList a b
<*> EitherList a a
x = EitherList a (a -> b)
f EitherList a (a -> b) -> EitherList a a -> EitherList a b
forall (m :: * -> *) a b. Monad m => m (a -> b) -> m a -> m b
`ap` EitherList a a
x
instance Functor (EitherList a) where
fmap :: forall a b. (a -> b) -> EitherList a a -> EitherList a b
fmap a -> b
f (E [Either a a]
xs) = [Either a b] -> EitherList a b
forall a b. [Either a b] -> EitherList a b
E ([Either a b] -> EitherList a b) -> [Either a b] -> EitherList a b
forall a b. (a -> b) -> a -> b
$ ((Either a a -> Either a b) -> [Either a a] -> [Either a b]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Either a a -> Either a b) -> [Either a a] -> [Either a b])
-> ((a -> b) -> Either a a -> Either a b)
-> (a -> b)
-> [Either a a]
-> [Either a b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (a -> b) -> Either a a -> Either a b
forall a b. (a -> b) -> Either a a -> Either a b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap) a -> b
f [Either a a]
xs
unwrap :: Either a a -> a
unwrap :: forall a. Either a a -> a
unwrap (Left a
x) = a
x
unwrap (Right a
x) = a
x
examples :: [Text]
examples :: [Text]
examples =
[Text
"This shouldn't happen."
,Text
"Some 'quoted' stuff"
,Text
"This is a URL: http://example.org."
,Text
"How about an email@example.com"
,Text
"ReferenceError #1065 broke my debugger!"
,Text
"I would've gone."
,Text
"They've been there."
,Text
"Hyphen-words"
,Text
"Yes/No questions"
]