module NLP.Tokenize.String
( EitherList(..)
, Tokenizer
, tokenize
, run
, defaultTokenizer
, whitespace
, uris
, punctuation
, finalPunctuation
, initialPunctuation
, allPunctuation
, contractions
, negatives
)
where
import qualified Data.Char as Char
import Data.List
import Data.Maybe
import Control.Applicative
import Data.List.Split
import Control.Monad
type Tokenizer = String -> EitherList String String
newtype EitherList a b = E { forall a b. EitherList a b -> [Either a b]
unE :: [Either a b] }
tokenize :: String -> [String]
tokenize :: String -> [String]
tokenize = Tokenizer -> String -> [String]
run Tokenizer
defaultTokenizer
run :: Tokenizer -> (String -> [String])
run :: Tokenizer -> String -> [String]
run Tokenizer
f = (Either String String -> String)
-> [Either String String] -> [String]
forall a b. (a -> b) -> [a] -> [b]
map Either String String -> String
forall {a}. Either a a -> a
unwrap ([Either String String] -> [String])
-> (String -> [Either String String]) -> String -> [String]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. EitherList String String -> [Either String String]
forall a b. EitherList a b -> [Either a b]
unE (EitherList String String -> [Either String String])
-> Tokenizer -> String -> [Either String String]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Tokenizer
f
defaultTokenizer :: Tokenizer
defaultTokenizer :: Tokenizer
defaultTokenizer = Tokenizer
whitespace
Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
uris
Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
punctuation
Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
contractions
Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
negatives
uris :: Tokenizer
uris :: Tokenizer
uris String
x | String -> Bool
isUri String
x = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E [String -> Either String String
forall a b. a -> Either a b
Left String
x]
| Bool
True = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E [String -> Either String String
forall a b. b -> Either a b
Right String
x]
where isUri :: String -> Bool
isUri String
x = (String -> Bool) -> [String] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (String -> String -> Bool
forall a. Eq a => [a] -> [a] -> Bool
`isPrefixOf` String
x) [String
"http://",String
"ftp://",String
"mailto:"]
punctuation :: Tokenizer
punctuation :: Tokenizer
punctuation = Tokenizer
finalPunctuation Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
initialPunctuation
finalPunctuation :: Tokenizer
finalPunctuation :: Tokenizer
finalPunctuation String
x = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E ([Either String String] -> EitherList String String)
-> ([Either String String] -> [Either String String])
-> [Either String String]
-> EitherList String String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Either String String -> Bool)
-> [Either String String] -> [Either String String]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool)
-> (Either String String -> Bool) -> Either String String -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null (String -> Bool)
-> (Either String String -> String) -> Either String String -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Either String String -> String
forall {a}. Either a a -> a
unwrap) ([Either String String] -> EitherList String String)
-> [Either String String] -> EitherList String String
forall a b. (a -> b) -> a -> b
$
case (Char -> Bool) -> String -> (String, String)
forall a. (a -> Bool) -> [a] -> ([a], [a])
span Char -> Bool
Char.isPunctuation (String -> (String, String))
-> (String -> String) -> String -> (String, String)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
forall a. [a] -> [a]
reverse (String -> (String, String)) -> String -> (String, String)
forall a b. (a -> b) -> a -> b
$ String
x of
([],String
w) -> [String -> Either String String
forall a b. b -> Either a b
Right (String -> Either String String)
-> (String -> String) -> String -> Either String String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
forall a. [a] -> [a]
reverse (String -> Either String String) -> String -> Either String String
forall a b. (a -> b) -> a -> b
$ String
w]
(String
ps,String
w) -> [String -> Either String String
forall a b. b -> Either a b
Right (String -> Either String String)
-> (String -> String) -> String -> Either String String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
forall a. [a] -> [a]
reverse (String -> Either String String) -> String -> Either String String
forall a b. (a -> b) -> a -> b
$ String
w, String -> Either String String
forall a b. b -> Either a b
Right (String -> Either String String)
-> (String -> String) -> String -> Either String String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
forall a. [a] -> [a]
reverse (String -> Either String String) -> String -> Either String String
forall a b. (a -> b) -> a -> b
$ String
ps]
initialPunctuation :: Tokenizer
initialPunctuation :: Tokenizer
initialPunctuation String
x = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E ([Either String String] -> EitherList String String)
-> ([Either String String] -> [Either String String])
-> [Either String String]
-> EitherList String String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Either String String -> Bool)
-> [Either String String] -> [Either String String]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool)
-> (Either String String -> Bool) -> Either String String -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null (String -> Bool)
-> (Either String String -> String) -> Either String String -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Either String String -> String
forall {a}. Either a a -> a
unwrap) ([Either String String] -> EitherList String String)
-> [Either String String] -> EitherList String String
forall a b. (a -> b) -> a -> b
$
case (Char -> Bool) -> String -> (String, String)
forall a. (a -> Bool) -> [a] -> ([a], [a])
span Char -> Bool
Char.isPunctuation(String -> (String, String)) -> String -> (String, String)
forall a b. (a -> b) -> a -> b
$ String
x of
([],String
w) -> [String -> Either String String
forall a b. b -> Either a b
Right String
w]
(String
ps,String
w) -> [String -> Either String String
forall a b. b -> Either a b
Right String
ps, String -> Either String String
forall a b. b -> Either a b
Right String
w]
allPunctuation :: Tokenizer
allPunctuation :: Tokenizer
allPunctuation = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E ([Either String String] -> EitherList String String)
-> (String -> [Either String String]) -> Tokenizer
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (String -> Either String String)
-> [String] -> [Either String String]
forall a b. (a -> b) -> [a] -> [b]
map String -> Either String String
forall a b. b -> Either a b
Right
([String] -> [Either String String])
-> (String -> [String]) -> String -> [Either String String]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char -> Bool) -> String -> [String]
forall a. (a -> a -> Bool) -> [a] -> [[a]]
groupBy (\Char
a Char
b -> Char -> Bool
Char.isPunctuation Char
a Bool -> Bool -> Bool
forall a. Eq a => a -> a -> Bool
== Char -> Bool
Char.isPunctuation Char
b)
negatives :: Tokenizer
negatives :: Tokenizer
negatives String
x | String
"n't" String -> String -> Bool
forall a. Eq a => [a] -> [a] -> Bool
`isSuffixOf` String
x = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E [ String -> Either String String
forall a b. b -> Either a b
Right (String -> Either String String)
-> (String -> String) -> String -> Either String String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
forall a. [a] -> [a]
reverse (String -> String) -> (String -> String) -> String -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> String -> String
forall a. Int -> [a] -> [a]
drop Int
3 (String -> String) -> (String -> String) -> String -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
forall a. [a] -> [a]
reverse (String -> Either String String) -> String -> Either String String
forall a b. (a -> b) -> a -> b
$ String
x
, String -> Either String String
forall a b. a -> Either a b
Left String
"n't" ]
| Bool
True = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E [String -> Either String String
forall a b. b -> Either a b
Right String
x]
contractions :: Tokenizer
contractions :: Tokenizer
contractions String
x = case [Maybe (String, String)] -> [(String, String)]
forall a. [Maybe a] -> [a]
catMaybes ([Maybe (String, String)] -> [(String, String)])
-> ([String] -> [Maybe (String, String)])
-> [String]
-> [(String, String)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (String -> Maybe (String, String))
-> [String] -> [Maybe (String, String)]
forall a b. (a -> b) -> [a] -> [b]
map (String -> String -> Maybe (String, String)
forall {a}. Eq a => [a] -> [a] -> Maybe ([a], [a])
splitSuffix String
x) ([String] -> [(String, String)]) -> [String] -> [(String, String)]
forall a b. (a -> b) -> a -> b
$ [String]
cts of
[] -> Tokenizer
forall a. a -> EitherList String a
forall (m :: * -> *) a. Monad m => a -> m a
return String
x
((String
w,String
s):[(String, String)]
_) -> [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E [ String -> Either String String
forall a b. b -> Either a b
Right String
w,String -> Either String String
forall a b. a -> Either a b
Left String
s]
where cts :: [String]
cts = [String
"'m",String
"'s",String
"'d",String
"'ve",String
"'ll"]
splitSuffix :: [a] -> [a] -> Maybe ([a], [a])
splitSuffix [a]
w [a]
sfx =
let w' :: [a]
w' = [a] -> [a]
forall a. [a] -> [a]
reverse [a]
w
len :: Int
len = [a] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [a]
sfx
in if [a]
sfx [a] -> [a] -> Bool
forall a. Eq a => [a] -> [a] -> Bool
`isSuffixOf` [a]
w
then ([a], [a]) -> Maybe ([a], [a])
forall a. a -> Maybe a
Just (Int -> [a] -> [a]
forall a. Int -> [a] -> [a]
take ([a] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [a]
w Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len) [a]
w, [a] -> [a]
forall a. [a] -> [a]
reverse ([a] -> [a]) -> ([a] -> [a]) -> [a] -> [a]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [a] -> [a]
forall a. Int -> [a] -> [a]
take Int
len ([a] -> [a]) -> [a] -> [a]
forall a b. (a -> b) -> a -> b
$ [a]
w')
else Maybe ([a], [a])
forall a. Maybe a
Nothing
whitespace :: Tokenizer
whitespace :: Tokenizer
whitespace String
xs = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E [String -> Either String String
forall a b. b -> Either a b
Right String
w | String
w <- String -> [String]
words String
xs ]
instance Monad (EitherList a) where
E [Either a a]
xs >>= :: forall a b.
EitherList a a -> (a -> EitherList a b) -> EitherList a b
>>= a -> EitherList a b
f = [Either a b] -> EitherList a b
forall a b. [Either a b] -> EitherList a b
E ([Either a b] -> EitherList a b) -> [Either a b] -> EitherList a b
forall a b. (a -> b) -> a -> b
$ (Either a a -> [Either a b]) -> [Either a a] -> [Either a b]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ((a -> [Either a b])
-> (a -> [Either a b]) -> Either a a -> [Either a b]
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (Either a b -> [Either a b]
forall a. a -> [a]
forall (m :: * -> *) a. Monad m => a -> m a
return (Either a b -> [Either a b])
-> (a -> Either a b) -> a -> [Either a b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> Either a b
forall a b. a -> Either a b
Left) (EitherList a b -> [Either a b]
forall a b. EitherList a b -> [Either a b]
unE (EitherList a b -> [Either a b])
-> (a -> EitherList a b) -> a -> [Either a b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> EitherList a b
f)) [Either a a]
xs
instance Applicative (EitherList a) where
pure :: forall a. a -> EitherList a a
pure a
x = [Either a a] -> EitherList a a
forall a b. [Either a b] -> EitherList a b
E [a -> Either a a
forall a b. b -> Either a b
Right a
x]
EitherList a (a -> b)
f <*> :: forall a b.
EitherList a (a -> b) -> EitherList a a -> EitherList a b
<*> EitherList a a
x = EitherList a (a -> b)
f EitherList a (a -> b) -> EitherList a a -> EitherList a b
forall (m :: * -> *) a b. Monad m => m (a -> b) -> m a -> m b
`ap` EitherList a a
x
instance Functor (EitherList a) where
fmap :: forall a b. (a -> b) -> EitherList a a -> EitherList a b
fmap a -> b
f (E [Either a a]
xs) = [Either a b] -> EitherList a b
forall a b. [Either a b] -> EitherList a b
E ([Either a b] -> EitherList a b) -> [Either a b] -> EitherList a b
forall a b. (a -> b) -> a -> b
$ ((Either a a -> Either a b) -> [Either a a] -> [Either a b]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Either a a -> Either a b) -> [Either a a] -> [Either a b])
-> ((a -> b) -> Either a a -> Either a b)
-> (a -> b)
-> [Either a a]
-> [Either a b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (a -> b) -> Either a a -> Either a b
forall a b. (a -> b) -> Either a a -> Either a b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap) a -> b
f [Either a a]
xs
unwrap :: Either a a -> a
unwrap (Left a
x) = a
x
unwrap (Right a
x) = a
x
examples :: [String]
examples =
[String
"This shouldn't happen."
,String
"Some 'quoted' stuff"
,String
"This is a URL: http://example.org."
,String
"How about an email@example.com"
,String
"ReferenceError #1065 broke my debugger!"
,String
"I would've gone."
,String
"They've been there."
,String
"Hyphen-words"
,String
"Yes/No questions"
]