module NLP.Tokenize.String
    ( EitherList(..)
    , Tokenizer
    , tokenize
    , run
    , defaultTokenizer
    , whitespace
    , uris
    , punctuation
    , finalPunctuation
    , initialPunctuation
    , allPunctuation
    , contractions
    , negatives
    )
where

import qualified Data.Char as Char
import Data.List
import Data.Maybe
import Control.Applicative
import Data.List.Split
import Control.Monad

-- | A Tokenizer is function which takes a list and returns a list of Eithers
--  (wrapped in a newtype). Right Strings will be passed on for processing
--  to tokenizers down
--  the pipeline. Left Strings will be passed through the pipeline unchanged.
--  Use a Left String in a tokenizer to protect certain tokens from further
--  processing (e.g. see the 'uris' tokenizer).
--  You can define your own custom tokenizer pipelines by chaining tokenizers together:
---
-- > myTokenizer :: Tokenizer
-- > myTokenizer = whitespace >=> allPunctuation
---

type Tokenizer =  String -> EitherList String String

-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b =  E { forall a b. EitherList a b -> [Either a b]
unE :: [Either a b] }

-- | Split string into words using the default tokenizer pipeline
tokenize :: String -> [String]
tokenize :: String -> [String]
tokenize  = Tokenizer -> String -> [String]
run Tokenizer
defaultTokenizer

-- | Run a tokenizer
run :: Tokenizer -> (String -> [String])
run :: Tokenizer -> String -> [String]
run Tokenizer
f = (Either String String -> String)
-> [Either String String] -> [String]
forall a b. (a -> b) -> [a] -> [b]
map Either String String -> String
forall {a}. Either a a -> a
unwrap ([Either String String] -> [String])
-> (String -> [Either String String]) -> String -> [String]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. EitherList String String -> [Either String String]
forall a b. EitherList a b -> [Either a b]
unE (EitherList String String -> [Either String String])
-> Tokenizer -> String -> [Either String String]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Tokenizer
f

defaultTokenizer :: Tokenizer
defaultTokenizer :: Tokenizer
defaultTokenizer =     Tokenizer
whitespace
                   Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
uris
                   Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
punctuation
                   Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
contractions
                   Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
negatives

-- | Detect common uris and freeze them
uris :: Tokenizer
uris :: Tokenizer
uris String
x | String -> Bool
isUri String
x = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E [String -> Either String String
forall a b. a -> Either a b
Left String
x]
       | Bool
True    = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E [String -> Either String String
forall a b. b -> Either a b
Right String
x]
    where isUri :: String -> Bool
isUri String
x = (String -> Bool) -> [String] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (String -> String -> Bool
forall a. Eq a => [a] -> [a] -> Bool
`isPrefixOf` String
x) [String
"http://",String
"ftp://",String
"mailto:"]

-- | Split off initial and final punctuation
punctuation :: Tokenizer
punctuation :: Tokenizer
punctuation = Tokenizer
finalPunctuation Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
initialPunctuation

-- | Split off word-final punctuation
finalPunctuation :: Tokenizer
finalPunctuation :: Tokenizer
finalPunctuation String
x = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E ([Either String String] -> EitherList String String)
-> ([Either String String] -> [Either String String])
-> [Either String String]
-> EitherList String String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Either String String -> Bool)
-> [Either String String] -> [Either String String]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool)
-> (Either String String -> Bool) -> Either String String -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null (String -> Bool)
-> (Either String String -> String) -> Either String String -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Either String String -> String
forall {a}. Either a a -> a
unwrap) ([Either String String] -> EitherList String String)
-> [Either String String] -> EitherList String String
forall a b. (a -> b) -> a -> b
$
    case (Char -> Bool) -> String -> (String, String)
forall a. (a -> Bool) -> [a] -> ([a], [a])
span Char -> Bool
Char.isPunctuation (String -> (String, String))
-> (String -> String) -> String -> (String, String)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
forall a. [a] -> [a]
reverse (String -> (String, String)) -> String -> (String, String)
forall a b. (a -> b) -> a -> b
$ String
x of
      ([],String
w) -> [String -> Either String String
forall a b. b -> Either a b
Right (String -> Either String String)
-> (String -> String) -> String -> Either String String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
forall a. [a] -> [a]
reverse (String -> Either String String) -> String -> Either String String
forall a b. (a -> b) -> a -> b
$ String
w]
      (String
ps,String
w) -> [String -> Either String String
forall a b. b -> Either a b
Right (String -> Either String String)
-> (String -> String) -> String -> Either String String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
forall a. [a] -> [a]
reverse (String -> Either String String) -> String -> Either String String
forall a b. (a -> b) -> a -> b
$ String
w, String -> Either String String
forall a b. b -> Either a b
Right (String -> Either String String)
-> (String -> String) -> String -> Either String String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
forall a. [a] -> [a]
reverse (String -> Either String String) -> String -> Either String String
forall a b. (a -> b) -> a -> b
$ String
ps]

-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer
initialPunctuation :: Tokenizer
initialPunctuation String
x = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E ([Either String String] -> EitherList String String)
-> ([Either String String] -> [Either String String])
-> [Either String String]
-> EitherList String String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Either String String -> Bool)
-> [Either String String] -> [Either String String]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool)
-> (Either String String -> Bool) -> Either String String -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null (String -> Bool)
-> (Either String String -> String) -> Either String String -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Either String String -> String
forall {a}. Either a a -> a
unwrap) ([Either String String] -> EitherList String String)
-> [Either String String] -> EitherList String String
forall a b. (a -> b) -> a -> b
$
    case (Char -> Bool) -> String -> (String, String)
forall a. (a -> Bool) -> [a] -> ([a], [a])
span Char -> Bool
Char.isPunctuation(String -> (String, String)) -> String -> (String, String)
forall a b. (a -> b) -> a -> b
$ String
x of
      ([],String
w) -> [String -> Either String String
forall a b. b -> Either a b
Right String
w]
      (String
ps,String
w) -> [String -> Either String String
forall a b. b -> Either a b
Right String
ps, String -> Either String String
forall a b. b -> Either a b
Right String
w]

-- | Split tokens on transitions between punctuation and
-- non-punctuation characters. This tokenizer is not included in
-- defaultTokenizer pipeline because dealing with word-internal
-- punctuation is quite application specific.
allPunctuation :: Tokenizer
allPunctuation :: Tokenizer
allPunctuation = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E ([Either String String] -> EitherList String String)
-> (String -> [Either String String]) -> Tokenizer
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (String -> Either String String)
-> [String] -> [Either String String]
forall a b. (a -> b) -> [a] -> [b]
map String -> Either String String
forall a b. b -> Either a b
Right
                 ([String] -> [Either String String])
-> (String -> [String]) -> String -> [Either String String]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char -> Bool) -> String -> [String]
forall a. (a -> a -> Bool) -> [a] -> [[a]]
groupBy (\Char
a Char
b -> Char -> Bool
Char.isPunctuation Char
a Bool -> Bool -> Bool
forall a. Eq a => a -> a -> Bool
== Char -> Bool
Char.isPunctuation Char
b)

-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
negatives :: Tokenizer
negatives String
x | String
"n't" String -> String -> Bool
forall a. Eq a => [a] -> [a] -> Bool
`isSuffixOf` String
x = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E [ String -> Either String String
forall a b. b -> Either a b
Right (String -> Either String String)
-> (String -> String) -> String -> Either String String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
forall a. [a] -> [a]
reverse (String -> String) -> (String -> String) -> String -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> String -> String
forall a. Int -> [a] -> [a]
drop Int
3 (String -> String) -> (String -> String) -> String -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
forall a. [a] -> [a]
reverse (String -> Either String String) -> String -> Either String String
forall a b. (a -> b) -> a -> b
$ String
x
                                       , String -> Either String String
forall a b. a -> Either a b
Left String
"n't" ]
            | Bool
True                 = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E [String -> Either String String
forall a b. b -> Either a b
Right String
x]

-- | Split common contractions off and freeze them.
-- | Currently deals with: 'm, 's, 'd, 've, 'll
contractions :: Tokenizer
contractions :: Tokenizer
contractions String
x = case [Maybe (String, String)] -> [(String, String)]
forall a. [Maybe a] -> [a]
catMaybes ([Maybe (String, String)] -> [(String, String)])
-> ([String] -> [Maybe (String, String)])
-> [String]
-> [(String, String)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (String -> Maybe (String, String))
-> [String] -> [Maybe (String, String)]
forall a b. (a -> b) -> [a] -> [b]
map (String -> String -> Maybe (String, String)
forall {a}. Eq a => [a] -> [a] -> Maybe ([a], [a])
splitSuffix String
x) ([String] -> [(String, String)]) -> [String] -> [(String, String)]
forall a b. (a -> b) -> a -> b
$ [String]
cts of
                   [] -> Tokenizer
forall a. a -> EitherList String a
forall (m :: * -> *) a. Monad m => a -> m a
return String
x
                   ((String
w,String
s):[(String, String)]
_) -> [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E [ String -> Either String String
forall a b. b -> Either a b
Right String
w,String -> Either String String
forall a b. a -> Either a b
Left String
s]
    where cts :: [String]
cts = [String
"'m",String
"'s",String
"'d",String
"'ve",String
"'ll"]
          splitSuffix :: [a] -> [a] -> Maybe ([a], [a])
splitSuffix [a]
w [a]
sfx =
              let w' :: [a]
w' = [a] -> [a]
forall a. [a] -> [a]
reverse [a]
w
                  len :: Int
len = [a] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [a]
sfx
              in if [a]
sfx [a] -> [a] -> Bool
forall a. Eq a => [a] -> [a] -> Bool
`isSuffixOf` [a]
w
                 then ([a], [a]) -> Maybe ([a], [a])
forall a. a -> Maybe a
Just (Int -> [a] -> [a]
forall a. Int -> [a] -> [a]
take ([a] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [a]
w Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len) [a]
w, [a] -> [a]
forall a. [a] -> [a]
reverse ([a] -> [a]) -> ([a] -> [a]) -> [a] -> [a]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [a] -> [a]
forall a. Int -> [a] -> [a]
take Int
len ([a] -> [a]) -> [a] -> [a]
forall a b. (a -> b) -> a -> b
$ [a]
w')
                 else Maybe ([a], [a])
forall a. Maybe a
Nothing


-- | Split string on whitespace. This is just a wrapper for Data.List.words
whitespace :: Tokenizer
whitespace :: Tokenizer
whitespace String
xs = [Either String String] -> EitherList String String
forall a b. [Either a b] -> EitherList a b
E [String -> Either String String
forall a b. b -> Either a b
Right String
w | String
w <- String -> [String]
words String
xs ]

instance Monad (EitherList a) where
    E [Either a a]
xs >>= :: forall a b.
EitherList a a -> (a -> EitherList a b) -> EitherList a b
>>= a -> EitherList a b
f = [Either a b] -> EitherList a b
forall a b. [Either a b] -> EitherList a b
E ([Either a b] -> EitherList a b) -> [Either a b] -> EitherList a b
forall a b. (a -> b) -> a -> b
$ (Either a a -> [Either a b]) -> [Either a a] -> [Either a b]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ((a -> [Either a b])
-> (a -> [Either a b]) -> Either a a -> [Either a b]
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (Either a b -> [Either a b]
forall a. a -> [a]
forall (m :: * -> *) a. Monad m => a -> m a
return (Either a b -> [Either a b])
-> (a -> Either a b) -> a -> [Either a b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> Either a b
forall a b. a -> Either a b
Left) (EitherList a b -> [Either a b]
forall a b. EitherList a b -> [Either a b]
unE (EitherList a b -> [Either a b])
-> (a -> EitherList a b) -> a -> [Either a b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> EitherList a b
f)) [Either a a]
xs

instance Applicative (EitherList a) where
    pure :: forall a. a -> EitherList a a
pure a
x = [Either a a] -> EitherList a a
forall a b. [Either a b] -> EitherList a b
E [a -> Either a a
forall a b. b -> Either a b
Right a
x]
    EitherList a (a -> b)
f <*> :: forall a b.
EitherList a (a -> b) -> EitherList a a -> EitherList a b
<*> EitherList a a
x = EitherList a (a -> b)
f EitherList a (a -> b) -> EitherList a a -> EitherList a b
forall (m :: * -> *) a b. Monad m => m (a -> b) -> m a -> m b
`ap` EitherList a a
x

instance Functor (EitherList a) where
    fmap :: forall a b. (a -> b) -> EitherList a a -> EitherList a b
fmap a -> b
f (E [Either a a]
xs) = [Either a b] -> EitherList a b
forall a b. [Either a b] -> EitherList a b
E ([Either a b] -> EitherList a b) -> [Either a b] -> EitherList a b
forall a b. (a -> b) -> a -> b
$ ((Either a a -> Either a b) -> [Either a a] -> [Either a b]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Either a a -> Either a b) -> [Either a a] -> [Either a b])
-> ((a -> b) -> Either a a -> Either a b)
-> (a -> b)
-> [Either a a]
-> [Either a b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (a -> b) -> Either a a -> Either a b
forall a b. (a -> b) -> Either a a -> Either a b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap) a -> b
f [Either a a]
xs

unwrap :: Either a a -> a
unwrap (Left a
x) = a
x
unwrap (Right a
x) = a
x

examples :: [String]
examples =
    [String
"This shouldn't happen."
    ,String
"Some 'quoted' stuff"
    ,String
"This is a URL: http://example.org."
    ,String
"How about an email@example.com"
    ,String
"ReferenceError #1065 broke my debugger!"
    ,String
"I would've gone."
    ,String
"They've been there."
    ,String
"Hyphen-words"
    ,String
"Yes/No questions"
    ]