{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PackageImports #-}

module NLP.Punkt.Match (
    re_split_impl,
    re_split_pos,
    re_split,
    re_compile,
    word_seps,
    intrasep
    ) where

import Data.Text (Text)
import Data.Array ((!))
import "regex-tdfa-text" Text.Regex.TDFA.Text (compile)
import "regex-tdfa" Text.Regex.TDFA (Regex, matchOnceText, blankCompOpt,
                                     ExecOption(..))
import Data.Maybe (maybe)
import Data.Either (lefts)

-- | Split @str@ on every match of @re@, keeping the separators.
--
-- Text found between matches is returned as 'Left', each matched separator
-- as 'Right', in the order they occur in the input. Empty chunks of either
-- kind are dropped from the result.
--
-- If @re@ yields a zero-length match, splitting stops at that point and the
-- text not yet split is returned as one final 'Left' chunk. Without this
-- guard the remaining input would not shrink between iterations and the
-- recursion would not terminate.
re_split_impl :: Regex -> Text -> [Either Text Text]
re_split_impl re str = filter not_blank $ chunk str
    where
    not_blank piece = piece /= Left "" && piece /= Right ""
    -- Repeatedly peel off the text before the next match and the match itself.
    chunk rest = maybe [Left rest] (link rest) $ matchOnceText re rest
    link rest (pre, match, post)
        | sepLen == 0 = [Left rest]  -- empty match: stop instead of looping
        | otherwise   = Left pre : Right sep : chunk post
        where
        -- Index 0 of the match array is the whole match.
        (sep, (_, sepLen)) = match ! 0

-- | Like a separator-preserving split, but every chunk is paired with the
-- offset at which it starts in @str@.
--
-- Text between matches is returned as 'Left', matched separators as 'Right'.
-- Offsets start at 0 and are accumulated from the match offsets and lengths
-- reported by 'matchOnceText'. Chunks whose text is empty are dropped.
--
-- If @re@ yields a zero-length match, splitting stops at that point and the
-- text not yet split is returned as one final 'Left' chunk. Without this
-- guard the remaining input would not shrink between iterations and the
-- recursion would not terminate.
re_split_pos :: Regex -> Text -> [Either (Text, Int) (Text, Int)]
re_split_pos re str = filter not_blank $ chunk str 0
    where
    not_blank (Left ("", _)) = False
    not_blank (Right ("", _)) = False
    not_blank _ = True
    -- @pos@ is the offset of @rest@ within the original input.
    chunk rest pos = case matchOnceText re rest of
        Nothing -> [Left (rest, pos)]
        Just (pre, match, post)
            | sepLen == 0 -> [Left (rest, pos)]  -- empty match: stop, don't loop
            | otherwise ->
                Left (pre, pos) : Right (sep, sepPos) : chunk post (sepPos + sepLen)
            where
            -- Index 0 of the match array is the whole match; its offset is
            -- relative to @rest@, so shift it by @pos@.
            (sep, (sepOff, sepLen)) = match ! 0
            sepPos = pos + sepOff

-- | Split the input on every match of @re@ and return only the pieces
-- between matches; the matched separators themselves are discarded.
re_split :: Regex -> Text -> [Text]
re_split re = lefts . re_split_impl re

-- | Compile a regular expression given as 'Text'.
--
-- The pattern is compiled with 'blankCompOpt' and @ExecOption False@.
--
-- This function is partial: an invalid pattern raises an 'error' that
-- includes the message reported by the regex compiler, so the cause is
-- visible instead of an anonymous pattern-match failure.
re_compile :: Text -> Regex
re_compile re =
    either invalid id $ compile blankCompOpt (ExecOption False) re
    where
    invalid msg = error $ "NLP.Punkt.Match.re_compile: invalid regex: " ++ msg

-- | Pattern for separators between words. It matches any of:
--
-- * a run of spaces, tabs or newlines;
-- * two or more hyphens;
-- * an em dash;
-- * two or more consecutive periods;
-- * a period followed by one or more @" ."@ groups (a spaced-out ellipsis);
-- * the ellipsis character;
-- * a run of one or more of @!@, @?@, @;@, @:@.
--
-- NOTE(review): inside the bracket expression the backslash before @?@ is
-- probably taken literally, in which case a backslash also counts as a
-- separator. Confirm whether that is intended.
word_seps :: Regex
word_seps = re_compile "([ \t\n]+|-{2,}|—|\\.{2,}|\\.( \\.)+|…|[!\\?;:]{1,})"
-- | Pattern matching a single hyphen, ASCII apostrophe, or right single
-- quotation mark (@’@).
intrasep :: Regex
intrasep = re_compile "[-'’]"