{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BinaryLiterals #-}
module Regex.KDE.Match
( matchRegex
) where
import qualified Data.ByteString as B
import Data.ByteString (ByteString)
import qualified Data.ByteString.UTF8 as U
import qualified Data.Set as Set
import Data.Set (Set)
import Regex.KDE.Regex
import qualified Data.IntMap.Strict as M
#if !MIN_VERSION_base(4,11,0)
import Data.Semigroup ((<>))
#endif
data Match =
Match { Match -> ByteString
matchBytes :: !ByteString
, Match -> Int
matchOffset :: !Int
, Match -> IntMap (Int, Int)
matchCaptures :: !(M.IntMap (Int, Int))
} deriving (Int -> Match -> ShowS
[Match] -> ShowS
Match -> String
(Int -> Match -> ShowS)
-> (Match -> String) -> ([Match] -> ShowS) -> Show Match
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Match] -> ShowS
$cshowList :: [Match] -> ShowS
show :: Match -> String
$cshow :: Match -> String
showsPrec :: Int -> Match -> ShowS
$cshowsPrec :: Int -> Match -> ShowS
Show, Match -> Match -> Bool
(Match -> Match -> Bool) -> (Match -> Match -> Bool) -> Eq Match
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Match -> Match -> Bool
$c/= :: Match -> Match -> Bool
== :: Match -> Match -> Bool
$c== :: Match -> Match -> Bool
Eq)
instance Ord Match where
Match
m1 <= :: Match -> Match -> Bool
<= Match
m2
| Match -> Int
matchOffset Match
m1 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Match -> Int
matchOffset Match
m2 = Bool
True
| Match -> Int
matchOffset Match
m1 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Match -> Int
matchOffset Match
m2 = Bool
False
| Bool
otherwise = Match -> IntMap (Int, Int)
matchCaptures Match
m1 IntMap (Int, Int) -> IntMap (Int, Int) -> Bool
forall a. Ord a => a -> a -> Bool
>= Match -> IntMap (Int, Int)
matchCaptures Match
m2
mapMatching :: (Match -> Match) -> Set Match -> Set Match
mapMatching :: (Match -> Match) -> Set Match -> Set Match
mapMatching Match -> Match
f = (Match -> Bool) -> Set Match -> Set Match
forall a. (a -> Bool) -> Set a -> Set a
Set.filter ((Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0) (Int -> Bool) -> (Match -> Int) -> Match -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Match -> Int
matchOffset) (Set Match -> Set Match)
-> (Set Match -> Set Match) -> Set Match -> Set Match
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Match -> Match) -> Set Match -> Set Match
forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map Match -> Match
f
sizeLimit :: Int
sizeLimit :: Int
sizeLimit = Int
2000
prune :: Set Match -> Set Match
prune :: Set Match -> Set Match
prune Set Match
ms = if Set Match -> Int
forall a. Set a -> Int
Set.size Set Match
ms Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
sizeLimit
then Int -> Set Match -> Set Match
forall a. Int -> Set a -> Set a
Set.take Int
sizeLimit Set Match
ms
else Set Match
ms
exec :: Regex -> Direction -> Regex -> Set Match -> Set Match
exec :: Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
_ Direction
_ Regex
MatchNull = Set Match -> Set Match
forall a. a -> a
id
exec Regex
top Direction
dir (Lazy Regex
re) =
Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
dir (Regex -> Regex -> Regex
MatchConcat (Regex -> Regex
Lazy Regex
re) Regex
MatchNull)
exec Regex
top Direction
dir (Possessive Regex
re) =
(Match -> Set Match -> Set Match)
-> Set Match -> Set Match -> Set Match
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr
(\Match
elt Set Match
s -> case Set Match -> Maybe Match
forall a. Set a -> Maybe a
Set.lookupMin (Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
dir Regex
re (Match -> Set Match
forall a. a -> Set a
Set.singleton Match
elt)) of
Maybe Match
Nothing -> Set Match
s
Just Match
m -> Match -> Set Match -> Set Match
forall a. Ord a => a -> Set a -> Set a
Set.insert Match
m Set Match
s)
Set Match
forall a. Monoid a => a
mempty
exec Regex
top Direction
dir (MatchDynamic Int
n) =
Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
dir ((Char -> Bool) -> Regex
MatchChar (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'%') Regex -> Regex -> Regex
forall a. Semigroup a => a -> a -> a
<>
[Regex] -> Regex
forall a. Monoid a => [a] -> a
mconcat ((Char -> Regex) -> String -> [Regex]
forall a b. (a -> b) -> [a] -> [b]
map (\Char
c -> (Char -> Bool) -> Regex
MatchChar (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
c)) (Int -> String
forall a. Show a => a -> String
show Int
n)))
exec Regex
_ Direction
_ Regex
AssertEnd = (Match -> Bool) -> Set Match -> Set Match
forall a. (a -> Bool) -> Set a -> Set a
Set.filter (\Match
m -> Match -> Int
matchOffset Match
m Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString -> Int
B.length (Match -> ByteString
matchBytes Match
m))
exec Regex
_ Direction
_ Regex
AssertBeginning = (Match -> Bool) -> Set Match -> Set Match
forall a. (a -> Bool) -> Set a -> Set a
Set.filter (\Match
m -> Match -> Int
matchOffset Match
m Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0)
exec Regex
top Direction
_ (AssertPositive Direction
dir Regex
regex) =
(Match -> Bool) -> Set Match -> Set Match
forall a. (a -> Bool) -> Set a -> Set a
Set.filter (\Match
m -> Bool -> Bool
not (Set Match -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null (Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
dir Regex
regex (Match -> Set Match
forall a. a -> Set a
Set.singleton Match
m))))
exec Regex
top Direction
_ (AssertNegative Direction
dir Regex
regex) =
(Match -> Bool) -> Set Match -> Set Match
forall a. (a -> Bool) -> Set a -> Set a
Set.filter (\Match
m -> Set Match -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null (Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
dir Regex
regex (Match -> Set Match
forall a. a -> Set a
Set.singleton Match
m)))
exec Regex
_ Direction
_ Regex
AssertWordBoundary = (Match -> Bool) -> Set Match -> Set Match
forall a. (a -> Bool) -> Set a -> Set a
Set.filter Match -> Bool
atWordBoundary
exec Regex
_ Direction
Forward Regex
MatchAnyChar = (Match -> Match) -> Set Match -> Set Match
mapMatching ((Match -> Match) -> Set Match -> Set Match)
-> (Match -> Match) -> Set Match -> Set Match
forall a b. (a -> b) -> a -> b
$ \Match
m ->
case ByteString -> Maybe (Char, Int)
U.decode (Int -> ByteString -> ByteString
B.drop (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m)) of
Maybe (Char, Int)
Nothing -> Match
m{ matchOffset :: Int
matchOffset = - Int
1}
Just (Char
_,Int
n) -> Match
m{ matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
n }
exec Regex
_ Direction
Backward Regex
MatchAnyChar = (Match -> Match) -> Set Match -> Set Match
mapMatching ((Match -> Match) -> Set Match -> Set Match)
-> (Match -> Match) -> Set Match -> Set Match
forall a b. (a -> b) -> a -> b
$ \Match
m ->
case ByteString -> Int -> Maybe Int
lastCharOffset (Match -> ByteString
matchBytes Match
m) (Match -> Int
matchOffset Match
m) of
Maybe Int
Nothing -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
Just Int
off -> Match
m{ matchOffset :: Int
matchOffset = Int
off }
exec Regex
_ Direction
Forward (MatchChar Char -> Bool
f) = (Match -> Match) -> Set Match -> Set Match
mapMatching ((Match -> Match) -> Set Match -> Set Match)
-> (Match -> Match) -> Set Match -> Set Match
forall a b. (a -> b) -> a -> b
$ \Match
m ->
case ByteString -> Maybe (Char, Int)
U.decode (Int -> ByteString -> ByteString
B.drop (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m)) of
Just (Char
c,Int
n) | Char -> Bool
f Char
c -> Match
m{ matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
n }
Maybe (Char, Int)
_ -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
exec Regex
_ Direction
Backward (MatchChar Char -> Bool
f) = (Match -> Match) -> Set Match -> Set Match
mapMatching ((Match -> Match) -> Set Match -> Set Match)
-> (Match -> Match) -> Set Match -> Set Match
forall a b. (a -> b) -> a -> b
$ \Match
m ->
case ByteString -> Int -> Maybe Int
lastCharOffset (Match -> ByteString
matchBytes Match
m) (Match -> Int
matchOffset Match
m) of
Maybe Int
Nothing -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
Just Int
off ->
case ByteString -> Maybe (Char, Int)
U.decode (Int -> ByteString -> ByteString
B.drop Int
off (Match -> ByteString
matchBytes Match
m)) of
Just (Char
c,Int
_) | Char -> Bool
f Char
c -> Match
m{ matchOffset :: Int
matchOffset = Int
off }
Maybe (Char, Int)
_ -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
exec Regex
top Direction
dir (MatchConcat (MatchConcat Regex
r1 Regex
r2) Regex
r3) =
Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
dir (Regex -> Regex -> Regex
MatchConcat Regex
r1 (Regex -> Regex -> Regex
MatchConcat Regex
r2 Regex
r3))
exec Regex
top Direction
Forward (MatchConcat (Lazy Regex
r1) Regex
r2) =
(Set Match -> Set Match -> Set Match)
-> Set Match -> Set (Set Match) -> Set Match
forall a b. (a -> b -> a) -> a -> Set b -> a
Set.foldl Set Match -> Set Match -> Set Match
forall a. Ord a => Set a -> Set a -> Set a
Set.union Set Match
forall a. Monoid a => a
mempty (Set (Set Match) -> Set Match)
-> (Set Match -> Set (Set Match)) -> Set Match -> Set Match
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Match -> Set Match) -> Set Match -> Set (Set Match)
forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map
(\Match
m ->
let ms1 :: Set Match
ms1 = Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
Forward Regex
r1 (Match -> Set Match
forall a. a -> Set a
Set.singleton Match
m)
in if Set Match -> Bool
forall a. Set a -> Bool
Set.null Set Match
ms1
then Set Match
ms1
else Set Match -> Set Match
go Set Match
ms1)
where
go :: Set Match -> Set Match
go Set Match
ms = case Set Match -> Maybe Match
forall a. Set a -> Maybe a
Set.lookupMax Set Match
ms of
Maybe Match
Nothing -> Set Match
forall a. Set a
Set.empty
Just Match
m' ->
let s' :: Set Match
s' = Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
Forward Regex
r2 (Match -> Set Match
forall a. a -> Set a
Set.singleton Match
m')
in if Set Match -> Bool
forall a. Set a -> Bool
Set.null Set Match
s'
then Set Match -> Set Match
go (Match -> Set Match -> Set Match
forall a. Ord a => a -> Set a -> Set a
Set.delete Match
m' Set Match
ms)
else Set Match
s'
exec Regex
top Direction
Forward (MatchConcat Regex
r1 Regex
r2) =
\Set Match
ms ->
let ms1 :: Set Match
ms1 = Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
Forward Regex
r1 Set Match
ms
in if Set Match -> Bool
forall a. Set a -> Bool
Set.null Set Match
ms1
then Set Match
ms1
else Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
Forward Regex
r2 (Set Match -> Set Match
prune Set Match
ms1)
exec Regex
top Direction
Backward (MatchConcat Regex
r1 Regex
r2) =
Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
Backward Regex
r1 (Set Match -> Set Match)
-> (Set Match -> Set Match) -> Set Match -> Set Match
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
Backward Regex
r2
exec Regex
top Direction
dir (MatchAlt Regex
r1 Regex
r2) = \Set Match
ms -> Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
dir Regex
r1 Set Match
ms Set Match -> Set Match -> Set Match
forall a. Semigroup a => a -> a -> a
<> Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
dir Regex
r2 Set Match
ms
exec Regex
top Direction
dir (MatchSome Regex
re) = Set Match -> Set Match
go
where
go :: Set Match -> Set Match
go Set Match
ms = case Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
dir Regex
re Set Match
ms of
Set Match
ms' | Set Match -> Bool
forall a. Set a -> Bool
Set.null Set Match
ms' -> Set Match
forall a. Set a
Set.empty
| Set Match
ms' Set Match -> Set Match -> Bool
forall a. Eq a => a -> a -> Bool
== Set Match
ms -> Set Match
ms
| Bool
otherwise -> let ms'' :: Set Match
ms'' = Set Match -> Set Match
prune Set Match
ms'
in Set Match
ms'' Set Match -> Set Match -> Set Match
forall a. Semigroup a => a -> a -> a
<> Set Match -> Set Match
go Set Match
ms''
exec Regex
top Direction
dir (MatchCapture Int
i Regex
re) =
(Set Match -> Set Match -> Set Match)
-> Set Match -> Set (Set Match) -> Set Match
forall a b. (a -> b -> b) -> b -> Set a -> b
Set.foldr Set Match -> Set Match -> Set Match
forall a. Ord a => Set a -> Set a -> Set a
Set.union Set Match
forall a. Set a
Set.empty (Set (Set Match) -> Set Match)
-> (Set Match -> Set (Set Match)) -> Set Match -> Set Match
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
(Match -> Set Match) -> Set Match -> Set (Set Match)
forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map (\Match
m ->
(Match -> Match) -> Set Match -> Set Match
forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map (Match -> Match -> Match
captureDifference Match
m) (Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
dir Regex
re (Match -> Set Match
forall a. a -> Set a
Set.singleton Match
m)))
where
captureDifference :: Match -> Match -> Match
captureDifference Match
m Match
m' =
let len :: Int
len = Match -> Int
matchOffset Match
m' Int -> Int -> Int
forall a. Num a => a -> a -> a
- Match -> Int
matchOffset Match
m
in Match
m'{ matchCaptures :: IntMap (Int, Int)
matchCaptures = Int -> (Int, Int) -> IntMap (Int, Int) -> IntMap (Int, Int)
forall a. Int -> a -> IntMap a -> IntMap a
M.insert Int
i (Match -> Int
matchOffset Match
m, Int
len)
(Match -> IntMap (Int, Int)
matchCaptures Match
m') }
exec Regex
_ Direction
dir (MatchCaptured Int
n) = (Match -> Match) -> Set Match -> Set Match
mapMatching Match -> Match
matchCaptured
where
matchCaptured :: Match -> Match
matchCaptured Match
m =
case Int -> IntMap (Int, Int) -> Maybe (Int, Int)
forall a. Int -> IntMap a -> Maybe a
M.lookup Int
n (Match -> IntMap (Int, Int)
matchCaptures Match
m) of
Just (Int
offset, Int
len) ->
let capture :: ByteString
capture = Int -> ByteString -> ByteString
B.take Int
len (ByteString -> ByteString) -> ByteString -> ByteString
forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
B.drop Int
offset (ByteString -> ByteString) -> ByteString -> ByteString
forall a b. (a -> b) -> a -> b
$ Match -> ByteString
matchBytes Match
m
in case Direction
dir of
Direction
Forward | ByteString -> ByteString -> Bool
B.isPrefixOf ByteString
capture
(Int -> ByteString -> ByteString
B.drop (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m))
-> Match
m{ matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m Int -> Int -> Int
forall a. Num a => a -> a -> a
+ ByteString -> Int
B.length ByteString
capture }
Direction
Backward | ByteString -> ByteString -> Bool
B.isSuffixOf ByteString
capture
(Int -> ByteString -> ByteString
B.take (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m))
-> Match
m{ matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m Int -> Int -> Int
forall a. Num a => a -> a -> a
- ByteString -> Int
B.length ByteString
capture }
Direction
_ -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
Maybe (Int, Int)
Nothing -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
exec Regex
top Direction
dir Regex
Recurse = \Set Match
ms -> if Set Match -> Bool
forall a. Set a -> Bool
Set.null Set Match
ms
then Set Match
ms
else Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
top Direction
dir Regex
top Set Match
ms
atWordBoundary :: Match -> Bool
atWordBoundary :: Match -> Bool
atWordBoundary Match
m =
case Match -> Int
matchOffset Match
m of
Int
0 -> Bool
True
Int
n | Int
n Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString -> Int
B.length (Match -> ByteString
matchBytes Match
m) -> Bool
True
| Bool
otherwise ->
case ByteString -> Int -> Maybe Int
lastCharOffset (Match -> ByteString
matchBytes Match
m) (Match -> Int
matchOffset Match
m) of
Maybe Int
Nothing -> Bool
True
Just Int
off ->
case ByteString -> String
U.toString (Int -> ByteString -> ByteString
B.drop (Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) (Match -> ByteString
matchBytes Match
m)) of
(Char
prev:Char
cur:Char
next:String
_) ->
(Char -> Bool
isWordChar Char
cur Bool -> Bool -> Bool
forall a. Eq a => a -> a -> Bool
/= Char -> Bool
isWordChar Char
next) Bool -> Bool -> Bool
||
(Char -> Bool
isWordChar Char
cur Bool -> Bool -> Bool
forall a. Eq a => a -> a -> Bool
/= Char -> Bool
isWordChar Char
prev)
String
_ -> Bool
True
lastCharOffset :: ByteString -> Int -> Maybe Int
lastCharOffset :: ByteString -> Int -> Maybe Int
lastCharOffset ByteString
_ Int
0 = Maybe Int
forall a. Maybe a
Nothing
lastCharOffset ByteString
_ Int
1 = Maybe Int
forall a. Maybe a
Nothing
lastCharOffset ByteString
bs Int
n =
case ByteString -> Int -> Word8
B.index ByteString
bs (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
2) of
Word8
w | Word8
w Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0b10000000 -> Int -> Maybe Int
forall a. a -> Maybe a
Just (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)
| Word8
w Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word8
0b11000000 -> Int -> Maybe Int
forall a. a -> Maybe a
Just (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)
| Bool
otherwise -> ByteString -> Int -> Maybe Int
lastCharOffset ByteString
bs (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)
matchRegex :: Regex
-> ByteString
-> Maybe (ByteString, M.IntMap (Int, Int))
matchRegex :: Regex -> ByteString -> Maybe (ByteString, IntMap (Int, Int))
matchRegex Regex
re ByteString
bs =
Match -> (ByteString, IntMap (Int, Int))
toResult (Match -> (ByteString, IntMap (Int, Int)))
-> Maybe Match -> Maybe (ByteString, IntMap (Int, Int))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Set Match -> Maybe Match
forall a. Set a -> Maybe a
Set.lookupMin
(Regex -> Direction -> Regex -> Set Match -> Set Match
exec Regex
re Direction
Forward Regex
re (Match -> Set Match
forall a. a -> Set a
Set.singleton (ByteString -> Int -> IntMap (Int, Int) -> Match
Match ByteString
bs Int
0 IntMap (Int, Int)
forall a. IntMap a
M.empty)))
where
toResult :: Match -> (ByteString, IntMap (Int, Int))
toResult Match
m = (Int -> ByteString -> ByteString
B.take (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m), (Match -> IntMap (Int, Int)
matchCaptures Match
m))