{-| Module : Control.Lens.Regex.Text Description : Text PCRE Regex library with a lensy interface. Copyright : (c) Chris Penner, 2019 License : BSD3 -} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PartialTypeSignatures #-} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE TemplateHaskell #-} module Control.Lens.Regex.Text ( -- * Basics regex , match , groups , group , namedGroups , namedGroup , matchAndGroups -- * Compiling regexes to Traversals , regexing , mkRegexTraversalQQ -- * Types , RBS.Match , PCRE.Regex ) where import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Text.Encoding.Error as T import qualified Data.ByteString as BS import qualified Text.Regex.PCRE.Heavy as PCRE import qualified Data.Map as M import Control.Lens hiding (re, matching) import qualified Language.Haskell.TH as TH import qualified Language.Haskell.TH.Quote as TH import qualified Control.Lens.Regex.ByteString as RBS -- $setup -- >>> :set -XQuasiQuotes -- >>> :set -XOverloadedStrings -- >>> :set -XTypeApplications -- >>> import Data.Text.Lens (unpacked) -- >>> import qualified Data.Text as T -- >>> import Data.List (sort) utf8 :: Iso' T.Text BS.ByteString utf8 = iso T.encodeUtf8 (T.decodeUtf8With T.lenientDecode) -- | Builds a traversal over text using a Regex pattern -- -- It's a 'TH.QuasiQuoter' which creates a Traversal out of the given regex string. -- It's equivalent to calling 'regexing' on a 'PCRE.Regex' created using the -- 'PCRE.re' QuasiQuoter. -- -- The "real" type is: -- -- > regex :: Regex -> IndexedTraversal' Int T.Text Match -- -- It's a traversal which selects 'RBS.Match'es; compose it with 'match' or 'groups' -- to get the relevant parts of your match. -- -- >>> txt = "raindrops on roses and whiskers on kittens" -- -- Search -- -- >>> has ([regex|whisk|]) txt -- True -- -- Get matches -- -- >>> txt ^.. [regex|\br\w+|] . match -- ["raindrops","roses"] -- -- Edit matches -- -- >>> txt & [regex|\br\w+|] . match %~ T.intersperse '-' . T.toUpper -- "R-A-I-N-D-R-O-P-S on R-O-S-E-S and whiskers on kittens" -- -- Get Groups -- -- >>> txt ^.. [regex|(\w+) on (\w+)|] . groups -- [["raindrops","roses"],["whiskers","kittens"]] -- -- Edit Groups -- -- >>> txt & [regex|(\w+) on (\w+)|] . groups %~ reverse -- "roses on raindrops and kittens on whiskers" -- -- Get the third match -- -- >>> txt ^? [regex|\w+|] . index 2 . match --Just "roses" -- -- Edit matches -- -- >>> txt & [regex|\br\w+|] . match %~ T.intersperse '-' . T.toUpper -- "R-A-I-N-D-R-O-P-S on R-O-S-E-S and whiskers on kittens" -- -- Get Groups -- -- >>> txt ^.. [regex|(\w+) on (\w+)|] . groups -- [["raindrops","roses"],["whiskers","kittens"]] -- -- Edit Groups -- -- >>> txt & [regex|(\w+) on (\w+)|] . groups %~ reverse -- "roses on raindrops and kittens on whiskers" -- -- Get the third match -- -- >>> txt ^? [regex|\w+|] . index 2 . match -- Just "roses" -- -- Match integers, 'Read' them into ints, then sort them in-place -- dumping them back into the source text afterwards. -- -- >>> "Monday: 29, Tuesday: 99, Wednesday: 3" & partsOf ([regex|\d+|] . match . unpacked . _Show @Int) %~ sort -- "Monday: 3, Tuesday: 29, Wednesday: 99" -- -- To alter behaviour of the regex you may wish to pass 'PCRE.PCREOption's when compiling it. -- The default behaviour may seem strange in certain cases; e.g. it operates in 'single-line' -- mode. You can 'PCRE.compile' the 'PCRE.Regex' separately and add any options you like, then pass the resulting -- 'PCRE.Regex' into 'regex'; -- Alternatively can make your own version of the QuasiQuoter with any options you want embedded -- by using 'PCRE.mkRegexQQ'. -- regex :: Regex -> IndexedTraversal' Int T.Text RBS.Match regex :: TH.QuasiQuoter regex = PCRE.re{TH.quoteExp=quoter} where quoter str = do rgx <- TH.quoteExp PCRE.re str regexExpr <- TH.varE 'regexing return $ TH.AppE regexExpr rgx -- | Build a QuasiQuoter just like 'regex' but with the provided 'PCRE.PCREOption' overrides. mkRegexTraversalQQ :: [PCRE.PCREOption] -> TH.QuasiQuoter mkRegexTraversalQQ opts = (PCRE.mkRegexQQ opts){TH.quoteExp=quoter} where quoter str = do rgx <- TH.quoteExp (PCRE.mkRegexQQ opts) str regexExpr <- TH.varE 'regexing return $ TH.AppE regexExpr rgx -- | Build a traversal from the provided 'PCRE.Regex', this is handy if you're QuasiQuoter -- averse, or if you already have a 'PCRE.Regex' object floating around. -- -- Also see 'mkRegexTraversalQQ' regexing :: PCRE.Regex -> IndexedTraversal' Int T.Text RBS.Match regexing pat = utf8 . RBS.regexing pat -- | Access all groups of a match as a list. Also keeps full match text as the index in case -- you need it. -- -- Note that you can edit the groups through this lens, -- Changing the length of the list has behaviour similar to 'partsOf'. -- -- Get all matched groups: -- -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . groups -- [["raindrops","roses"],["whiskers","kittens"]] -- -- You can access a specific group combining with 'ix', or just use 'group' instead -- -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . groups . ix 1 -- ["roses","kittens"] -- -- Editing groups: -- -- >>> "raindrops on roses and whiskers on kittens" & [regex|(\w+) on (\w+)|] . groups . ix 1 %~ T.toUpper -- "raindrops on ROSES and whiskers on KITTENS" -- -- Editing the list rearranges groups -- -- >>> "raindrops on roses and whiskers on kittens" & [regex|(\w+) on (\w+)|] . groups %~ Prelude.reverse -- "roses on raindrops and kittens on whiskers" -- -- You can traverse the list to flatten out all groups -- -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . groups . traversed -- ["raindrops","roses","whiskers","kittens"] -- -- This replaces each group with the full match text wrapped in parens: -- -- >>> "one-two" & [regex|(\w+)-(\w+)|] . groups <. traversed %@~ \mtch grp -> grp <> ":(" <> mtch <> ")" -- "one:(one-two)-two:(one-two)" groups :: IndexedLens' T.Text RBS.Match [T.Text] groups = reindexed (view $ from utf8) RBS.groups <. mapping (from utf8) -- | Access a specific group of a match. Numbering starts at 0. -- -- Stashes the full match text as the index in case you need it. -- -- See 'groups' for more info on grouping -- -- >>> "key:value, a:b" ^.. [regex|(\w+):(\w+)|] . group 0 -- ["key","a"] -- -- >>> "key:value, a:b" ^.. [regex|(\w+):(\w+)|] . group 1 -- ["value","b"] -- -- >>> "key:value, a:b" & [regex|(\w+):(\w+)|] . group 1 %~ T.toUpper -- "key:VALUE, a:B" -- -- >>> "key:value, a:b" & [regex|(\w+):(\w+)|] . group 1 %~ T.toUpper -- "key:VALUE, a:B" -- -- Replace the first capture group with the full match: -- -- >>> "a, b" & [regex|(\w+), (\w+)|] . group 0 .@~ \i -> "(" <> i <> ")" -- "(a, b), b" group :: Int -> IndexedTraversal' T.Text RBS.Match T.Text group n = groups <. ix n -- | Access all the named groups of a match as a 'M.Map'. Stashes the full match text as the index in case -- you need it. -- -- Note that you can edit the groups through this lens, but the behaviour is undefined when editing inner elements of __nested__ groups. -- Behaviour is undefined if groups are removed from the map (so don't do that). -- -- NOTE: There's currently some strange behaviour in pcre-heavy where trailing unmatched optional groups are omitted, I'm looking into getting that patched, but for now, note the difference in behaviour: -- -- >>> "A" ^? [regex|(?<a>A)|(?<b>B)|] . namedGroups -- Just (fromList [("a","A")]) -- -- >>> "B" ^? [regex|(?<a>A)|(?<b>B)|] . namedGroups -- Just (fromList [("a",""),("b","B")]) -- -- Get all matched groups: -- -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(?<first>\w+) on (?<second>\w+)|] . namedGroups -- [fromList [("first","raindrops"),("second","roses")],fromList [("first","whiskers"),("second","kittens")]] -- -- You can access a specific group combining with 'ix', or just use 'namedGroup' instead -- -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(?<first>\w+) on (?<second>\w+)|] . namedGroups . ix "second" -- ["roses","kittens"] -- -- Editing groups: -- -- >>> "raindrops on roses and whiskers on kittens" & [regex|(?<first>\w+) on (?<second>\w+)|] . namedGroups . ix "second" %~ T.toUpper -- "raindrops on ROSES and whiskers on KITTENS" -- -- Use indexed helpers to access the full match when operating on a group. -- -- This replaces the "first" group with the full match text wrapped in parens: -- -- >>> "one-two" & [regex|(?<first>\w+)-(\w+)|] . namedGroups <. ix "first" %@~ \mtch grp -> grp <> ":(" <> mtch <> ")" -- "one:(one-two)-two" namedGroups :: IndexedLens' T.Text RBS.Match (M.Map T.Text T.Text) namedGroups = reindexed (view $ from utf8) RBS.namedGroups <. mapAsTxt where mapAsTxt :: Iso' (M.Map BS.ByteString BS.ByteString) (M.Map T.Text T.Text) mapAsTxt = iso (M.mapKeys (review utf8)) (M.mapKeys (view utf8)) . mapping (from utf8) -- | Access a specific named group of a match -- -- See 'namedGroups' for caveats and more info. -- -- Stashes the full match text as the index in case you need it. -- -- >>> "key:value, a:b" ^.. [regex|(?<first>\w+):(?<second>\w+)|] . namedGroup "first" -- ["key","a"] -- -- >>> "key:value, a:b" ^.. [regex|(?<first>\w+):(?<second>\w+)|] . namedGroup "second" -- ["value","b"] -- -- >>> "key:value, a:b" & [regex|(?<first>\w+):(?<second>\w+)|] . namedGroup "second" %~ T.toUpper -- "key:VALUE, a:B" -- -- Replace the first capture group with the full match: -- -- >>> "a, b" & [regex|(?<first>\w+), (?<second>\w+)|] . namedGroup "first" .@~ \i -> "(" <> i <> ")" -- "(a, b), b" namedGroup :: T.Text -> IndexedTraversal' T.Text RBS.Match T.Text namedGroup name = namedGroups <. ix name -- | Traverse each match -- -- Stashes any matched groups into the index in case you need them. -- -- Get a match if one exists: -- -- >>> "find a needle in a haystack" ^? [regex|n..dle|] . match -- Just "needle" -- -- Collect all matches -- -- >>> "one _two_ three _four_" ^.. [regex|_\w+_|] . match -- ["_two_","_four_"] -- -- You can edit the traversal to perform a regex replace/substitution -- -- >>> "one _two_ three _four_" & [regex|_\w+_|] . match %~ T.toUpper -- "one _TWO_ three _FOUR_" -- -- Here we use the group matches stored in the index to form key-value pairs, replacing the entire match. -- -- >>> "abc-def, ghi-jkl" & [regex|(\w+)-(\w+)|] . match %@~ \[k, v] _ -> "{" <> k <> ":" <> v <> "}" -- "{abc:def}, {ghi:jkl}" match :: IndexedTraversal' [T.Text] RBS.Match T.Text match = reindexed (fmap (view $ from utf8)) RBS.match <. from utf8 -- | Collect both the match text AND all the matching groups -- -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . matchAndGroups -- [("raindrops on roses",["raindrops","roses"]),("whiskers on kittens",["whiskers","kittens"])] matchAndGroups :: Getter RBS.Match (T.Text, [T.Text]) matchAndGroups = to $ \m -> (m ^. match, m ^. groups)