Copyright | Copyright (C) 2014- Uwe Schmidt |
---|---|
License | MIT |
Maintainer | Uwe Schmidt <uwe@fh-wedel.de> |
Stability | stable |
Portability | portable |
Safe Haskell | Safe-Inferred |
Language | Haskell2010 |
Convenient functions for W3C XML Schema Regular Expression Matcher.
For internals see Regex
Grammar can be found under http://www.w3.org/TR/xmlschema11-2/#regexs
Synopsis
- grep :: StringLike s => s -> [s] -> [s]
- grepExt :: StringLike s => s -> [s] -> [s]
- grepRE :: StringLike s => GenRegex s -> [s] -> [s]
- grepREwithLineNum :: StringLike s => GenRegex s -> [s] -> [(Int, s)]
- match :: StringLike s => s -> s -> Bool
- matchExt :: StringLike s => s -> s -> Bool
- matchSubex :: StringLike s => s -> s -> [(s, s)]
- sed :: StringLike s => (s -> s) -> s -> s -> s
- sedExt :: StringLike s => (s -> s) -> s -> s -> s
- split :: StringLike s => s -> s -> (s, s)
- splitExt :: StringLike s => s -> s -> (s, s)
- splitSubex :: StringLike s => s -> s -> ([(s, s)], s)
- tokenize :: StringLike s => s -> s -> [s]
- tokenizeExt :: StringLike s => s -> s -> [s]
- tokenize' :: StringLike s => s -> s -> [Either s s]
- tokenizeExt' :: StringLike s => s -> s -> [Either s s]
- tokenizeSubex :: StringLike s => s -> s -> [(s, s)]
- matchRE :: StringLike s => GenRegex s -> s -> Bool
- matchSubexRE :: StringLike s => GenRegex s -> s -> [(s, s)]
- sedRE :: StringLike s => (s -> s) -> GenRegex s -> s -> s
- splitRE :: StringLike s => GenRegex s -> s -> Maybe (s, s)
- splitSubexRE :: StringLike s => GenRegex s -> s -> Maybe ([(s, s)], s)
- tokenizeRE :: StringLike s => GenRegex s -> s -> [s]
- tokenizeRE' :: StringLike s => GenRegex s -> s -> [Either s s]
- tokenizeSubexRE :: StringLike s => GenRegex s -> s -> [(s, s)]
Documentation
grep :: StringLike s => s -> [s] -> [s] Source #
grep like filter for lists of strings
The regular expression may be prefixed with the usual context spec "^" for start of string and "\<" for start of word, and suffixed with "$" for end of text and "\>" for end of word. Word chars are defined by the multi char escape sequence "\w".
Examples
```
grep "a" ["_a_", "_a", "a_", "a", "_"] => ["_a_", "_a", "a_", "a"]
grep "^a" ["_a_", "_a", "a_", "a", "_"] => ["a_", "a"]
grep "a$" ["_a_", "_a", "a_", "a", "_"] => ["_a", "a"]
grep "^a$" ["_a_", "_a", "a_", "a", "_"] => ["a"]
grep "\\<a" ["x a b", " ax ", " xa ", "xab"] => ["x a b", " ax "]
grep "a\\>" ["x a b", " ax ", " xa ", "xab"] => ["x a b", " xa "]
```
grepExt :: StringLike s => s -> [s] -> [s] Source #
grep with extended regular expressions
grepRE :: StringLike s => GenRegex s -> [s] -> [s] Source #
grep with an already prepared Regex (usually with parseContextRegex)
grepREwithLineNum :: StringLike s => GenRegex s -> [s] -> [(Int, s)] Source #
grep with Regex and line numbers
match :: StringLike s => s -> s -> Bool Source #
convenient function for matchRE
Examples:
```
match "x*" "xxx" = True
match "x" "xxx" = False
match "[" "xxx" = False
```
matchExt :: StringLike s => s -> s -> Bool Source #
match with extended regular expressions
matchSubex :: StringLike s => s -> s -> [(s, s)] Source #
convenient function for matchSubexRE
Examples:
```
matchSubex "({1}x*)" "xxx" = [("1","xxx")]
matchSubex "({1}x*)" "y" = []
matchSubex "({w}[0-9]+)x({h}[0-9]+)" "800x600" = [("w","800"),("h","600")]
matchSubex "[" "xxx" = []
```
sed :: StringLike s => (s -> s) -> s -> s -> s Source #
convenient function for sedRE
examples:
```
sed (const "b") "a" "xaxax" = "xbxbx"
sed (\ x -> x ++ x) "a" "xax" = "xaax"
sed undefined "[" "xxx" = "xxx"
```
sedExt :: StringLike s => (s -> s) -> s -> s -> s Source #
split :: StringLike s => s -> s -> (s, s) Source #
convenient function for splitRE
examples:
```
split "a*b" "abc" = ("ab","c")
split "a*" "bc" = ("", "bc") -- "a*" matches ""
split "a+" "bc" = ("", "bc") -- "a+" does not match, no split
split "[" "abc" = ("", "abc") -- "[" syntax error, no split
```
splitExt :: StringLike s => s -> s -> (s, s) Source #
split with extended syntax
splitSubex :: StringLike s => s -> s -> ([(s, s)], s) Source #
convenient function for splitSubexRE, uses extended syntax
examples:
```
splitSubex "({1}a*)b" "abc" = ([("1","a")],"c")
splitSubex "({2}a*)" "bc" = ([("2","")], "bc")
splitSubex "({1}a|b)+" "abc" = ([("1","a"),("1","b")],"c") -- subex 1 matches 2 times
splitSubex ".*({x}a*)" "aa" = ([("x",""),("x","a"),("x","aa")],"") -- nondeterminism: 3 matches for a*
splitSubex "({1}do)|({2}[a-z]+)" "do you know" = ([("1","do"),("2","do")]," you know") -- nondeterminism: 2 matches for do
splitSubex "({1}do){|}({2}[a-z]+)" "do you know" = ([("1","do")]," you know") -- no nondeterminism with {|}: 1. match for do
splitSubex "({1}a+)" "bcd" = ([], "bcd") -- no match
splitSubex "[" "abc" = ([], "abc") -- syntax error
```
tokenize :: StringLike s => s -> s -> [s] Source #
split a string into tokens (words) by giving a regular expression which all tokens must match.
Convenient function for tokenizeRE
This can be used for simple tokenizers.
It is recommended to use regular expressions where the empty word does not match.
Else there will appear a lot of probably useless empty tokens in the output.
All non-matching chars are discarded. If the given regex contains syntax errors, the empty list is returned.
examples:
```
tokenize "a" "aabba" = ["a","a","a"]
tokenize "a*" "aaaba" = ["aaa","a"]
tokenize "a*" "bbb" = ["","",""]
tokenize "a+" "bbb" = []
tokenize "a*b" "" = []
tokenize "a*b" "abc" = ["ab"]
tokenize "a*b" "abaab ab" = ["ab","aab","ab"]
tokenize "[a-z]{2,}|[0-9]{2,}|[0-9]+[.][0-9]+" "ab123 456.7abc" = ["ab","123","456.7","abc"]
tokenize "[a-z]*|[0-9]{2,}|[0-9]+[.][0-9]+" "cab123 456.7abc" = ["cab","123","456.7","abc"]
tokenize "[^ \t\n\r]*" "abc def\t\n\rxyz" = ["abc","def","xyz"]
tokenize ".*" "\nabc\n123\n\nxyz\n" = ["","abc","123","","xyz"]
tokenize ".*" = lines
tokenize "[^ \t\n\r]*" = words
```
tokenizeExt :: StringLike s => s -> s -> [s] Source #
tokenize with extended syntax
tokenize' :: StringLike s => s -> s -> [Either s s] Source #
convenient function for tokenizeRE'
When the regular expression parses as Zero, [Left input]
is returned, that means no tokens are found
tokenizeExt' :: StringLike s => s -> s -> [Either s s] Source #
tokenizeSubex :: StringLike s => s -> s -> [(s, s)] Source #
convenient function for tokenizeSubexRE applied to a string
examples:
```
tokenizeSubex "({name}[a-z]+)|({num}[0-9]{2,})|({real}[0-9]+[.][0-9]+)" "cab123 456.7abc" = [("name","cab") ,("num","123") ,("real","456.7") ,("name","abc")]
tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" "12.34" = [("real","12.34") ,("n","12") ,("f","34")]
tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" "12 34" = [("real","12"),("n","12") ,("real","34"),("n","34")]
tokenizeSubex "({real}({n}[0-9]+)(([.]({f}[0-9]+))|({f})))" "12 34.56" = [("real","12"),("n","12"),("f","") ,("real","34.56"),("n","34"),("f","56")]
```
matchSubexRE :: StringLike s => GenRegex s -> s -> [(s, s)] Source #
match a string with a regular expression and extract subexpression matches
sedRE :: StringLike s => (s -> s) -> GenRegex s -> s -> s Source #
sed like editing function
All matching tokens are edited by the first argument, the editing function; all other chars remain as they are.
splitRE :: StringLike s => GenRegex s -> s -> Maybe (s, s) Source #
split a string by taking the longest prefix matching a regular expression
Nothing
is returned in case there is no matching prefix,
else the pair of prefix and rest is returned
splitSubexRE :: StringLike s => GenRegex s -> s -> Maybe ([(s, s)], s) Source #
split a string by removing the longest prefix matching a regular expression and then return the list of subexpressions found in the matching part
Nothing
is returned in case of no matching prefix,
else the list of pairs of labels and submatches and the
rest is returned
tokenizeRE :: StringLike s => GenRegex s -> s -> [s] Source #
The function, that does the real work for tokenize
tokenizeRE' :: StringLike s => GenRegex s -> s -> [Either s s] Source #
split a string into tokens and delimiters by giving a regular expression which all tokens must match
This is a generalisation of the above tokenizeRE function.
The non-matching char sequences are marked with Left, the matching ones are marked with Right.
If the regular expression contains syntax errors, [Left input] is returned, that means no tokens are found.
The following Law holds:
concat . map (either id id) . tokenizeRE' re == id
tokenizeSubexRE :: StringLike s => GenRegex s -> s -> [(s, s)] Source #
split a string into tokens (pair of labels and words) by giving a regular expression containing labeled subexpressions.
This function should not be called with regular expressions without any labeled subexpressions. This does not make sense, because the result list will always be empty.
Result is the list of matching subexpressions
This can be used for simple tokenizers.
At least one char is consumed by parsing a token.
The pairs in the result list contain the matching substrings.
All non-matching chars are discarded. If the given regex contains syntax errors, the empty list is returned.