{-# LANGUAGE CPP #-}
--------------------------------------------------------------------
-- |
-- Module   : Text.Regex.PCRE.Light
-- Copyright: Copyright (c) 2007-2008, Don Stewart
-- License  : BSD3
--
-- Maintainer:  Don Stewart <dons@galois.com>
-- Stability :  experimental
-- Portability: H98 + CPP
--
--------------------------------------------------------------------
--
-- A simple, portable binding to perl-compatible regular expressions
-- (PCRE) via strict ByteStrings.
--

module Text.Regex.PCRE.Light (

        -- * The abstract PCRE Regex type
          Regex

        -- * ByteString interface
        , compile, compileM
        , match
        , captureCount
        , captureNames

        -- * Regex types and constructors externally visible

        -- ** PCRE compile-time bit flags
        , PCREOption

        , anchored
        , auto_callout
        {-, bsr_anycrlf-}
        {-, bsr_unicode-}
        , caseless
        , dollar_endonly
        , dotall
        , dupnames
        , extended
        , extra
        , firstline
        , multiline
        {-, newline_any-}
        {-, newline_anycrlf-}
        , newline_cr
        , newline_crlf
        , newline_lf
        , no_auto_capture
        , ungreedy
        , utf8
        , no_utf8_check

        -- ** PCRE exec-time bit flags
        , PCREExecOption

        , exec_anchored
        {-, exec_newline_any     -}
        {-, exec_newline_anycrlf -}
        , exec_newline_cr
        , exec_newline_crlf
        , exec_newline_lf
        , exec_notbol
        , exec_noteol
        , exec_notempty
        , exec_no_utf8_check
        , exec_partial

    ) where

import Text.Regex.PCRE.Light.Base

-- Strings
import qualified Data.ByteString          as S

#if __GLASGOW_HASKELL__ >= 608
import qualified Data.ByteString.Internal as S
import qualified Data.ByteString.Unsafe   as S
#else
import qualified Data.ByteString.Base     as S
#endif

import System.IO.Unsafe (unsafePerformIO)
import Data.List (sortBy)
import Data.Function (on)

-- Foreigns
import Foreign (newForeignPtr, withForeignPtr)
import Foreign.Ptr
import Foreign.C.Types
import Foreign.C.String
import Foreign.Storable
import Foreign.Marshal.Alloc

-- | 'compile'
--
-- Compile a perl-compatible regular expression stored in a strict bytestring.
--
-- An example
--
-- > let r = compile (pack "^(b+|a){1,2}?bc") []
--
-- Or using GHC's -XOverloadedStrings flag, and importing
-- Data.ByteString.Char8, we can avoid the pack:
--
-- > let r = compile "^(b+|a){1,2}?bc" []
--
-- If the regular expression is invalid, an exception is thrown.
-- If this is unsuitable, 'compileM' is availlable, which returns failure
-- in a monad.
--
-- To do case insentive matching,
--
-- > compile "^(b+|a){1,2}?bc" [caseless]
--
-- Other flags are documented below.
--
-- The resulting abstract regular expression can be passed to 'match'
-- for matching against a subject string.
--
-- The arguments are:
--
-- * 'pat': A ByteString containing the regular expression to be compiled.
--
-- * 'flags', optional bit flags. If 'Nothing' is provided, defaults are used.
--
-- Valid compile-time flags are:
--
-- * 'anchored'        - Force pattern anchoring
--
-- * 'auto_callout'    - Compile automatic callouts
--
-- * 'bsr_anycrlf'     - \\R matches only CR, LF, or CRLF
--
-- * 'bsr_unicode'     - \\R matches all Unicode line endings
--
-- * 'caseless'        - Do caseless matching
--
-- * 'dollar_endonly'  - '$' not to match newline at end
--
-- * 'dotall'          - matches anything including NL
--
-- * 'dupnames'        - Allow duplicate names for subpatterns
--
-- * 'extended'        - Ignore whitespace and # comments
--
-- * 'extra'           - PCRE extra features (not much use currently)
--
-- * 'firstline'       - Force matching to be  before  newline
--
-- * 'multiline'       - '^' and '$' match newlines within data
--
-- * 'newline_any'     - Recognize any Unicode newline sequence
--
-- * 'newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences
--
-- * 'newline_cr'      - Set CR as the newline sequence
--
-- * 'newline_crlf'    - Set CRLF as the newline sequence
--
-- * 'newline_lf'      - Set LF as the newline sequence
--
-- * 'no_auto_capture' - Disable numbered capturing parentheses (named ones available)
--
-- * 'ungreedy'        - Invert greediness of quantifiers
--
-- * 'utf8'            - Run in UTF-8 mode
--
-- * 'no_utf8_check'   - Do not check the pattern for UTF-8 validity
--
-- The regex is allocated via malloc on the C side, and will be
-- deallocated by the runtime when the Haskell value representing it
-- goes out of scope.
--
-- See 'man pcreapi for more details.
--
-- Caveats: patterns with embedded nulls, such as "\0*" seem to be
-- mishandled, as this won't currently match the subject "\0\0\0".
--
compile :: S.ByteString -> [PCREOption] -> Regex
compile s o = case compileM s o of
    Right r -> r
    Left  e -> error ("Text.Regex.PCRE.Light: Error in regex: " ++ e)

------------------------------------------------------------------------

-- | 'compileM'
-- A safe version of 'compile' with failure wrapped in an Either.
--
-- Examples,
--
-- > > compileM ".*" [] :: Either String Regex
-- > Right (Regex 0x000000004bb5b980 ".*")
--
-- > > compileM "*" [] :: Either String Regex
-- > Left "nothing to repeat"
--
compileM :: S.ByteString -> [PCREOption] -> Either String Regex
compileM str os = unsafePerformIO $
  S.useAsCString str $ \pattern -> do
    alloca $ \errptr       -> do
    alloca $ \erroffset    -> do
        pcre_ptr <- c_pcre_compile pattern (combineOptions os) errptr erroffset nullPtr
        if pcre_ptr == nullPtr
            then do
                err <- peekCString =<< peek errptr
                return (Left err)
            else do
                reg <- newForeignPtr finalizerFree pcre_ptr -- release with free()
                return (Right (Regex reg str))

-- Possible improvements: an 'IsString' instance could be defined
-- for 'Regex', which would allow the compiler to insert calls to
-- 'compile' based on the type:
--
-- The following would be valid:
--
-- > match "a.*b" "abcdef" []
--
-- and equivalent to:
--
-- > match (either error id (compile "a.*b")) "abcdef" []

-- | 'match'
--
-- Matches a compiled regular expression against a given subject string,
-- using a matching algorithm that is similar to Perl's. If the subject
-- string doesn't match the regular expression, 'Nothing' is returned,
-- otherwise the portion of the string that matched is returned, along
-- with any captured subpatterns.
--
-- The arguments are:
--
-- * 'regex', a PCRE regular expression value produced by compile
--
-- * 'subject', the subject string to match against
--
-- * 'options', an optional set of exec-time flags to exec.
--
-- Available runtime options are:
--
-- * 'exec_anchored'        - Match only at the first position
--
-- * 'exec_newline_any'     - Recognize any Unicode newline sequence
--
-- * 'exec_newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences
--
-- * 'exec_newline_cr'      - Set CR as the newline sequence
--
-- * 'exec_newline_crlf'    - Set CRLF as the newline sequence
--
-- * 'exec_newline_lf'      - Set LF as the newline sequence
--
-- * 'exec_notbol'          - Subject is not the beginning of a line
--
-- * 'exec_noteol'          - Subject is not the end of a line
--
-- * 'exec_notempty'        - An empty string is not a valid match
--
-- * 'exec_no_utf8_check'   - Do not check the subject for UTF-8
--
-- * 'exec_partial'         - Return PCRE_ERROR_PARTIAL for a partial match
--
-- The result value, and any captured subpatterns, are returned.
-- If the regex is invalid, or the subject string is empty, Nothing
-- is returned.
--
match :: Regex -> S.ByteString -> [PCREExecOption] -> Maybe [S.ByteString]
match (Regex pcre_fp _) subject os = unsafePerformIO $ do
  withForeignPtr pcre_fp $ \pcre_ptr -> do
    n_capt <- fullInfoInt pcre_ptr info_capturecount

    -- The smallest  size  for ovector that will allow for n captured
    -- substrings, in addition to the offsets  of  the  substring
    -- matched by the whole pattern, is (n+1)*3. (man pcreapi)

    let ovec_size = (n_capt + 1) * 3
        ovec_bytes = ovec_size * size_of_cint

    allocaBytes ovec_bytes $ \ovec -> do

        let (str_fp, off, len) = S.toForeignPtr subject
        withForeignPtr str_fp $ \cstr -> do
            r <- c_pcre_exec
                         pcre_ptr
                         nullPtr
                         (cstr `plusPtr` off) -- may contain binary zero bytes.
                         (fromIntegral len)
                         0
                         (combineExecOptions os)
                         ovec
                         (fromIntegral ovec_size)

            if r < 0 -- errors, or error_no_match
                then return Nothing
                else let loop n o acc =
                            if n == r
                              then return (Just (reverse acc))
                              else do
                                    i <- peekElemOff ovec $! o
                                    j <- peekElemOff ovec (o+1)
                                    let s = substring i j subject
                                    s `seq` loop (n+1) (o+2) (s : acc)
                     in loop 0 0 []

    -- The  first  two-thirds  of ovec is used to pass back captured
    -- substrings When  a  match  is  successful, information about captured
    -- substrings is returned in pairs of integers,  starting  at the
    -- beginning of ovector, and continuing up to two-thirds of its length at
    -- the most.  The first pair, ovector[0] and ovector[1], identify the
    -- portion of the subject string matched  by  the entire pattern.  The next
    -- pair is used for the first capturing subpattern,  and  so on.  The
    -- value returned  by pcre_exec() is one more than the highest num- bered
    -- pair that has been set. For  example,  if  two  sub- strings  have been
    -- captured, the returned value is 3.

  where
    -- The first element of a pair is set  to  the offset of the first
    -- character in a substring, and the second is set to the offset of the
    -- first character after  the  end of a substring.
    substring :: CInt -> CInt -> S.ByteString -> S.ByteString
    substring x y _ | x == y = S.empty -- XXX an unset subpattern
    substring a b s = end -- note that we're not checking...
        where
            start = S.unsafeDrop (fromIntegral a) s
            end   = S.unsafeTake (fromIntegral (b-a)) start


-- Wrapper around c_pcre_fullinfo for integer values
fullInfoInt pcre_ptr what =
  alloca $ \n_ptr -> do
    c_pcre_fullinfo pcre_ptr nullPtr what n_ptr
    return . fromIntegral =<< peek (n_ptr :: Ptr CInt)


-- | 'captureCount'
--
-- Returns the number of captures in a 'Regex'. Correctly ignores non-capturing groups
-- like @(?:abc)@.
--
-- >>> captureCount (compile "(?<one>abc) (def) (?:non-captured) (?<three>ghi)" [])
-- 3
captureCount :: Regex -> Int
captureCount (Regex pcre_fp _) = unsafePerformIO $
  withForeignPtr pcre_fp $ \pcre_ptr ->
    fullInfoInt pcre_ptr info_capturecount


-- | 'captureNames'
--
-- Returns the names and numbers of all named subpatterns in the regular
-- expression. Groups are zero-indexed. Unnamed groups are counted, but don't appear in the
-- result list.
--
-- >>> captureNames (compile "(?<one>abc) (def) (?<three>ghi)")
-- [("one", 0), ("three", 2)]
captureNames :: Regex -> [(S.ByteString, Int)]
captureNames (Regex pcre_fp _) = unsafePerformIO $
  withForeignPtr pcre_fp $ \pcre_ptr -> do
    count     <- fullInfoInt pcre_ptr info_namecount
    entrysize <- fullInfoInt pcre_ptr info_nameentrysize

    buf <- alloca $ \n_ptr -> do
      c_pcre_fullinfo pcre_ptr nullPtr info_nametable n_ptr
      buf <- peek n_ptr
      S.packCStringLen (buf, count*entrysize)

    let results = split entrysize buf
        zeroIndexed = fmap (subtract 1) <$> results
        sorted = sortBy (compare `on` snd) zeroIndexed
    return sorted

  where
    -- Split the nametable buffer into entries. Each entry has a fixed size in
    -- bytes. The first two bytes in each entry store the pattern number in
    -- big-endian format, the bytes following that contain the nul-terminated
    -- name of the subpattern.
    split :: Int -> S.ByteString -> [(S.ByteString, Int)]
    split entrysize buf
      | S.null buf = []
      | otherwise =
        let
          (entry, tail) = S.splitAt entrysize buf
          idx = fromIntegral . S.index entry
          num = idx 0 * 256 + idx 1
          name = S.takeWhile (/= 0) $ S.drop 2 entry
        in (name, num) : split entrysize tail