{-# OPTIONS_GHC -Wall -fwarn-tabs #-}
{-# LANGUAGE ForeignFunctionInterface #-}
----------------------------------------------------------------
--                                                    2010.10.09
-- |
-- Module      :  IsSpace
-- Copyright   :  Copyright (c) 2010 wren ng thornton
-- License     :  BSD
-- Maintainer  :  wren@community.haskell.org
-- Stability   :  experimental
-- Portability :  portable (FFI)
--
-- A benchmark for comparing different definitions of predicates
-- for detecting whitespace. As of the last run the results are:
--
-- * Data.Char.isSpace             : 14.44786 us +/- 258.0377 ns
-- * isSpace_DataChar              : 43.25154 us +/- 655.7037 ns
-- * isSpace_Char                  : 29.26598 us +/- 454.1445 ns
-- * isPerlSpace                   :
-- * Data.Attoparsec.Char8.isSpace : 81.87335 us +/- 1.195903 us
-- * isSpace_Char8                 : 11.84677 us +/- 178.9795 ns
-- * isSpace_w8                    : 11.55470 us +/- 133.7644 ns
----------------------------------------------------------------
module IsSpace (main) where

import qualified Data.Char             as C
import           Data.Word             (Word8)
import qualified Data.ByteString       as B
import qualified Data.ByteString.Char8 as B8
import           Foreign.C.Types       (CInt)
import           Criterion             (bench, nf)
import           Criterion.Main        (defaultMain)
----------------------------------------------------------------

----- Character predicates
-- N.B. \x9..\xD == "\t\n\v\f\r"

-- | Match the characters that Perl's @/\s/@ matches in Unicode
-- mode: the POSIX 1003.2 @[[:space:]]@ set with @\'\v\'@ removed,
-- plus the Unicode characters @\'\x85\'@, @\'\x2028\'@ and
-- @\'\x2029\'@. Note that @\'\x85\'@ lives in Latin-1 but not in
-- ASCII, so it is part of neither POSIX 1003.2 @[[:space:]]@ nor
-- the non-Unicode @/\s/@.
isPerlSpace :: Char -> Bool
isPerlSpace c
    -- Plain ASCII space.
    | c == ' '               = True
    -- Inside \t..\r everything counts except the vertical tab.
    | '\t' <= c && c <= '\r' = c /= '\v'
    -- Outside ASCII only NEL, LINE SEPARATOR and PARAGRAPH SEPARATOR.
    | otherwise              = c == '\x85' || c == '\x2028' || c == '\x2029'
{-# INLINE isPerlSpace #-}

-- | 'Data.Attoparsec.Char8.isSpace', duplicated here because it's
-- not exported. This is the definition as of attoparsec-0.8.1.0.
isSpace :: Char -> Bool
isSpace c = B8.elem c whitespace
  where
    -- Packed once and deliberately kept out of line.
    whitespace = B8.pack " \n\r\t\v\f"
    {-# NOINLINE whitespace #-}
{-# INLINE isSpace #-}

-- | Range-comparison alternative to 'Data.Attoparsec.Char8.isSpace':
-- accepts @\' \'@ and the contiguous run @\'\t\'@ through @\'\r\'@.
isSpace_Char8 :: Char -> Bool
isSpace_Char8 c = c == ' ' || (c >= '\t' && c <= '\r')
{-# INLINE isSpace_Char8 #-}

-- | Alternative to 'Data.Char.isSpace'. It applies the same range
-- trick as 'isSpace_Char8' and then falls through to the Unicode
-- checks, so that it yields the same answers as 'Data.Char.isSpace'
-- ('isSpace_Char8' on its own does not recognize Unicode whitespace).
isSpace_Char :: Char -> Bool
isSpace_Char c =
       c == ' '
    || (c >= '\t' && c <= '\r')
    || c == '\xA0'
    || iswspace (fromIntegral (C.ord c)) /= 0
{-# INLINE isSpace_Char #-}

-- C-side whitespace test, bound to the @u_iswspace@ symbol; a
-- non-zero result is treated as "is whitespace" by the callers here.
foreign import ccall unsafe "u_iswspace"
    iswspace :: CInt -> CInt

-- | Verbatim copy of 'Data.Char.isSpace' (that is,
-- 'GHC.Unicode.isSpace' as of base-4.2.0.2), kept in order to
-- investigate why 'isSpace_Char' is slower than 'Data.Char.isSpace'.
-- It appears to be something special in how the base library was
-- compiled.
isSpace_DataChar :: Char -> Bool
isSpace_DataChar c =
    c == ' '     ||
    c == '\t'    ||
    c == '\n'    ||
    c == '\r'    ||
    c == '\f'    ||
    c == '\v'    ||
    c == '\xa0'  ||
    iswspace (fromIntegral (C.ord c)) /= 0
{-# INLINE isSpace_DataChar #-}

-- | A 'Word8' version of 'Data.Attoparsec.Char8.isSpace'.
isSpace_w8 :: Word8 -> Bool
isSpace_w8 w = w == 32 || (w >= 9 && w <= 13)
{-# INLINE isSpace_w8 #-}
----------------------------------------------------------------

-- | Run every whitespace predicate over all 256 single-byte code
-- points under Criterion, forcing each result list to normal form.
main :: IO ()
main = defaultMain
    [ bench "Data.Char.isSpace"             $ nf (map C.isSpace)        latin1
    , bench "isSpace_DataChar"              $ nf (map isSpace_DataChar) latin1
    , bench "isSpace_Char"                  $ nf (map isSpace_Char)     latin1
    , bench "isPerlSpace"                   $ nf (map isPerlSpace)      latin1
    , bench "Data.Attoparsec.Char8.isSpace" $ nf (map isSpace)          latin1
    , bench "isSpace_Char8"                 $ nf (map isSpace_Char8)    latin1
    , bench "isSpace_w8"                    $ nf (map isSpace_w8)       octets
    ]
  where
    -- Input for the 'Char' predicates: code points 0 through 255.
    latin1 :: [Char]
    latin1 = ['\x0' .. '\xFF']

    -- The same range as raw bytes, for the 'Word8' predicate.
    octets :: [Word8]
    octets = [0 .. 255]
----------------------------------------------------------------
----------------------------------------------------------- fin.