{-# LANGUAGE BangPatterns, RecordWildCards, CPP #-}
-- |
-- Module : Data.Text.ICU.Break.Pure
-- Copyright : (c) 2010 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- String breaking functions for Unicode, implemented as bindings to
-- the International Components for Unicode (ICU) libraries.
--
-- The text boundary positions are found according to the rules described in
-- Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex
-- #14, Line Breaking Properties. These are available at
-- <http://www.unicode.org/reports/tr29/> and
-- <http://www.unicode.org/reports/tr14/>.
module Data.Text.ICU.Break.Pure
(
-- * Types
Breaker
, Break
, brkPrefix
, brkBreak
, brkSuffix
, brkStatus
, Line(..)
, Data.Text.ICU.Break.Word(..)
-- * Breaking functions
, breakCharacter
, breakLine
, breakSentence
, breakWord
-- * Iteration
, breaks
, breaksRight
) where
import Control.DeepSeq (NFData(..))
import Data.Text (Text, empty)
import Data.Text.ICU.Break (Line, Word)
import Data.Text.ICU.Break.Types (BreakIterator(..))
import Data.Text.ICU.Internal (LocaleName, takeWord, dropWord)
import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO)
import qualified Data.Text.ICU.Break as IO
-- | A boundary analyser.
--
-- The type parameter @a@ is the type of the status attached to every
-- 'Break' that 'breaks' and 'breaksRight' produce from this analyser.
-- Values are obtained from 'breakCharacter', 'breakLine', 'breakSentence'
-- or 'breakWord'; the constructor itself is not exported.
newtype Breaker a = B (BreakIterator a)
-- Turn an iterator-building action into a pure 'Breaker' for one locale.
--
-- The action is run once over 'empty' text.  Using 'unsafePerformIO' here
-- relies on how the result is consumed in this module: 'breaks' and
-- 'breaksRight' only ever pass the wrapped iterator to @IO.clone@ and set
-- the text on the copy, so the iterator created here is never re-pointed.
new :: (LocaleName -> Text -> IO (BreakIterator a)) -> LocaleName -> Breaker a
new mkIterator locale = unsafePerformIO (fmap B (mkIterator locale empty))
-- | Build a 'Breaker' that splits a string on character boundaries.
--
-- Character boundary analysis locates the edges of "Extended Grapheme
-- Clusters": groups of codepoints that many text operations should treat
-- as a single character-like unit.  Unicode Standard Annex #29, Unicode
-- Text Segmentation, <http://www.unicode.org/reports/tr29/> has more
-- information on grapheme clusters and guidelines on their use.
--
-- The status type is @()@: breaks produced by this analyser carry no
-- meaningful status.
breakCharacter :: LocaleName -> Breaker ()
breakCharacter loc = new IO.breakCharacter loc
-- | Build a 'Breaker' that splits a string on line boundaries.
--
-- Line boundary analysis finds the positions at which a text string may
-- be broken when wrapping lines.  Punctuation and hyphenated words are
-- handled correctly by the mechanism.
--
-- Breaks produced by this analyser carry a 'Line' status.
breakLine :: LocaleName -> Breaker Line
breakLine loc = new IO.breakLine loc
-- | Build a 'Breaker' that splits a string on sentence boundaries.
--
-- Sentence boundary analysis makes it possible to select sentences while
-- interpreting correctly the periods that occur inside numbers and
-- abbreviations, as well as trailing punctuation marks such as quotation
-- marks and parentheses.
--
-- The status type is @()@: breaks produced by this analyser carry no
-- meaningful status.
breakSentence :: LocaleName -> Breaker ()
breakSentence loc = new IO.breakSentence loc
-- | Build a 'Breaker' that splits a string on word boundaries.
--
-- Word boundary analysis is what search-and-replace functions use, and
-- what text editors use to let a user select a word with a double click.
-- Word selection interprets punctuation marks within and following words
-- correctly.  Characters that do not belong to a word, such as symbols or
-- punctuation marks, have a word break on each side.
--
-- Breaks produced by this analyser carry a 'Data.Text.ICU.Break.Word'
-- status.
breakWord :: LocaleName -> Breaker Data.Text.ICU.Break.Word
breakWord loc = new IO.breakWord loc
-- | A break in a string: one piece of the input found by boundary
-- analysis, together with the input text on either side of it.
data Break a = Break
    { brkPrefix :: {-# UNPACK #-} !Text
      -- ^ Prefix of the current break: the input that precedes it.
    , brkBreak  :: {-# UNPACK #-} !Text
      -- ^ Text of the current break.
    , brkSuffix :: {-# UNPACK #-} !Text
      -- ^ Suffix of the current break: the input that follows it.
    , brkStatus :: !a
      -- ^ Status of the current break (only meaningful if 'Line' or
      -- 'Word').
    } deriving (Eq, Show)
instance (NFData a) => NFData (Break a) where
    -- The three 'Text' fields are declared strict, so matching on the
    -- constructor has already evaluated them; only the status is forced
    -- explicitly.  (Presumably a strict 'Text' needs no deeper forcing --
    -- confirm against the text package's own 'NFData' instance.)
    rnf (Break _ _ _ status) = rnf status
-- | Return a list of all breaks in a string, from left to right.
--
-- Each element holds the text between two consecutive boundaries, the
-- input on either side of it, and the status read from the iterator once
-- it has stepped onto the boundary that ends the piece.
breaks :: Breaker a -> Text -> [Break a]
breaks (B proto) txt = unsafePerformIO $ do
    -- Work on a private copy: only the clone is pointed at @txt@ and
    -- advanced, so the iterator held by the 'Breaker' is left alone.
    iter <- IO.clone proto
    IO.setText iter txt
    let -- Advance to the boundary after @start@; finish when there is none.
        walk start = IO.next iter >>= maybe (return []) (emit start)
        -- Package the text between @start@ and @stop@, then continue from
        -- @stop@.  The recursion stays inside IO, so once the traversal
        -- begins it visits every remaining boundary before returning.
        emit start stop = do
          status <- IO.getStatus iter
          let width = stop - start
              rest  = dropWord start txt
              piece = Break (takeWord start txt)
                            (takeWord width rest)
                            (dropWord width rest)
                            status
          (piece :) `fmap` walk stop
    unsafeInterleaveIO (IO.first iter >>= walk)
-- | Return a list of all breaks in a string, from right to left.
--
-- Each element holds the text between two consecutive boundaries and the
-- input on either side of it.  The status of a piece is read while the
-- iterator still sits on the boundary that /ends/ the piece, i.e. before
-- stepping back to the boundary that starts it.  This is the same boundary
-- at which 'breaks' reads the status (there, after stepping forward onto
-- it), so both traversals attach the status to the same piece of text.
breaksRight :: Breaker a -> Text -> [Break a]
breaksRight (B b) t = unsafePerformIO $ do
  -- Work on a private copy: only the clone is pointed at @t@ and moved.
  bi <- IO.clone b
  IO.setText bi t
  let go p = do
        -- The iterator is at boundary @p@ here.  ICU reports the rule
        -- status of the boundary the iterator is currently on, and that
        -- status describes the text ending at the boundary.  It must
        -- therefore be read before 'IO.previous' moves the iterator;
        -- reading it afterwards would yield the status of the piece to
        -- the left of the one being emitted.
        s <- IO.getStatus bi
        mix <- IO.previous bi
        case mix of
          -- No earlier boundary: the start of the text has been reached.
          Nothing -> return []
          Just n -> do
            let d = p-n             -- length of the piece between n and p
                u = dropWord n t    -- input from boundary n onwards
            (Break (takeWord n t) (takeWord d u) (dropWord d u) s :) `fmap` go n
  -- Start from the last boundary and walk towards the front.
  unsafeInterleaveIO $ go =<< IO.last bi