{-# LANGUAGE BangPatterns, OverloadedStrings, RecordWildCards #-}
module Data.Text.Offset
( OffsetTable
, createOffsetTable
, lineColToByteOffsetDetail
, lineColToByteOffset
, OffsetError(..)
, OverLineKind(..)
) where
import Control.Monad (unless)
import Data.Foldable (foldl')
import Data.Char (ord)
import qualified Data.Text.Lazy as TL
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as UV
type CharOffset = Int
type ByteOffset = Int
type LineOffsets = (CharOffset, ByteOffset)
type CharBytes = Int
type MultibyteChar = (Int, CharBytes)
type ContentCharCount = Int
type LineCharInfo = (ContentCharCount, UV.Vector MultibyteChar)
data OffsetTable = OffsetTable
{ lineOffsets :: !(UV.Vector LineOffsets)
, lineCharInfo :: !(V.Vector LineCharInfo)
} deriving (Eq, Show)
createOffsetTable :: TL.Text -> OffsetTable
createOffsetTable
= adjust
. foldl' addLineOffsets [((0, 0), undefined)]
. TL.splitOn "\n"
where
addLineOffsets
:: [(LineOffsets, LineCharInfo)]
-> TL.Text
-> [(LineOffsets, LineCharInfo)]
addLineOffsets ofs@(((cs,bs), _):_) s =
let !charCount = fromIntegral (TL.length s)
multis = multibytes s
!byteCount = bytesUntilCharPosition charCount multis
!newOffs = (cs + charCount + 1, bs + byteCount + 1)
in (newOffs, (charCount, multis)):ofs
addLineOffsets _ _ = error "addLineOffsets call with empty accumulator"
adjust :: [(LineOffsets, LineCharInfo)] -> OffsetTable
adjust xs =
let (ofs, info) = unzip xs
in OffsetTable
{ lineOffsets = UV.fromList . reverse . tail $ ofs
, lineCharInfo = V.fromList . tail . reverse $ info
}
data OffsetError
= NoSuchLine
| EmptyLine
| OverLineEnd !ByteOffset !OverLineKind
deriving (Eq, Ord, Show)
data OverLineKind
= JustAtLineEnd
| AfterLineEnd
deriving (Eq, Ord, Show)
lineColToByteOffsetDetail
:: OffsetTable -> Int -> Int
-> Either OffsetError Int
lineColToByteOffsetDetail OffsetTable{..} line col = do
(_, lineByteOffs) <- note NoSuchLine (lineOffsets UV.!? line)
(charCount, multis) <- note NoSuchLine (lineCharInfo V.!? line)
unless (charCount > 0) (Left EmptyLine)
let fileOffset = bytesUntilCharPosition col multis + lineByteOffs
unless (col < charCount) $
let kind = if col == charCount then JustAtLineEnd else AfterLineEnd
in Left $! OverLineEnd fileOffset kind
return $! fileOffset
where
note :: e -> Maybe a -> Either e a
note e = maybe (Left e) Right
lineColToByteOffset :: OffsetTable -> Int -> Int -> Maybe Int
lineColToByteOffset t l c = case lineColToByteOffsetDetail t l c of
Left _ -> Nothing
Right a -> Just a
bytesUntilCharPosition :: Int -> UV.Vector MultibyteChar -> Int
bytesUntilCharPosition n ms
| UV.null ms = n
| otherwise =
let multis = UV.map snd . UV.takeWhile ((< n) . fst) $ ms
!res = n - UV.length multis + UV.sum multis
in res
multibytes :: TL.Text -> UV.Vector MultibyteChar
multibytes =
UV.fromList . reverse . snd . TL.foldl' addMulti (0, [])
where
addMulti
:: (CharOffset, [MultibyteChar]) -> Char
-> (CharOffset, [MultibyteChar])
addMulti (coffs, acc) c =
let !n = charBytes c
!acc1 | n == 1 = acc
| otherwise = (coffs, n):acc
!coffs1 = coffs + 1
in (coffs1, acc1)
charBytes :: Char -> Int
charBytes = ordBytes . ord
where
ordBytes c
| c < 0x80 = 1
| c < 0x0800 = 2
| c < 0x10000 = 3
| otherwise = 4