{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedRecordDot #-}
{-# LANGUAGE PatternSynonyms #-}

-- | Create a map of the lines in the file to allow fast seeking later.
-- Specifically, for each line, we output:
--
-- - the byte offset from the start of the file of the start of the line
-- - the length of the line in number of bytes (including the line terminator, if any)
-- - the type of line terminator that ended the line, if any
-- - the non-decoded bytes of that line.
--
-- There is an associated file format to serialize this data, based on CSV.
-- See documentation for 'display'.
--
-- Currently, we only support utf8-encoded text with Unix line-endings (LF).
module Text.Newline.LineMap
  ( Line(..)
  , display
  , breakLines_unixUtf8
  , breakLine_unixUtf8
  ) where

import Prelude hiding (length)

import Text.Newline (Newline,pattern Unix)

import qualified Data.ByteString.Lazy as LBS

-- | Holds a detected line.
-- The main result type for this module.
data Line a = Line
  { forall a. Line a -> Int
startOffset :: {-# UNPACK #-} !Int -- ^ offset in bytes of the start of the line from the start of the input file
  , forall a. Line a -> a
content :: a -- ^ generally, does not include newline
  , forall a. Line a -> Maybe Newline
nlType :: Maybe Newline -- ^ the terminator for this line, if any
  , forall a. Line a -> Int
length :: {-# UNPACK #-} !Int -- ^ length of the line in bytes, including the line terminator
  }
  deriving ((forall a b. (a -> b) -> Line a -> Line b)
-> (forall a b. a -> Line b -> Line a) -> Functor Line
forall a b. a -> Line b -> Line a
forall a b. (a -> b) -> Line a -> Line b
forall (f :: * -> *).
(forall a b. (a -> b) -> f a -> f b)
-> (forall a b. a -> f b -> f a) -> Functor f
<$ :: forall a b. a -> Line b -> Line a
$c<$ :: forall a b. a -> Line b -> Line a
fmap :: forall a b. (a -> b) -> Line a -> Line b
$cfmap :: forall a b. (a -> b) -> Line a -> Line b
Functor)

-- | Render contents for a linemap file.
--
-- The format is simply a three-colum CSV with header row.
-- The columns are offset, length, and terminator, as above.
-- Offset and length are decimal-encoded unsigned integers.
-- The terminator column must hold one of the following strings:
--
-- - @unix@ for LF (ASCII 0x0A),
-- - @dos@ for CRLF (ASCOO 0x0D 0x0A),
-- - @eof@ for end of file/input.
--
-- The output CSV does not require quoting,
--   so the output actually abides by RFC 4180
--   (with the exception that I'm using LF instead of CRLF, sigh).
display :: [Line a] -> String
display :: forall a. [Line a] -> [Char]
display [Line a]
ls0 = [Char]
"offset,length,terminator\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Line a] -> [Char]
forall {a} {a} {r}.
(Show a, Show a, HasField "length" r a, HasField "startOffset" r a,
 HasField "nlType" r (Maybe Newline)) =>
[r] -> [Char]
go [Line a]
ls0
  where
  go :: [r] -> [Char]
go [] = [Char]
""
  go (r
l:[r]
ls) = r -> [Char]
forall {a} {a} {r}.
(Show a, Show a, HasField "length" r a, HasField "startOffset" r a,
 HasField "nlType" r (Maybe Newline)) =>
r -> [Char]
go1 r
l [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [r] -> [Char]
go [r]
ls
  go1 :: r -> [Char]
go1 r
l = [[Char]] -> [Char]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat
    [ a -> [Char]
forall a. Show a => a -> [Char]
show r
l.startOffset
    , [Char]
","
    , a -> [Char]
forall a. Show a => a -> [Char]
show r
l.length
    , [Char]
","
    , case r
l.nlType of
      Maybe Newline
Nothing -> [Char]
"eof"
      Just Newline
Unix -> [Char]
"unix"
      Maybe Newline
_ -> [Char]
"<UNKNOWN>"
    , [Char]
"\n"
    ]

-- | Split input into lines.
-- Assumes utf8-encoded text with LF (ASCII 0x0A) line terminators.
-- See 'breakLine_unixUtf8' to take a single line.
--
-- Does not include newlines in any 'Line' 'content'.
breakLines_unixUtf8 ::
     LBS.ByteString -- ^ all bytes of a file
  -> [Line LBS.ByteString]
breakLines_unixUtf8 :: ByteString -> [Line ByteString]
breakLines_unixUtf8 = Int -> ByteString -> [Line ByteString]
go Int
0
  where
  go :: Int -> ByteString -> [Line ByteString]
go Int
_ ByteString
bs | ByteString -> Bool
LBS.null ByteString
bs = []
  go Int
off ByteString
bs =
    let (Line ByteString
l, ByteString
bs') = Int -> ByteString -> (Line ByteString, ByteString)
breakLine_unixUtf8 Int
off ByteString
bs
        off' :: Int
off' = Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Line ByteString
l.length
     in Line ByteString
l Line ByteString -> [Line ByteString] -> [Line ByteString]
forall a. a -> [a] -> [a]
: Int -> ByteString -> [Line ByteString]
go Int
off' ByteString
bs'

-- | Take one line of input, and also return the remaining input.
-- Assumes utf8-encoded text with LF (ASCII 0x0A) line terminators.
-- See 'breakLines_unixUtf8' to produce a list of all lines.
--
-- Does not include newlines in any 'Line' 'content'.
breakLine_unixUtf8 ::
     Int -- ^ byte offset within file of input
  -> LBS.ByteString -- ^ non-empty input bytes
  -> (Line LBS.ByteString, LBS.ByteString) -- ^ resuling line and remaining input
breakLine_unixUtf8 :: Int -> ByteString -> (Line ByteString, ByteString)
breakLine_unixUtf8 Int
off ByteString
bs =
  let (ByteString
pre, ByteString
atpost) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
LBS.break (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
==Word8
0x0A) ByteString
bs
   in case ByteString -> Maybe (Word8, ByteString)
LBS.uncons ByteString
atpost of
    Maybe (Word8, ByteString)
Nothing -> (Line ByteString
l, ByteString
atpost)
      where
      l :: Line ByteString
l = Line
        { startOffset :: Int
startOffset = Int
off
        , content :: ByteString
content = ByteString
pre
        , nlType :: Maybe Newline
nlType = Maybe Newline
forall a. Maybe a
Nothing
        , length :: Int
length = Int64 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int64 -> Int) -> Int64 -> Int
forall a b. (a -> b) -> a -> b
$ ByteString -> Int64
LBS.length ByteString
pre
        }
    Just (Word8
0x0A, ByteString
post) -> (Line ByteString
l, ByteString
post)
      where
      l :: Line ByteString
l = Line
        { startOffset :: Int
startOffset = Int
off
        , content :: ByteString
content = ByteString
pre
        , nlType :: Maybe Newline
nlType = Newline -> Maybe Newline
forall a. a -> Maybe a
Just Newline
Unix
        , length :: Int
length = Int64 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int64 -> Int) -> Int64 -> Int
forall a b. (a -> b) -> a -> b
$ ByteString -> Int64
LBS.length ByteString
pre Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ Int64
1
        }
    Just (Word8
c, ByteString
_) -> [Char] -> (Line ByteString, ByteString)
forall a. [Char] -> a
errorWithoutStackTrace ([Char] -> (Line ByteString, ByteString))
-> [Char] -> (Line ByteString, ByteString)
forall a b. (a -> b) -> a -> b
$
      [Char]
"internal error: newline delimited by byte " [Char] -> [Char] -> [Char]
forall a. Semigroup a => a -> a -> a
<> Word8 -> [Char]
forall a. Show a => a -> [Char]
show Word8
c