{-
  Copyright (c) Meta Platforms, Inc. and affiliates.
  All rights reserved.

  This source code is licensed under the BSD-style license found in the
  LICENSE file in the root directory of this source tree.
-}

{-# OPTIONS_GHC -fno-warn-unused-imports #-}

{-# LANGUAGE DefaultSignatures #-}
{-# LANGUAGE CPP #-}

module Util.Text
  ( cStringLenToText
  , cStringLenToTextAndByteString
  , cStringLenToTextLenient
  , cStringToText
  , cStringToTextLenient
  , newCStringFromText
  , useMaybeTextAsCString
  , useTextAsCString
  , useTextAsCStringLen
  , useTextsAsCStrings
  , useTextsAsCStringLens
  , withCStrings
  , textShow
  , TextRead(..)
  , TextAndByteString(..)
  , TextOrByteString(..)
  , mkTextAndByteString
  , nl
  , capitalize
  , decapitalize
  , insertCommasAndAnd
  , toCamelCase
  , toPascalCase
  , toText
  , toUnderscore
  , isCfInfixOf
  , UnicodeException
  , Util.Text.withCStringLen
  , useAsPtr
  , InvalidConversion(..)
  , textToInt
  , hexToInt
  , hexToBytes
  , wrapText
  , slice
  ) where

import Control.Arrow (first)
import Control.Exception (Exception, throw)
import qualified Data.Text.Array as A
import Data.Binary
import Data.ByteString (ByteString, useAsCStringLen)
import qualified Data.ByteString as BS
import Data.Char
import Data.Function (on)
import Data.Hashable (Hashable(..))
import qualified Data.HashMap.Strict as HashMap
import Data.String
import Data.Text.Encoding (encodeUtf8, decodeUtf8, decodeUtf8', decodeUtf8With)
import Data.Text.Encoding.Error (UnicodeException, lenientDecode)
import Data.Typeable
import Foreign
import Foreign.C
import qualified Data.ByteString.Char8 as Char8
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Unsafe as B
import qualified Data.Text as Text
import Data.Text.Internal
import qualified Data.Text.Foreign as Text

import TextShow hiding (toText)
import Util.ByteString
import qualified Util.ASan as ASan

-- | Executes an 'IO' action with an array of 'CString's marshalled from
-- 'String'.
withCStrings :: [String] -> (Ptr CString -> IO a) -> IO a
withCStrings strs f = go [] strs
  where go cs (t:ts) = withCString t $ \c -> go (c:cs) ts
        go cs [] = withArray0 nullPtr (reverse cs) f

-- | Converts a null-terminated 'CString' to 'Text'.
cStringToText :: CString -> IO Text
cStringToText s = do
  bs <- B.unsafePackCString s
  return $! decodeUtf8 bs


cStringToTextLenient :: CString -> IO Text
cStringToTextLenient s = do
  bs <- B.unsafePackCString s
  return $! decodeUtf8With lenientDecode bs

-- | Converts a 'CString' with known length to 'Text'.
cStringLenToText :: CString -> Int -> IO Text
cStringLenToText s len = do
  bs <- B.unsafePackCStringLen (s, len)
  return $! decodeUtf8 bs

cStringLenToTextAndByteString :: CString -> Int -> IO TextAndByteString
cStringLenToTextAndByteString s len = do
  bs <- Char8.packCStringLen (s, len)
  return $! mkTextAndByteString bs

cStringLenToTextLenient :: CString -> Int -> IO Text
cStringLenToTextLenient s len = do
  bs <- B.unsafePackCStringLen (s, len)
  return $! decodeUtf8With lenientDecode bs

-- | Executes an 'IO' action with a 'CString' marshalled from 'Just'
-- 'Text', or a 'nullPtr' from 'Nothing'.
useMaybeTextAsCString :: Maybe Text -> (CString -> IO a) -> IO a
useMaybeTextAsCString (Just text) m = useTextAsCString text m
useMaybeTextAsCString Nothing m = m nullPtr

-- | Executes an 'IO' action with a 'Text' value marshalled as a UTF-8
-- 'CString'.
useTextAsCString :: Text -> (CString -> IO a) -> IO a
-- TODO(T24195918): We need to do this for ABI compatibility between
-- sigma.service and sigma.service.asan. Codemod to __SANITIZE_ADDRESS__
-- to enable ASAN checks for Haskell allocations.
#ifdef __HASKELL_SANITIZE_ADDRESS__
useTextAsCString = ASan.textWithCString
#else
useTextAsCString = BS.useAsCString . encodeUtf8
#endif

-- | Executes an 'IO' action with a 'Text' value marshalled as a UTF-8
-- 'CStringLen'.
useTextAsCStringLen :: Text -> (CStringLen -> IO a) -> IO a
#ifdef __HASKELL_SANITIZE_ADDRESS__
useTextAsCStringLen = ASan.textWithCStringLen
#else
useTextAsCStringLen = BS.useAsCStringLen . encodeUtf8
#endif

-- | Executes an 'IO' action with a '[Text]' value marshalled as a UTF-8
-- 'Ptr CStringLen'.
useTextsAsCStringLens :: [Text]
                      -> (Ptr CString -> Ptr CSize -> CSize -> IO a)
                      -> IO a
useTextsAsCStringLens = bsListAsCStrLenArr . map encodeUtf8

{-# INLINE withCStringLen #-}
withCStringLen :: Text -> (CStringLen -> IO a) -> IO a
#ifdef __HASKELL_SANITIZE_ADDRESS__
withCStringLen = ASan.textWithCStringLen
#else
withCStringLen = Text.withCStringLen
#endif

-- | Executes an 'IO' action with an array of 'CStrings' marshalled from
-- 'Text'.
useTextsAsCStrings :: [Text] -> (Ptr CString -> IO a) -> IO a
useTextsAsCStrings texts f = go [] texts
  where
  go cs (t:ts) =
    Util.ByteString.useAsCString (encodeUtf8 t) $ \c -> go (c:cs) ts
  go cs [] = withArray0 nullPtr (reverse cs) f

-- | Converts a 'Text' to a 'CString'. The resulting 'CString' must be
-- 'free'd.
newCStringFromText :: Text -> IO CString
newCStringFromText = newCStringFromLazyByteString
  . B.fromChunks . (:[]) . encodeUtf8

-- | A convenience wrapper for 'Show'.
textShow :: Show a => a -> Text
textShow = Text.pack . show

{-# INLINE useAsPtr #-}
useAsPtr :: Text -> (Ptr Word16 -> Text.I16 -> IO a) -> IO a
#ifdef __HASKELL_SANITIZE_ADDRESS__
useAsPtr = ASan.textUseAsPtr
#else
useAsPtr = Text.useAsPtr
#endif

-- | A pair of a 'Text' and a 'ByteString'. Useful for when we get a
-- value in one format and we want to lazily compute and cache the
-- conversion to the other format.
--
-- /Note:/ if/when conversion is free (i.e. 'Text' is stored as UTF-8)
-- then we can get rid of this.

data TextAndByteString = TextAndByteString
  { toTextOrError :: Either UnicodeException Text -- deliberately lazy
  , toByteString :: ByteString }
  deriving Typeable

toText :: TextAndByteString -> Text
toText v
  = case toTextOrError v of
      Left e -> throw e
      Right v -> v

instance Eq TextAndByteString where
  a == b = toByteString a == toByteString b

instance Binary TextAndByteString where
  put = put . toByteString
  get = mkTextAndByteString <$> get

-- We don't need to show both fields, and indeed the contents might
-- not be valid Text, so the Show instance must show only the
-- ByteString component.
instance Show TextAndByteString where
  showsPrec p tbs =
    showParen (p > 10)
    $ showString "mkTextAndByteString "
    . shows (Char8.unpack (fromTextAndByteString tbs))
    -- the showParen is necessary so that instead of e.g.
    --    Foo mkTextAndByteString ""
    -- we get
    --    Foo (mkTextAndByteString "")

instance Hashable TextAndByteString where
  hashWithSalt s = hashWithSalt s . toByteString

-- | Type-specialised constructor
mkTextAndByteString :: ByteString -> TextAndByteString
mkTextAndByteString = toTextAndByteString

instance IsString TextAndByteString where
  fromString xs = toTextAndByteString (Text.pack xs)

class TextOrByteString a where
  fromTextAndByteString :: TextAndByteString -> a
  toTextAndByteString :: a -> TextAndByteString

instance TextOrByteString Text where
  fromTextAndByteString = toText
  toTextAndByteString txt = TextAndByteString {
    toByteString = encodeUtf8 txt,
    toTextOrError = Right txt
  }

instance TextOrByteString ByteString where
  fromTextAndByteString = toByteString
  toTextAndByteString str = TextAndByteString {
    toByteString = str,
    toTextOrError = decodeUtf8' str
  }


-- | Create a number of newlines.
nl :: Int -> Text
nl = flip Text.replicate "\n"

-- | Capitalize the first non-whitespace character of a Text string.
capitalize :: Text -> Text
capitalize s = ws <> c <> cs
  where
    (ws, s') = Text.span isSpace s
    (c, cs) = first Text.toUpper . Text.splitAt 1 $ s'

-- | Lowercase the first non-whitespace character of a Text string.
decapitalize :: Text -> Text
decapitalize s =
  case Text.uncons t of
    Just (c, cs) | c /= toLower c -> ws <> Text.cons (toLower c) cs
    _ -> s
  where
    (ws, t) = Text.span isSpace s

toCamelCase :: Text -> Text
toCamelCase = decapitalize . toPascalCase

toPascalCase :: Text -> Text
toPascalCase t = Text.concat $ map capitalize (Text.splitOn "_" t)

toUnderscore :: Text -> Text
toUnderscore t = Text.intercalate "_" $
  map Text.toLower $ Text.groupBy (\_ i -> not $ isUpper i) t

-- | Search for substring whilst ignoring case.
isCfInfixOf :: Text -> Text -> Bool
isCfInfixOf = Text.isInfixOf `on` Text.toCaseFold

class TextRead a where
  readText :: Text -> Maybe a

  -- Default implementation for enumerations
  default readText :: (Bounded a, Enum a, TextShow a) => Text -> Maybe a
  readText =
    let
      m = HashMap.fromList [ (showt v, v) | v <- [minBound..maxBound] ]
    in \t -> HashMap.lookup t m
  {-# INLINE readText #-}

newtype InvalidConversion = InvalidConversion Text
  deriving (Typeable, Eq, Show)

instance Exception InvalidConversion

-- | Attempts to convert a decimal representation of an integer to an
-- 'Int'.  Returns 'Left' with an error message if the input 'Text'
-- was not a valid integer, or 'Right' with the value otherwise.
textToInt :: Text -> Either InvalidConversion Int
textToInt txt@(Text arr off len)
  | len == 0 = err
  | fromIntegral (A.unsafeIndex arr off) == ord '-' =
    if len > 1
       then negate <$> go 0 (off+1)
       else err
  | otherwise = go 0 off
  where
  -- Use Text internals for speed.  We know that digits are never in
  -- the upper range of UTF-16.
  go !n !i
    | i == len + off = Right n
    | c >= 0 && c <= 9 = go (n * 10 + c) (i + 1)
    | otherwise = err
    where
      c = fromIntegral (A.unsafeIndex arr i) - ord '0'

  err = Left . InvalidConversion $
    "textToInt: string \"" <> txt <> "\" is not an integer"

-- | Attempts to convert a hexadecimal representation of an integer to an
-- 'Int'.  Returns 'Left' with an error message if the input 'Text'
-- was not a valid integer, or 'Right' with the value otherwise.
hexToInt :: Text -> Either InvalidConversion Int
hexToInt txt@(Text arr off len)
  | len == 0 = err
  | fromIntegral (A.unsafeIndex arr off) == ord '-' =
    if len > 1
       then negate <$> go 0 (off+1)
       else err
  | otherwise = go 0 off
  where
  -- Use Text internals for speed.  We know that digits are never in
  -- the upper range of UTF-16.
  go !n !i
    | i == len + off = Right n
    | c >= ord '0' && c <= ord '9' = go (n * 16 + c - ord '0') (i + 1)
    | c >= ord 'a' && c <= ord 'f' = go (n * 16 + c - ord 'a' + 10) (i + 1)
    | c >= ord 'A' && c <= ord 'F' = go (n * 16 + c - ord 'A' + 10) (i + 1)
    | otherwise = err
    where
      c = fromIntegral (A.unsafeIndex arr i)

  err = Left $ InvalidConversion $
          "hexToInt: string \"" <> txt <> "\" is not an integer"

-- | Convert a hexadecimal representation of an integer to a
-- big-endian ByteString.
hexToBytes :: Text -> Either InvalidConversion ByteString
hexToBytes txt@(Text arr off len)
  | len == 0 = err
  | otherwise = go [] (off+len-1)
  where
  go acc !i
    | i < off = Right (BS.pack acc)
    | i == off, Just x <- nibble l = Right (BS.pack (x:acc))
    | Just x <- nibble l, Just y <- nibble h =
        go (y `shiftL` 4 + x : acc) (i-2)
    | otherwise = err
    where
      l = fromIntegral (A.unsafeIndex arr i)
      h = fromIntegral (A.unsafeIndex arr (i-1))

  {-# INLINE nibble #-}
  nibble :: Int -> Maybe Word8
  nibble x
    | x >= ord '0' && x <= ord '9' = Just (fromIntegral (x - ord '0'))
    | x >= ord 'a' && x <= ord 'f' = Just (10 + fromIntegral (x - ord 'a'))
    | otherwise = Nothing

  err = Left $ InvalidConversion $
    "hexToBytes: string \"" <> txt <> "\" is not a hex integer"

insertCommasAndAnd :: [Text] -> Text
insertCommasAndAnd [] = ""
insertCommasAndAnd [x] = x
insertCommasAndAnd [x, y] = x <> " and " <> y
insertCommasAndAnd [x, y, z] = x <> ", " <> y <> ", and " <> z
insertCommasAndAnd (x : xs) = x <> ", " <> insertCommasAndAnd xs

-- | Break a Text into lines such that each one fits within the given number of
-- characters.
-- NB: this function does not preserve existing whitespace formatting.
wrapText
  :: Int  -- ^ Indent Amount
  -> Int  -- ^ Max characters per line
  -> Text -- ^ Text to wrap
  -> [Text]
wrapText indent maxLength text = concatMap wrapLine $ Text.lines text
  where
    wrapLine line = case Text.words line of
      [] -> []
      [word] -> [word]
      w:ws -> wrap (margin <> w) (indent + Text.length w) ws

    wrap acc _ [] = [acc]
    wrap acc n (w:ws)
      -- If adding the word exceeds the limit, then wrap
      | len > maxLength = acc : wrap (margin <> w) (indent + Text.length w) ws
      -- Otherwise, add to the current line. Not that there is no length check
      -- here. If the word is longer than the max line length then we will put
      -- it in anyway.
      | otherwise = wrap (acc <> " " <> w) len ws
      where
        len = n + Text.length w + 1
    margin = Text.replicate indent " "

-- | The obvious implementation has a performance bug in platform < 011:
--    https://github.com/channable/haskell-string-slicing-benchmarks
slice :: Int -> Int -> Text -> Text
#if MIN_VERSION_text(2,0,0)
slice from len = Text.take len . Text.drop from
#else
slice from len t =
  let !t' = Text.drop from t in Text.take len t'
#endif