{-# LANGUAGE TupleSections #-}
{-# LANGUAGE OverloadedStrings #-}
module Text.Pandoc.Readers.JATS ( readJATS ) where
import Control.Monad.State.Strict
import Data.Char (isDigit, isSpace, toUpper)
import Data.Default
import Data.Generics
import Data.List (foldl', intersperse)
import qualified Data.Map as Map
import Data.Maybe (maybeToList, fromMaybe)
import Data.Text (Text)
import qualified Data.Text as T
import Text.HTML.TagSoup.Entity (lookupEntity)
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import Text.Pandoc.Options
import Text.Pandoc.Shared (crFilter, safeRead)
import Text.TeXMath (readMathML, writeTeX)
import Text.XML.Light
import qualified Data.Set as S (fromList, member)
import Data.Set ((\\))
type JATS m = StateT JATSState m
data JATSState = JATSState{ jatsSectionLevel :: Int
, jatsQuoteType :: QuoteType
, jatsMeta :: Meta
, jatsBook :: Bool
, jatsContent :: [Content]
} deriving Show
instance Default JATSState where
def = JATSState{ jatsSectionLevel = 0
, jatsQuoteType = DoubleQuote
, jatsMeta = mempty
, jatsBook = False
, jatsContent = [] }
readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readJATS _ inp = do
let tree = normalizeTree . parseXML
$ T.unpack $ crFilter inp
(bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree
return $ Pandoc (jatsMeta st') (toList . mconcat $ bs)
normalizeTree :: [Content] -> [Content]
normalizeTree = everywhere (mkT go)
where go :: [Content] -> [Content]
go (Text (CData CDataRaw _ _):xs) = xs
go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
Text (CData CDataText (s1 ++ s2) z):xs
go (Text (CData CDataText s1 z):CRef r:xs) =
Text (CData CDataText (s1 ++ convertEntity r) z):xs
go (CRef r:Text (CData CDataText s1 z):xs) =
Text (CData CDataText (convertEntity r ++ s1) z):xs
go (CRef r1:CRef r2:xs) =
Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs
go xs = xs
convertEntity :: String -> String
convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e)
attrValue :: String -> Element -> Text
attrValue attr =
fromMaybe "" . maybeAttrValue attr
maybeAttrValue :: String -> Element -> Maybe Text
maybeAttrValue attr elt =
T.pack <$> lookupAttrBy (\x -> qName x == attr) (elAttribs elt)
named :: String -> Element -> Bool
named s e = qName (elName e) == s
addMeta :: PandocMonad m => ToMetaValue a => Text -> a -> JATS m ()
addMeta field val = modify (setMeta field val)
instance HasMeta JATSState where
setMeta field v s = s {jatsMeta = setMeta field v (jatsMeta s)}
deleteMeta field s = s {jatsMeta = deleteMeta field (jatsMeta s)}
isBlockElement :: Content -> Bool
isBlockElement (Elem e) = qName (elName e) `S.member` blocktags
where blocktags = S.fromList (paragraphLevel ++ lists ++ mathML ++ other) \\ S.fromList inlinetags
paragraphLevel = ["address", "array", "boxed-text", "chem-struct-wrap",
"code", "fig", "fig-group", "graphic", "media", "preformat",
"supplementary-material", "table-wrap", "table-wrap-group",
"alternatives", "disp-formula", "disp-formula-group"]
lists = ["def-list", "list"]
mathML = ["tex-math", "mml:math"]
other = ["p", "related-article", "related-object", "ack", "disp-quote",
"speech", "statement", "verse-group", "x"]
inlinetags = ["email", "ext-link", "uri", "inline-supplementary-material",
"related-article", "related-object", "hr", "bold", "fixed-case",
"italic", "monospace", "overline", "overline-start", "overline-end",
"roman", "sans-serif", "sc", "strike", "underline", "underline-start",
"underline-end", "ruby", "alternatives", "inline-graphic", "private-char",
"chem-struct", "inline-formula", "tex-math", "mml:math", "abbrev",
"milestone-end", "milestone-start", "named-content", "styled-content",
"fn", "target", "xref", "sub", "sup", "x", "address", "array",
"boxed-text", "chem-struct-wrap", "code", "fig", "fig-group", "graphic",
"media", "preformat", "supplementary-material", "table-wrap",
"table-wrap-group", "disp-formula", "disp-formula-group",
"citation-alternatives", "element-citation", "mixed-citation",
"nlm-citation", "award-id", "funding-source", "open-access",
"def-list", "list", "ack", "disp-quote", "speech", "statement",
"verse-group"]
isBlockElement _ = False
trimNl :: Text -> Text
trimNl = T.dropAround (== '\n')
getGraphic :: PandocMonad m
=> Maybe (Inlines, Text) -> Element -> JATS m Inlines
getGraphic mbfigdata e = do
let atVal a = attrValue a e
(ident, title, capt) =
case mbfigdata of
Just (capt', i) -> (i, "fig:" <> atVal "title", capt')
Nothing -> (atVal "id", atVal "title",
text (atVal "alt-text"))
attr = (ident, T.words $ atVal "role", [])
imageUrl = atVal "href"
return $ imageWith attr imageUrl title capt
getBlocks :: PandocMonad m => Element -> JATS m Blocks
getBlocks e = mconcat <$>
mapM parseBlock (elContent e)
parseBlock :: PandocMonad m => Content -> JATS m Blocks
parseBlock (Text (CData CDataRaw _ _)) = return mempty
parseBlock (Text (CData _ s _)) = if all isSpace s
then return mempty
else return $ plain $ trimInlines $ text $ T.pack s
parseBlock (CRef x) = return $ plain $ str $ T.toUpper $ T.pack x
parseBlock (Elem e) =
case qName (elName e) of
"p" -> parseMixed para (elContent e)
"code" -> codeBlockWithLang
"preformat" -> codeBlockWithLang
"disp-quote" -> parseBlockquote
"list" -> case attrValue "list-type" e of
"bullet" -> bulletList <$> listitems
listType -> do
let start = fromMaybe 1 $
(filterElement (named "list-item") e
>>= filterElement (named "label"))
>>= safeRead . textContent
orderedListWith (start, parseListStyleType listType, DefaultDelim)
<$> listitems
"def-list" -> definitionList <$> deflistitems
"sec" -> gets jatsSectionLevel >>= sect . (+1)
"graphic" -> para <$> getGraphic Nothing e
"journal-meta" -> parseMetadata e
"article-meta" -> parseMetadata e
"custom-meta" -> parseMetadata e
"title" -> return mempty
"label" -> return mempty
"table" -> parseTable
"fig" -> parseFigure
"fig-group" -> divWith (attrValue "id" e, ["fig-group"], [])
<$> getBlocks e
"table-wrap" -> divWith (attrValue "id" e, ["table-wrap"], [])
<$> getBlocks e
"caption" -> divWith (attrValue "id" e, ["caption"], []) <$> sect 6
"ref-list" -> parseRefList e
"?xml" -> return mempty
_ -> getBlocks e
where parseMixed container conts = do
let (ils,rest) = break isBlockElement conts
ils' <- (trimInlines . mconcat) <$> mapM parseInline ils
let p = if ils' == mempty then mempty else container ils'
case rest of
[] -> return p
(r:rs) -> do
b <- parseBlock r
x <- parseMixed container rs
return $ p <> b <> x
codeBlockWithLang = do
let classes' = case attrValue "language" e of
"" -> []
x -> [x]
return $ codeBlockWith (attrValue "id" e, classes', [])
$ trimNl $ textContentRecursive e
parseBlockquote = do
attrib <- case filterChild (named "attribution") e of
Nothing -> return mempty
Just z -> (para . (str "— " <>) . mconcat)
<$>
mapM parseInline (elContent z)
contents <- getBlocks e
return $ blockQuote (contents <> attrib)
parseListStyleType "roman-lower" = LowerRoman
parseListStyleType "roman-upper" = UpperRoman
parseListStyleType "alpha-lower" = LowerAlpha
parseListStyleType "alpha-upper" = UpperAlpha
parseListStyleType _ = DefaultStyle
listitems = mapM getBlocks $ filterChildren (named "list-item") e
deflistitems = mapM parseVarListEntry $ filterChildren
(named "def-item") e
parseVarListEntry e' = do
let terms = filterChildren (named "term") e'
let items = filterChildren (named "def") e'
terms' <- mapM getInlines terms
items' <- mapM getBlocks items
return (mconcat $ intersperse (str "; ") terms', items')
parseFigure =
case filterChildren (named "graphic") e of
[g] -> do
capt <- case filterChild (named "caption") e of
Just t -> mconcat .
intersperse linebreak <$>
mapM getInlines
(filterChildren (const True) t)
Nothing -> return mempty
img <- getGraphic (Just (capt, attrValue "id" e)) g
return $ para img
_ -> divWith (attrValue "id" e, ["fig"], []) <$> getBlocks e
parseTable = do
let isCaption x = named "title" x || named "caption" x
capt <- case filterChild isCaption e of
Just t -> getInlines t
Nothing -> return mempty
let e' = fromMaybe e $ filterChild (named "tgroup") e
let isColspec x = named "colspec" x || named "col" x
let colspecs = case filterChild (named "colgroup") e' of
Just c -> filterChildren isColspec c
_ -> filterChildren isColspec e'
let isRow x = named "row" x || named "tr" x
headrows <- case filterChild (named "thead") e' of
Just h -> case filterChild isRow h of
Just x -> parseRow x
Nothing -> return []
Nothing -> return []
bodyrows <- case filterChild (named "tbody") e' of
Just b -> mapM parseRow
$ filterChildren isRow b
Nothing -> mapM parseRow
$ filterChildren isRow e'
let toAlignment c = case findAttr (unqual "align") c of
Just "left" -> AlignLeft
Just "right" -> AlignRight
Just "center" -> AlignCenter
_ -> AlignDefault
let toWidth c = do
w <- findAttrText (unqual "colwidth") c
n <- safeRead $ "0" <> T.filter (\x -> isDigit x || x == '.') w
if n > 0 then Just n else Nothing
let numrows = foldl' max 0 $ map length bodyrows
let aligns = case colspecs of
[] -> replicate numrows AlignDefault
cs -> map toAlignment cs
let widths = case colspecs of
[] -> replicate numrows ColWidthDefault
cs -> let ws = map toWidth cs
in case sequence ws of
Just ws' -> let tot = sum ws'
in ColWidth . (/ tot) <$> ws'
Nothing -> replicate numrows ColWidthDefault
let toRow = Row nullAttr . map simpleCell
toHeaderRow l = if null l then [] else [toRow l]
return $ table (simpleCaption $ plain capt)
(zip aligns widths)
(TableHead nullAttr $ toHeaderRow headrows)
[TableBody nullAttr 0 [] $ map toRow bodyrows]
(TableFoot nullAttr [])
isEntry x = named "entry" x || named "td" x || named "th" x
parseRow = mapM (parseMixed plain . elContent) . filterChildren isEntry
sect n = do isbook <- gets jatsBook
let n' = if isbook || n == 0 then n + 1 else n
labelText <- case filterChild (named "label") e of
Just t -> (<> ("." <> space)) <$>
getInlines t
Nothing -> return mempty
headerText <- case filterChild (named "title") e `mplus`
(filterChild (named "info") e >>=
filterChild (named "title")) of
Just t -> (labelText <>) <$>
getInlines t
Nothing -> return mempty
oldN <- gets jatsSectionLevel
modify $ \st -> st{ jatsSectionLevel = n }
b <- getBlocks e
let ident = attrValue "id" e
modify $ \st -> st{ jatsSectionLevel = oldN }
return $ headerWith (ident,[],[]) n' headerText <> b
getInlines :: PandocMonad m => Element -> JATS m Inlines
getInlines e' = (trimInlines . mconcat) <$>
mapM parseInline (elContent e')
parseMetadata :: PandocMonad m => Element -> JATS m Blocks
parseMetadata e = do
getTitle e
getAuthors e
getAffiliations e
getAbstract e
return mempty
getTitle :: PandocMonad m => Element -> JATS m ()
getTitle e = do
tit <- case filterElement (named "article-title") e of
Just s -> getInlines s
Nothing -> return mempty
subtit <- case filterElement (named "subtitle") e of
Just s -> (text ": " <>) <$>
getInlines s
Nothing -> return mempty
when (tit /= mempty) $ addMeta "title" tit
when (subtit /= mempty) $ addMeta "subtitle" subtit
getAuthors :: PandocMonad m => Element -> JATS m ()
getAuthors e = do
authors <- mapM getContrib $ filterElements
(\x -> named "contrib" x &&
attrValue "contrib-type" x == "author") e
authorNotes <- mapM getInlines $ filterElements (named "author-notes") e
let authors' = case (reverse authors, authorNotes) of
([], _) -> []
(_, []) -> authors
(a:as, ns) -> reverse as ++ [a <> mconcat ns]
unless (null authors) $ addMeta "author" authors'
getAffiliations :: PandocMonad m => Element -> JATS m ()
getAffiliations x = do
affs <- mapM getInlines $ filterChildren (named "aff") x
unless (null affs) $ addMeta "institute" affs
getAbstract :: PandocMonad m => Element -> JATS m ()
getAbstract e =
case filterElement (named "abstract") e of
Just s -> do
blks <- getBlocks s
addMeta "abstract" blks
Nothing -> pure ()
getContrib :: PandocMonad m => Element -> JATS m Inlines
getContrib x = do
given <- maybe (return mempty) getInlines
$ filterElement (named "given-names") x
family <- maybe (return mempty) getInlines
$ filterElement (named "surname") x
if given == mempty && family == mempty
then return mempty
else if given == mempty || family == mempty
then return $ given <> family
else return $ given <> space <> family
parseRefList :: PandocMonad m => Element -> JATS m Blocks
parseRefList e = do
refs <- mapM parseRef $ filterChildren (named "ref") e
addMeta "references" refs
return mempty
parseRef :: PandocMonad m
=> Element -> JATS m (Map.Map Text MetaValue)
parseRef e = do
let refId = text $ attrValue "id" e
let getInlineText n = maybe (return mempty) getInlines . filterChild (named n)
case filterChild (named "element-citation") e of
Just c -> do
let refType = text $
case attrValue "publication-type" c of
"journal" -> "article-journal"
x -> x
(refTitle, refContainerTitle) <- do
t <- getInlineText "article-title" c
ct <- getInlineText "source" c
if t == mempty
then return (ct, mempty)
else return (t, ct)
refLabel <- getInlineText "label" c
refYear <- getInlineText "year" c
refVolume <- getInlineText "volume" c
refFirstPage <- getInlineText "fpage" c
refLastPage <- getInlineText "lpage" c
refPublisher <- getInlineText "publisher-name" c
refPublisherPlace <- getInlineText "publisher-loc" c
let refPages = refFirstPage <> (if refLastPage == mempty
then mempty
else text "\x2013" <> refLastPage)
let personGroups' = filterChildren (named "person-group") c
let getName nm = do
given <- maybe (return mempty) getInlines
$ filterChild (named "given-names") nm
family <- maybe (return mempty) getInlines
$ filterChild (named "surname") nm
return $ toMetaValue $ Map.fromList [
("given" :: Text, given)
, ("family", family)
]
personGroups <- mapM (\pg ->
do names <- mapM getName
(filterChildren (named "name") pg)
return (attrValue "person-group-type" pg,
toMetaValue names))
personGroups'
return $ Map.fromList $
[ ("id" :: Text, toMetaValue refId)
, ("type", toMetaValue refType)
, ("title", toMetaValue refTitle)
, ("container-title", toMetaValue refContainerTitle)
, ("publisher", toMetaValue refPublisher)
, ("publisher-place", toMetaValue refPublisherPlace)
, ("title", toMetaValue refTitle)
, ("issued", toMetaValue
$ Map.fromList [
("year" :: Text, refYear)
])
, ("volume", toMetaValue refVolume)
, ("page", toMetaValue refPages)
, ("citation-label", toMetaValue refLabel)
] ++ personGroups
Nothing -> return $ Map.insert "id" (toMetaValue refId) mempty
findAttrText :: QName -> Element -> Maybe Text
findAttrText x = fmap T.pack . findAttr x
textContent :: Element -> Text
textContent = T.pack . strContent
textContentRecursive :: Element -> Text
textContentRecursive = T.pack . strContentRecursive
strContentRecursive :: Element -> String
strContentRecursive = strContent .
(\e' -> e'{ elContent = map elementToStr $ elContent e' })
elementToStr :: Content -> Content
elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing
elementToStr x = x
parseInline :: PandocMonad m => Content -> JATS m Inlines
parseInline (Text (CData _ s _)) = return $ text $ T.pack s
parseInline (CRef ref) =
return . text . maybe (T.toUpper $ T.pack ref) T.pack $ lookupEntity ref
parseInline (Elem e) =
case qName (elName e) of
"italic" -> emph <$> innerInlines
"bold" -> strong <$> innerInlines
"strike" -> strikeout <$> innerInlines
"sub" -> subscript <$> innerInlines
"sup" -> superscript <$> innerInlines
"underline" -> underline <$> innerInlines
"break" -> return linebreak
"sc" -> smallcaps <$> innerInlines
"code" -> codeWithLang
"monospace" -> codeWithLang
"inline-graphic" -> getGraphic Nothing e
"disp-quote" -> do
qt <- gets jatsQuoteType
let qt' = if qt == SingleQuote then DoubleQuote else SingleQuote
modify $ \st -> st{ jatsQuoteType = qt' }
contents <- innerInlines
modify $ \st -> st{ jatsQuoteType = qt }
return $ if qt == SingleQuote
then singleQuoted contents
else doubleQuoted contents
"xref" -> do
ils <- innerInlines
let rid = attrValue "rid" e
let rids = T.words rid
let refType = ("ref-type",) <$> maybeAttrValue "ref-type" e
let attr = (attrValue "id" e, [], maybeToList refType)
return $ if refType == Just ("ref-type","bibr")
then cite
(map (\id' ->
Citation{ citationId = id'
, citationPrefix = []
, citationSuffix = []
, citationMode = NormalCitation
, citationNoteNum = 0
, citationHash = 0}) rids)
ils
else linkWith attr ("#" <> rid) "" ils
"ext-link" -> do
ils <- innerInlines
let title = fromMaybe "" $ findAttrText (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e
let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of
Just h -> T.pack h
_ -> "#" <> attrValue "rid" e
let ils' = if ils == mempty then str href else ils
let attr = (attrValue "id" e, [], [])
return $ linkWith attr href title ils'
"disp-formula" -> formula displayMath
"inline-formula" -> formula math
"math" | qPrefix (elName e) == Just "mml" -> return . math $ mathML e
"tex-math" -> return . math $ textContent e
"email" -> return $ link ("mailto:" <> textContent e) ""
$ str $ textContent e
"uri" -> return $ link (textContent e) "" $ str $ textContent e
"fn" -> (note . mconcat) <$>
mapM parseBlock (elContent e)
_ -> innerInlines
where innerInlines = (trimInlines . mconcat) <$>
mapM parseInline (elContent e)
mathML x =
case readMathML . T.pack . showElement $ everywhere (mkT removePrefix) x of
Left _ -> mempty
Right m -> writeTeX m
formula constructor = do
let whereToLook = fromMaybe e $ filterElement (named "alternatives") e
texMaths = map textContent $
filterChildren (named "tex-math") whereToLook
mathMLs = map mathML $
filterChildren isMathML whereToLook
return . mconcat . take 1 . map constructor $ texMaths ++ mathMLs
isMathML x = qName (elName x) == "math" &&
qPrefix (elName x) == Just "mml"
removePrefix elname = elname { qPrefix = Nothing }
codeWithLang = do
let classes' = case attrValue "language" e of
"" -> []
l -> [l]
return $ codeWith (attrValue "id" e,classes',[]) $ textContentRecursive e