JATS reader: improve refs parsing.

Handle issn and isbn; use simpler form for issued date.
This commit is contained in:
John MacFarlane 2022-03-28 18:37:04 -07:00
parent 6217fd0976
commit 51c8b059e1

View file

@ -1,3 +1,4 @@
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
@ -21,7 +22,7 @@ import Data.Default
import Data.Generics
import Data.List (foldl', intersperse)
import qualified Data.Map as Map
import Data.Maybe (maybeToList, fromMaybe, mapMaybe)
import Data.Maybe (maybeToList, fromMaybe, catMaybes)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
@ -400,75 +401,66 @@ parseRefList e = do
parseRef :: PandocMonad m
=> Element -> JATS m (Map.Map Text MetaValue)
parseRef e = do
let refId = text $ attrValue "id" e
let getInlineText n = maybe (return mempty) getInlines . filterChild (named n)
case filterChild (named "element-citation") e of
Just c -> do
let refType = text $
case attrValue "publication-type" c of
"journal" -> "article-journal"
x -> x
(refTitle, refContainerTitle) <- do
t <- getInlineText "article-title" c
ct <- getInlineText "source" c
if t == mempty
then return (ct, mempty)
else return (t, ct)
refLabel <- getInlineText "label" c
refYear <- getInlineText "year" c
refVolume <- getInlineText "volume" c
refIssue <- getInlineText "issue" c
refFirstPage <- getInlineText "fpage" c
refLastPage <- getInlineText "lpage" c
refPublisher <- getInlineText "publisher-name" c
refPublisherPlace <- getInlineText "publisher-loc" c
let refPages = refFirstPage <> (if refLastPage == mempty
then mempty
else text "\x2013" <> refLastPage)
let personGroups' = filterChildren (named "person-group") c
let getName nm = do
given <- maybe (return mempty) getInlines
$ filterChild (named "given-names") nm
family <- maybe (return mempty) getInlines
$ filterChild (named "surname") nm
return $ toMetaValue $ Map.fromList [
("given" :: Text, given)
, ("family", family)
]
let extractObjectId e' =
let v = toMetaValue (strContent e')
in case attrValue "pub-id-type" e' of
"doi" -> Just ("DOI", v)
"pmid" -> Just ("PMID", v)
_ -> Nothing
let objectIds = mapMaybe extractObjectId $
(filterChildren (\e' -> named "object-id" e' ||
named "pub-id" e') c)
personGroups <- mapM (\pg ->
do names <- mapM getName
(filterChildren (named "name") pg)
return (attrValue "person-group-type" pg,
toMetaValue names))
personGroups'
return $ Map.fromList $
[ ("id" :: Text, toMetaValue refId)
, ("type", toMetaValue refType)
, ("title", toMetaValue refTitle)
, ("container-title", toMetaValue refContainerTitle)
, ("publisher", toMetaValue refPublisher)
, ("publisher-place", toMetaValue refPublisherPlace)
, ("title", toMetaValue refTitle)
, ("issued", toMetaValue
$ Map.fromList [
("year" :: Text, refYear)
])
, ("volume", toMetaValue refVolume)
, ("issue", toMetaValue refIssue)
, ("page", toMetaValue refPages)
, ("citation-label", toMetaValue refLabel)
] ++ objectIds ++ personGroups
Nothing -> return $ Map.insert "id" (toMetaValue refId) mempty
-- TODO handle mixed-citation
let combineWithDash x y = x <> text "-" <> y
let getName nm = do
given <- maybe (return mempty) getInlines
$ filterChild (named "given-names") nm
family <- maybe (return mempty) getInlines
$ filterChild (named "surname") nm
return $ toMetaValue $ Map.fromList [
("given" :: Text, given)
, ("family", family)
]
let refElement :: PandocMonad m
=> Element -> Element -> JATS m (Maybe (Text, MetaValue))
refElement c el =
case qName (elName el) of
"article-title" -> Just . ("title",) . toMetaValue <$> getInlines el
"source" ->
case filterChild (named "article-title") c of
Just _ -> Just . ("container-title",) . toMetaValue <$> getInlines el
Nothing -> Just . ("title",) . toMetaValue <$> getInlines el
"label" -> Just . ("citation-label",) . toMetaValue <$> getInlines el
"year" -> case filterChild (named "month") c of
Just m -> Just . ("issued",) . toMetaValue <$>
(combineWithDash
<$> getInlines el <*> getInlines m)
Nothing -> Just . ("issued",) . toMetaValue <$> getInlines el
"volume" -> Just . ("volume",) . toMetaValue <$> getInlines el
"issue" -> Just . ("issue",) . toMetaValue <$> getInlines el
"isbn" -> Just . ("ISBN",) . toMetaValue <$> getInlines el
"issn" -> Just . ("ISSN",) . toMetaValue <$> getInlines el
"fpage" ->
case filterChild (named "lpage") c of
Just lp -> Just . ("page",) . toMetaValue <$>
(combineWithDash <$> getInlines el <*> getInlines lp)
Nothing -> Just . ("page-first",) . toMetaValue <$> getInlines el
"publisher-name" -> Just . ("publisher",) . toMetaValue <$> getInlines el
"publisher-loc" -> Just . ("publisher-place",) . toMetaValue
<$> getInlines el
"person-group" -> do names <- mapM getName
(filterChildren (named "name") el)
pure $ Just (attrValue "person-group-type" el,
toMetaValue names)
"pub-id" -> case attrValue "pub-id-type" el of
"doi" -> pure $ Just ("DOI", toMetaValue $ strContent el)
"pmid" -> pure $ Just ("PMID", toMetaValue $ strContent el)
_ -> pure Nothing
"object-id" -> case attrValue "pub-id-type" el of
"doi" -> pure $ Just ("DOI", toMetaValue $ strContent el)
"pmid" -> pure $ Just ("PMID", toMetaValue $ strContent el)
_ -> pure Nothing
_ -> pure Nothing
refVariables <-
case filterChild (named "element-citation") e of
Just c -> (("type", toMetaValue $ case attrValue "publication-type" c of
"journal" -> "article-journal"
x -> x) :) .
catMaybes <$> mapM (refElement c) (elChildren c)
Nothing -> pure [] -- TODO handle mixed-citation
return $ Map.fromList (("id", toMetaValue $ attrValue "id" e) : refVariables)
textContent :: Element -> Text
textContent = strContent