From 51c8b059e1710cc7301b4daa379bb3b66847d00e Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Mon, 28 Mar 2022 18:37:04 -0700
Subject: [PATCH] JATS reader: improve refs parsing.

Handle issn and isbn; use simpler form for issued date.
---
 src/Text/Pandoc/Readers/JATS.hs | 132 +++++++++++++++-----------------
 1 file changed, 62 insertions(+), 70 deletions(-)

diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs
index 516bf2f14..98cf5bdf5 100644
--- a/src/Text/Pandoc/Readers/JATS.hs
+++ b/src/Text/Pandoc/Readers/JATS.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE TupleSections #-}
 {-# LANGUAGE OverloadedStrings #-}
 {- |
@@ -21,7 +22,7 @@ import Data.Default
 import Data.Generics
 import Data.List (foldl', intersperse)
 import qualified Data.Map as Map
-import Data.Maybe (maybeToList, fromMaybe, mapMaybe)
+import Data.Maybe (maybeToList, fromMaybe, catMaybes)
 import Data.Text (Text)
 import qualified Data.Text as T
 import qualified Data.Text.Lazy as TL
@@ -400,75 +401,66 @@ parseRefList e = do
 parseRef :: PandocMonad m
          => Element -> JATS m (Map.Map Text MetaValue)
 parseRef e = do
-  let refId = text $ attrValue "id" e
-  let getInlineText n = maybe (return mempty) getInlines . filterChild (named n)
-  case filterChild (named "element-citation") e of
-       Just c  -> do
-         let refType = text $
-               case attrValue "publication-type" c of
-                  "journal" -> "article-journal"
-                  x -> x
-         (refTitle, refContainerTitle) <- do
-           t <- getInlineText "article-title" c
-           ct <- getInlineText "source" c
-           if t == mempty
-              then return (ct, mempty)
-              else return (t, ct)
-         refLabel <- getInlineText "label" c
-         refYear <- getInlineText "year" c
-         refVolume <- getInlineText "volume" c
-         refIssue <- getInlineText "issue" c
-         refFirstPage <- getInlineText "fpage" c
-         refLastPage <- getInlineText "lpage" c
-         refPublisher <- getInlineText "publisher-name" c
-         refPublisherPlace <- getInlineText "publisher-loc" c
-         let refPages = refFirstPage <> (if refLastPage == mempty
-                                            then mempty
-                                            else text "\x2013" <> refLastPage)
-         let personGroups' = filterChildren (named "person-group") c
-         let getName nm = do
-               given <- maybe (return mempty) getInlines
-                         $ filterChild (named "given-names") nm
-               family <- maybe (return mempty) getInlines
-                         $ filterChild (named "surname") nm
-               return $ toMetaValue $ Map.fromList [
-                   ("given" :: Text, given)
-                 , ("family", family)
-                 ]
-         let extractObjectId e' =
-               let v = toMetaValue (strContent e')
-                in case attrValue "pub-id-type" e' of
-                      "doi"  -> Just ("DOI",  v)
-                      "pmid" -> Just ("PMID", v)
-                      _      -> Nothing
-         let objectIds = mapMaybe extractObjectId $
-                          (filterChildren (\e' -> named "object-id" e' ||
-                                                  named "pub-id" e') c)
-         personGroups <- mapM (\pg ->
-                                do names <- mapM getName
-                                            (filterChildren (named "name") pg)
-                                   return (attrValue "person-group-type" pg,
-                                           toMetaValue names))
-                         personGroups'
-         return $ Map.fromList $
-           [ ("id" :: Text, toMetaValue refId)
-           , ("type", toMetaValue refType)
-           , ("title", toMetaValue refTitle)
-           , ("container-title", toMetaValue refContainerTitle)
-           , ("publisher", toMetaValue refPublisher)
-           , ("publisher-place", toMetaValue refPublisherPlace)
-           , ("title", toMetaValue refTitle)
-           , ("issued", toMetaValue
-                        $ Map.fromList [
-                            ("year" :: Text, refYear)
-                          ])
-           , ("volume", toMetaValue refVolume)
-           , ("issue", toMetaValue refIssue)
-           , ("page", toMetaValue refPages)
-           , ("citation-label", toMetaValue refLabel)
-           ] ++ objectIds ++ personGroups
-       Nothing -> return $ Map.insert "id" (toMetaValue refId) mempty
-       -- TODO handle mixed-citation
+  let combineWithDash x y = x <> text "-" <> y
+  let getName nm = do
+        given <- maybe (return mempty) getInlines
+                  $ filterChild (named "given-names") nm
+        family <- maybe (return mempty) getInlines
+                  $ filterChild (named "surname") nm
+        return $ toMetaValue $ Map.fromList [
+            ("given" :: Text, given)
+          , ("family", family)
+          ]
+  let refElement :: PandocMonad m
+                 => Element -> Element -> JATS m (Maybe (Text, MetaValue))
+      refElement c el =
+        case qName (elName el) of
+          "article-title" -> Just . ("title",) . toMetaValue <$> getInlines el
+          "source" ->
+            case filterChild (named "article-title") c of
+              Just _ -> Just . ("container-title",) . toMetaValue <$> getInlines el
+              Nothing -> Just . ("title",) . toMetaValue <$> getInlines el
+          "label" -> Just . ("citation-label",) . toMetaValue <$> getInlines el
+          "year" -> case filterChild (named "month") c of
+                      Just m -> Just . ("issued",) . toMetaValue <$>
+                                 (combineWithDash
+                                    <$> getInlines el <*> getInlines m)
+                      Nothing -> Just . ("issued",) . toMetaValue <$> getInlines el
+          "volume" -> Just . ("volume",) . toMetaValue <$> getInlines el
+          "issue" -> Just . ("issue",) . toMetaValue <$> getInlines el
+          "isbn" -> Just . ("ISBN",) . toMetaValue <$> getInlines el
+          "issn" -> Just . ("ISSN",) . toMetaValue <$> getInlines el
+          "fpage" ->
+            case filterChild (named "lpage") c of
+              Just lp -> Just . ("page",) . toMetaValue <$>
+                          (combineWithDash <$> getInlines el <*> getInlines lp)
+              Nothing -> Just . ("page-first",) . toMetaValue <$> getInlines el
+          "publisher-name" -> Just . ("publisher",) . toMetaValue <$> getInlines el
+          "publisher-loc" -> Just . ("publisher-place",) . toMetaValue
+                                <$> getInlines el
+          "person-group" -> do names <- mapM getName
+                                            (filterChildren (named "name") el)
+                               pure $ Just (attrValue "person-group-type" el,
+                                            toMetaValue names)
+          "pub-id" -> case attrValue "pub-id-type" el of
+                         "doi"  -> pure $ Just ("DOI",  toMetaValue $ strContent el)
+                         "pmid" -> pure $ Just ("PMID", toMetaValue $ strContent el)
+                         _      -> pure Nothing
+          "object-id" -> case attrValue "pub-id-type" el of
+                         "doi"  -> pure $ Just ("DOI",  toMetaValue $ strContent el)
+                         "pmid" -> pure $ Just ("PMID", toMetaValue $ strContent el)
+                         _      -> pure Nothing
+
+
+          _ -> pure Nothing
+  refVariables <-
+    case filterChild (named "element-citation") e of
+      Just c -> (("type", toMetaValue $ case attrValue "publication-type" c of
+                            "journal" -> "article-journal"
+                            x -> x) :) .
+                  catMaybes <$> mapM (refElement c) (elChildren c)
+      Nothing -> pure []   -- TODO handle mixed-citation
+  return $ Map.fromList (("id", toMetaValue $ attrValue "id" e) : refVariables)
 
 textContent :: Element -> Text
 textContent = strContent