From 34897031f4df5980cd529f82bc7827d4cb468dd0 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Sun, 30 Jan 2022 23:51:47 -0800
Subject: [PATCH] Add endnote XML reader.

New input format: endnotexml

New reader module: Text.Pandoc.Readers.EndNote, exporting
`readEndNoteXML` and `readEndNoteXMLReferences`. [API change]

This reader is still a bit rudimentary, but it should get
be good enough to be helpful.
---
 MANUAL.txt                         |   2 +
 pandoc.cabal                       |   1 +
 src/Text/Pandoc/Citeproc/BibTeX.hs |   3 +-
 src/Text/Pandoc/Readers.hs         |   3 +
 src/Text/Pandoc/Readers/EndNote.hs | 201 +++++++++++++++++++++++++++++
 5 files changed, 209 insertions(+), 1 deletion(-)
 create mode 100644 src/Text/Pandoc/Readers/EndNote.hs

diff --git a/MANUAL.txt b/MANUAL.txt
index f56702899..db8e97840 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -235,6 +235,7 @@ header when requesting a document from a URL:
     - `docbook` ([DocBook])
     - `docx` ([Word docx])
     - `dokuwiki` ([DokuWiki markup])
+    - `endnotexml` ([EndNote XML bibliography])
     - `epub` ([EPUB])
     - `fb2` ([FictionBook2] e-book)
     - `gfm` ([GitHub-Flavored Markdown]),
@@ -504,6 +505,7 @@ header when requesting a document from a URL:
 [BibTeX]: https://ctan.org/pkg/bibtex
 [BibLaTeX]: https://ctan.org/pkg/biblatex
 [Markua]: https://leanpub.com/markua/read
+[EndNote XML bibliography]: https://support.clarivate.com/Endnote/s/article/EndNote-XML-Document-Type-Definition
 
 ## Reader options {.options}
 
diff --git a/pandoc.cabal b/pandoc.cabal
index b4c3bdbe4..4ce391dde 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -545,6 +545,7 @@ library
                    Text.Pandoc.Readers.CommonMark,
                    Text.Pandoc.Readers.Creole,
                    Text.Pandoc.Readers.BibTeX,
+                   Text.Pandoc.Readers.EndNote,
                    Text.Pandoc.Readers.CslJson,
                    Text.Pandoc.Readers.MediaWiki,
                    Text.Pandoc.Readers.Vimwiki,
diff --git a/src/Text/Pandoc/Citeproc/BibTeX.hs b/src/Text/Pandoc/Citeproc/BibTeX.hs
index a8e5622ed..9d6b0b47e 100644
--- a/src/Text/Pandoc/Citeproc/BibTeX.hs
+++ b/src/Text/Pandoc/Citeproc/BibTeX.hs
@@ -19,6 +19,7 @@ module Text.Pandoc.Citeproc.BibTeX
     ( Variant(..)
     , readBibtexString
     , writeBibtexString
+    , toName
     )
     where
 
@@ -1173,7 +1174,7 @@ emptyName =
           , nameStaticOrdering      = False
           }
 
-toName :: Options -> [Inline] -> Bib Name
+toName :: MonadPlus m => Options -> [Inline] -> m Name
 toName _ [Str "others"] =
   return emptyName{ nameLiteral = Just "others" }
 toName _ [Span ("",[],[]) ils] = -- corporate author
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 3e094da60..19b22b041 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -55,6 +55,7 @@ module Text.Pandoc.Readers
   , readCslJson
   , readBibTeX
   , readBibLaTeX
+  , readEndNoteXML
   , readRTF
   -- * Miscellaneous
   , getReader
@@ -103,6 +104,7 @@ import Text.Pandoc.Readers.Man
 import Text.Pandoc.Readers.CSV
 import Text.Pandoc.Readers.CslJson
 import Text.Pandoc.Readers.BibTeX
+import Text.Pandoc.Readers.EndNote
 import Text.Pandoc.Readers.RTF
 import qualified Text.Pandoc.UTF8 as UTF8
 import Text.Pandoc.Sources (ToSources(..), sourcesToText)
@@ -151,6 +153,7 @@ readers = [("native"       , TextReader readNative)
           ,("csljson"      , TextReader readCslJson)
           ,("bibtex"       , TextReader readBibTeX)
           ,("biblatex"     , TextReader readBibLaTeX)
+          ,("endnotexml"   , TextReader readEndNoteXML)
           ,("rtf"          , TextReader readRTF)
            ]
 
diff --git a/src/Text/Pandoc/Readers/EndNote.hs b/src/Text/Pandoc/Readers/EndNote.hs
new file mode 100644
index 000000000..9fe1496be
--- /dev/null
+++ b/src/Text/Pandoc/Readers/EndNote.hs
@@ -0,0 +1,201 @@
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.EndNote
+   Copyright   : Copyright (C) 2022 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha
+   Portability : portable
+
+Parses EndNote XML bibliographies into a Pandoc document
+with empty body and `references` and `nocite` fields
+in the metadata.  A wildcard `nocite` is used so that
+if the document is rendered in another format, the
+entire bibliography will be printed.
+-}
+module Text.Pandoc.Readers.EndNote
+  ( readEndNoteXML
+  , readEndNoteXMLReferences
+  )
+where
+
+import Text.Pandoc.Options
+import Text.Pandoc.Definition
+import Citeproc (Reference(..), ItemId(..), Val(..), Date(..), DateParts(..))
+import Text.Pandoc.Builder as B
+import Text.Pandoc.Error (PandocError(..))
+import Text.Pandoc.Class (PandocMonad)
+import Text.Pandoc.Citeproc.MetaValue (referenceToMetaValue)
+import Text.Pandoc.Sources (Sources(..), ToSources(..), sourcesToText)
+import Text.Pandoc.Citeproc.BibTeX (toName)
+import Control.Applicative ((<|>))
+import Control.Monad.Except (throwError)
+import Control.Monad (mzero)
+import Text.Pandoc.XML.Light
+    ( filterElementName,
+      strContent,
+      QName(qName),
+      Element(..),
+      Content(..),
+      CData(..),
+      filterElementsName,
+      filterChildrenName,
+      findAttrBy,
+      parseXMLElement )
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text as T
+import Data.Text (Text)
+import qualified Data.Map as M
+import Safe (readMay)
+
+-- | Read EndNote XML from an input string and return a Pandoc document.
+-- The document will have only metadata, with an empty body.
+-- The metadata will contain a `references` field with the
+-- bibliography entries, and a `nocite` field with the wildcard `[@*]`.
+readEndNoteXML :: (PandocMonad m, ToSources a)
+               => ReaderOptions -> a -> m Pandoc
+readEndNoteXML _opts inp = do
+  let sources = toSources inp
+  refs <- readEndNoteXMLReferences sources
+  return $ setMeta "references" (map referenceToMetaValue refs) $ B.doc mempty
+
+readEndNoteXMLReferences :: PandocMonad m
+                         => Sources -> m [Reference Inlines]
+readEndNoteXMLReferences sources = do
+  tree <- either (throwError . PandocXMLError "") return $
+              parseXMLElement (TL.fromStrict . sourcesToText $ sources)
+  let records = filterElementsName ((== "record") . qName) tree
+  return $ map recordToReference records
+
+
+recordToReference :: Element -> Reference Inlines
+recordToReference e =
+  Reference{ referenceId = ItemId refid,
+             referenceType = reftype,
+             referenceDisambiguation = Nothing,
+             referenceVariables = refvars }
+
+ where
+   -- get strContent, recursing inside style elements:
+   getText el = getText' (Elem el)
+   getText' (Elem el) =  mconcat $ map getText' $ elContent el
+   getText' (Text cd) = cdData cd
+   getText' (CRef _) = mempty
+     -- mconcat . map cdData . onlyText . elContent
+   name t = (== t) . qName
+   refid = maybe mempty (T.strip . strContent)
+           (filterElementName (name "key") e
+            <|> filterElementName (name "rec-number") e)
+   reftype = maybe "document" toCslReferenceType
+              (filterElementName (name "ref-type") e >>=
+                findAttrBy (name "name"))
+   authors =
+     filterChildrenName (name "contributors") e >>=
+     filterChildrenName (name "authors") >>=
+     filterChildrenName (name "author") >>=
+     toName [] . B.toList . B.text . T.strip . getText
+   titles = do
+     x <- filterChildrenName (name "titles") e
+     (key, name') <- [("title", "title"),
+                      ("container-title", "secondary-title")]
+     (key,) . FancyVal . B.text . T.strip . getText <$>
+                    filterChildrenName (name name') x
+   pages = ("pages",) . FancyVal . B.text. T.strip . getText <$>
+                filterChildrenName (name "pages") e
+   volume = ("volume",) . FancyVal . B.text. T.strip . getText <$>
+                filterChildrenName (name "volume") e
+   number = ("number",) . FancyVal . B.text. T.strip . getText <$>
+                filterChildrenName (name "number") e
+   isbn = ("isbn",) . FancyVal . B.text. T.strip . getText <$>
+                filterChildrenName (name "isbn") e
+   publisher = ("publisher",) . FancyVal . B.text. T.strip . getText <$>
+                filterChildrenName (name "publisher") e
+   originalPublisher =
+     ("original-publisher",) . FancyVal . B.text. T.strip . getText <$>
+                filterChildrenName (name "orig-pub") e
+   publisherPlace =
+     ("publisher-place",) . FancyVal . B.text. T.strip . getText <$>
+                filterChildrenName (name "pub-location") e
+   abstract = ("abstract",) . FancyVal . B.text. T.strip . getText <$>
+                filterChildrenName (name "abstract") e
+   dates = ("issued",) . toDate <$> filterChildrenName (name "dates") e
+   toDate e' = DateVal $
+    Date { dateParts = toDateParts e'
+         , dateCirca = False
+         , dateSeason = Nothing
+         , dateLiteral = Nothing }
+   toDateParts e' = do
+    x <- filterChildrenName (name "year") e'
+    case readMay . T.unpack . T.strip . getText $ x of
+      Nothing -> mzero
+      Just y  -> return $ DateParts [y]
+
+   refvars = M.fromList $
+     [ ("author", NamesVal authors) | not (null authors) ] ++
+     titles ++
+     pages ++
+     volume ++
+     number ++
+     isbn ++
+     dates ++
+     publisher ++
+     originalPublisher ++
+     publisherPlace ++
+     abstract
+
+toCslReferenceType :: Text -> Text
+toCslReferenceType t =
+  case t of
+    "Aggregated Database" -> "dataset"
+    "Ancient Text" -> "classic"
+    "Artwork" -> "graphic"
+    "Audiovisual Material" -> "graphic"
+    "Bill" -> "legislation"
+    "Blog" -> "post-weblog"
+    "Book" -> "book"
+    "Book Section" -> "chapter"
+    "Case" -> "legal_case"
+    "Catalog" -> "document"
+    "Chart or Table" -> "graphic"
+    "Classical Work" -> "classic"
+    "Computer program" -> "software"
+    "Conference Paper" -> "article"
+    "Conference Proceedings" -> "periodical"
+    "Dataset" -> "dataset"
+    "Dictionary" -> "book"
+    "Edited Book" -> "book"
+    "Electronic Article" -> "article"
+    "Electronic Book" -> "book"
+    "Electronic Book Section" -> "chapter"
+    "Encyclopedia" -> "book"
+    "Equation" -> "document"
+    "Figure" -> "graphic"
+    "Film or Broadcast" -> "motion_picture"
+    "Government Document" -> "document"
+    "Grant" -> "document"
+    "Hearing" -> "hearing"
+    "Interview" -> "interview"
+    "Journal Article" -> "article-journal"
+    "Legal Rule or Regulation" -> "regulation"
+    "Magazine Article" -> "article-magazine"
+    "Manuscript" -> "manuscript"
+    "Map" -> "map"
+    "Music" -> "musical_score"
+    "Newspaper Article" -> "article-newspaper"
+    "Online Database" -> "dataset"
+    "Online Multimedia" -> "webpage"
+    "Pamphlet" -> "pamphlet"
+    "Patent" -> "patent"
+    "Personal Communication" -> "personal_communication"
+    "Podcast" -> "document"
+    "Press Release" -> "report"
+    "Report" -> "report"
+    "Serial" -> "periodical"
+    "Standard" -> "standard"
+    "Statute" -> "legislation"
+    "Thesis" -> "thesis"
+    "Unpublished Work" -> "unpublished"
+    "Web Page" -> "webpage"
+    _ -> "document"