From 3da5440858c7542d6405f4a579071bc7c2ea3a11 Mon Sep 17 00:00:00 2001
From: John MacFarlane
Date: Fri, 4 Feb 2022 23:41:59 -0800
Subject: [PATCH] Add RIS bibliography format reader.

New module, Text.Pandoc.Readers.RIS, exporting readRIS.
New input format `ris`.

Closes #7894.
---
 MANUAL.txt                              |   2 +
 pandoc.cabal                            |   1 +
 src/Text/Pandoc/App/FormatHeuristics.hs |   1 +
 src/Text/Pandoc/Readers.hs              |   3 +
 src/Text/Pandoc/Readers/RIS.hs          | 270 ++++++++++++++++++++++++
 test/command/7894.md                    | 108 ++++++++++
 6 files changed, 385 insertions(+)
 create mode 100644 src/Text/Pandoc/Readers/RIS.hs
 create mode 100644 test/command/7894.md

diff --git a/MANUAL.txt b/MANUAL.txt
index a9a6ab1ee..b06a618e7 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -260,6 +260,7 @@ header when requesting a document from a URL:
     - `odt` ([ODT])
     - `opml` ([OPML])
     - `org` ([Emacs Org mode])
+    - `ris` ([RIS] bibliography)
     - `rtf` ([Rich Text Format])
     - `rst` ([reStructuredText])
     - `t2t` ([txt2tags])
@@ -487,6 +488,7 @@ header when requesting a document from a URL:
 [roff ms]: https://man.cx/groff_ms(7)
 [Haskell]: https://www.haskell.org
 [GNU Texinfo]: https://www.gnu.org/software/texinfo/
+[RIS]: https://en.wikipedia.org/wiki/RIS_(file_format)
 [Emacs Org mode]: https://orgmode.org
 [AsciiDoc]: https://www.methods.co.nz/asciidoc/
 [AsciiDoctor]: https://asciidoctor.org/
diff --git a/pandoc.cabal b/pandoc.cabal
index 66df92a17..35cc543ca 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -547,6 +547,7 @@ library
                    Text.Pandoc.Readers.Creole,
                    Text.Pandoc.Readers.BibTeX,
                    Text.Pandoc.Readers.EndNote,
+                   Text.Pandoc.Readers.RIS,
                    Text.Pandoc.Readers.CslJson,
                    Text.Pandoc.Readers.MediaWiki,
                    Text.Pandoc.Readers.Vimwiki,
diff --git a/src/Text/Pandoc/App/FormatHeuristics.hs b/src/Text/Pandoc/App/FormatHeuristics.hs
index 2bcdde484..ebf8db4c5 100644
--- a/src/Text/Pandoc/App/FormatHeuristics.hs
+++ b/src/Text/Pandoc/App/FormatHeuristics.hs
@@ -69,6 +69,7 @@ formatFromFilePath x =
     ".org"      -> Just "org"
     ".pdf"      -> Just "pdf"  -- so we get an "unknown reader" error
     ".pptx"     -> Just "pptx"
+    ".ris"      -> Just "ris"
     ".roff"     -> Just "ms"
     ".rst"      -> Just "rst"
     ".rtf"      -> Just "rtf"
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 19b22b041..95f5f5b61 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -56,6 +56,7 @@ module Text.Pandoc.Readers
   , readBibTeX
   , readBibLaTeX
   , readEndNoteXML
+  , readRIS
   , readRTF
   -- * Miscellaneous
   , getReader
@@ -105,6 +106,7 @@ import Text.Pandoc.Readers.CSV
 import Text.Pandoc.Readers.CslJson
 import Text.Pandoc.Readers.BibTeX
 import Text.Pandoc.Readers.EndNote
+import Text.Pandoc.Readers.RIS
 import Text.Pandoc.Readers.RTF
 import qualified Text.Pandoc.UTF8 as UTF8
 import Text.Pandoc.Sources (ToSources(..), sourcesToText)
@@ -154,6 +156,7 @@ readers = [("native"       , TextReader readNative)
           ,("bibtex"       , TextReader readBibTeX)
           ,("biblatex"     , TextReader readBibLaTeX)
           ,("endnotexml"   , TextReader readEndNoteXML)
+          ,("ris"          , TextReader readRIS)
           ,("rtf"          , TextReader readRTF)
            ]
 
diff --git a/src/Text/Pandoc/Readers/RIS.hs b/src/Text/Pandoc/Readers/RIS.hs
new file mode 100644
index 000000000..d81b265d1
--- /dev/null
+++ b/src/Text/Pandoc/Readers/RIS.hs
@@ -0,0 +1,270 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.RIS
+   Copyright   : Copyright (C) 2022 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane
+   Stability   : alpha
+   Portability : portable
+
+Parses RIS bibliographies into a Pandoc document
+with empty body and `references` and `nocite` fields
+in the metadata. A wildcard `nocite` is used so that
+if the document is rendered in another format, the
+entire bibliography will be printed.
+-}
+module Text.Pandoc.Readers.RIS
+  ( readRIS
+  )
+where
+
+import Text.Pandoc.Options
+import Text.Pandoc.Definition
+import Text.Pandoc.Parsing
+import Data.Char (isAsciiUpper, isDigit, isSpace, ord, chr)
+import Data.List (foldl')
+import Citeproc (Reference(..), ItemId(..), Val(..), Date(..), DateParts(..),
+                 toVariable)
+import Text.Pandoc.Builder as B
+import Text.Pandoc.Class (PandocMonad)
+import Text.Pandoc.Citeproc.MetaValue (referenceToMetaValue)
+import Text.Pandoc.Citeproc.BibTeX (toName)
+import Control.Monad.Except (throwError)
+import qualified Data.Text as T
+import Data.Text (Text)
+import Data.Maybe (fromMaybe)
+import qualified Data.Map as M
+import Safe (readMay)
+
+-- | Read RIS from an input string and return a Pandoc document.
+-- The document will have only metadata, with an empty body.
+-- The metadata will contain a `references` field with the
+-- bibliography entries, and a `nocite` field with the wildcard `[@*]`.
+readRIS :: (PandocMonad m, ToSources a)
+        => ReaderOptions -> a -> m Pandoc
+readRIS _opts inp = do
+  parsed <- readWithM risReferences () inp
+  case parsed of
+    Right refs -> do
+      refs' <- mapM (traverse (return . text)) refs
+      return $
+        setMeta "nocite" (cite [Citation {citationId = "*"
+                                         , citationPrefix = []
+                                         , citationSuffix = []
+                                         , citationMode = NormalCitation
+                                         , citationNoteNum = 0
+                                         , citationHash = 0}] (str "[@*]")) $
+        setMeta "references" (map referenceToMetaValue refs') $
+        B.doc mempty
+    Left e -> throwError e
+
+-- | Parser over 'Sources' with no user state, used for all RIS parsing.
+type RISParser m = ParserT Sources () m
+
+-- | Parse one tagged line: a two-character tag (ASCII uppercase letters
+-- or digits), whitespace, a hyphen, whitespace, and the rest of the line.
+-- Returns the tag and the value with surrounding whitespace stripped.
+risLine :: PandocMonad m => RISParser m (Text, Text)
+risLine = do
+  key <- T.pack <$> count 2 (satisfy (\c -> isAsciiUpper c || isDigit c))
+  _ <- many1 spaceChar
+  char '-'
+  _ <- many1 spaceChar
+  val <- anyLine
+  return (key, T.strip val)
+
+-- | Parse the @ER@ line that terminates a record.
+risSeparator :: PandocMonad m => RISParser m ()
+risSeparator = do
+  try $ string "ER"
+  _ <- many1 spaceChar
+  char '-'
+  _ <- anyLine
+  return ()
+
+-- | Parse one record: tagged lines up to (and consuming) the @ER@ line.
+risRecord :: PandocMonad m => RISParser m [(Text, Text)]
+risRecord = manyTill risLine risSeparator
+
+-- | Convert the tagged lines of one record into a citeproc 'Reference'.
+-- Lines are applied in file order, so repeated authors, editors and
+-- keywords keep their order, and @BT@ can see the type already set by
+-- an earlier @TY@ line.  For single-valued fields a later line replaces
+-- an earlier one.  Unrecognized tags are ignored.  The id is built from
+-- the first word of each author name and the year(s), joined with @_@.
+risRecordToReference :: [(Text, Text)] -> Reference Text
+risRecordToReference keys = addId $ foldl' (flip go) defref keys
+ where
+   go (key, val) =
+     case key of
+       "TY" -> \ref -> ref{ referenceType =
+                   fromMaybe "misc" (M.lookup val risTypes) }
+       "VL" -> addVar "volume" val
+       "KW" -> \ref ->
+         ref{ referenceVariables =
+               M.alter (\x -> case x of
+                           Nothing -> Just $ TextVal val
+                           Just (TextVal kws)
+                                   -> Just (TextVal (kws <> ", " <> val))
+                           _ -> x)
+               "keyword"
+               (referenceVariables ref) }
+       "PB" -> addVar "publisher" val
+       "PP" -> addVar "publisher-place" val
+       "SP" -> \ref ->
+         case M.lookup "page" (referenceVariables ref) of
+           Nothing -> addVar "page" val ref
+           -- an earlier EP line leaves "-<end>": put the start page first
+           Just (FancyVal eg) -> addVar "page" (val <> eg) ref
+           _ -> ref
+       "EP" -> \ref ->
+         case M.lookup "page" (referenceVariables ref) of
+           Nothing -> addVar "page" ("-" <> val) ref
+           -- an earlier SP line leaves the start page: append the end page
+           Just (FancyVal eg) -> addVar "page" (eg <> "-" <> val) ref
+           _ -> ref
+       "AU" -> addName "author" val
+       "A1" -> addName "author" val
+       "ED" -> addName "editor" val
+       "A2" -> addName "editor" val
+       "TI" -> addVar "title" val
+       "T1" -> addVar "title" val
+       "CT" -> addVar "title" val
+       "BT" -> \ref ->
+         if referenceType ref == "book"
+            then addVar "title" val ref
+            else addVar "container-title" val ref
+       "JO" -> addVar "container-title" val
+       "JF" -> addVar "container-title" val
+       "T2" -> addVar "container-title" val
+       "ET" -> addVar "edition" val
+       "NV" -> addVar "number-of-volumes" val
+       "AB" -> addVar "abstract" val
+       "PY" -> addYear "issued" val
+       "Y1" -> addYear "issued" val
+       "IS" -> addVar "issue" val
+       "SN" -> addVar "ISSN" val
+       "LA" -> addVar "language" val
+       "UR" -> addVar "url" val
+       "LK" -> addVar "url" val
+       _ -> id -- TODO
+   addVar k v r = r{ referenceVariables =
+                       M.insert (toVariable k) (FancyVal v)
+                       (referenceVariables r) }
+   addName k v r =
+     let new = toName [] . B.toList . B.text $ v
+         f Nothing   = Just (NamesVal new)
+         f (Just (NamesVal ns)) = Just (NamesVal (ns ++ new))
+         f (Just x) = Just x
+     in  r{ referenceVariables =
+              M.alter f k (referenceVariables r) }
+   addYear k v r =
+     let d = DateVal $
+              case readMay (T.unpack v) of
+                Nothing ->
+                  Date { dateParts = []
+                       , dateCirca = False
+                       , dateSeason = Nothing
+                       , dateLiteral = Just v }
+                Just y ->
+                  Date { dateParts = [DateParts [y]]
+                       , dateCirca = False
+                       , dateSeason = Nothing
+                       , dateLiteral = Nothing }
+     in r{ referenceVariables = M.insert k d (referenceVariables r) }
+
+   defref = Reference{
+       referenceId = mempty
+     , referenceType = mempty
+     , referenceDisambiguation = Nothing
+     , referenceVariables = mempty }
+   addId rec = rec{ referenceId = ItemId (authors <> pubdate) }
+   authors = T.intercalate "_" $
+              [T.takeWhile (\c -> c /= ',' && not (isSpace c)) n
+                | (k, n) <- keys, k == "AU" || k == "A1"]
+   pubdate = mconcat ["_" <> d | (k, d) <- keys, k == "PY" || k == "Y1"]
+
+-- | Parse zero or more records followed by optional whitespace and end
+-- of input, convert them to references, and disambiguate duplicate ids.
+risReferences :: PandocMonad m => RISParser m [Reference Text]
+risReferences = do
+  recs <- many risRecord
+  spaces
+  eof
+  return $ fixDuplicateIds $ map risRecordToReference recs
+
+-- | Make ids unique.  The first reference with a given id keeps it;
+-- later ones with the same id get a letter appended: @a@ for the first
+-- duplicate, @b@ for the next, and so on.
+fixDuplicateIds :: [Reference Text] -> [Reference Text]
+fixDuplicateIds = reverse . snd . foldl' go (mempty, [])
+ where
+   go (ids_seen, refs) ref =
+     case M.lookup (referenceId ref) ids_seen of
+       Nothing -> (M.insert (referenceId ref) (ord 'a') ids_seen, ref:refs)
+       Just n -> (M.insert (referenceId ref) (n+1) ids_seen,
+                  ref{ referenceId =
+                       ItemId . (<> T.singleton (chr n)) . unItemId $
+                       referenceId ref }
+                  : refs)
+
+-- | Mapping from RIS reference types (@TY@ values) to CSL types.
+risTypes :: M.Map Text Text
+risTypes = M.fromList
+  [ ("ABST", "article")
+  , ("ADVS", "motion_picture")
+  , ("AGGR", "dataset")
+  , ("ANCIENT", "book")
+  , ("ART", "graphic")
+  , ("BILL", "bill")
+  , ("BLOG", "post-weblog")
+  , ("BOOK", "book")
+  , ("CASE", "legal_case")
+  , ("CHAP", "chapter")
+  , ("CHART", "graphic")
+  , ("CLSWK", "book")
+  , ("COMP", "program")
+  , ("CONF", "paper-conference")
+  , ("CPAPER", "paper-conference")
+  , ("CTLG", "catalog")
+  , ("DATA", "dataset")
+  , ("DBASE", "dataset")
+  , ("DICT", "book")
+  , ("EBOOK", "book")
+  , ("ECHAP", "chapter")
+  , ("EDBOOK", "book")
+  , ("EJOUR", "article")
+  , ("WEB", "webpage")
+  , ("ENCYC", "entry-encyclopedia")
+  , ("EQUA", "figure")
+  , ("FIGURE", "figure")
+  , ("GEN", "entry")
+  , ("GOVDOC", "report")
+  , ("GRANT", "report")
+  , ("HEAR", "report")
+  , ("ICOMM", "personal_communication")
+  , ("INPR", "article-journal")
+  , ("JFULL", "article-journal")
+  , ("JOUR", "article-journal")
+  , ("LEGAL", "legal_case")
+  , ("MANSCPT", "manuscript")
+  , ("MAP", "map")
+  , ("MGZN", "article-magazine")
+  , ("MPCT", "motion_picture")
+  , ("MULTI", "webpage")
+  , ("MUSIC", "musical_score")
+  , ("NEWS", "article-newspaper")
+  , ("PAMP", "pamphlet")
+  , ("PAT", "patent")
+  , ("PCOMM", "personal_communication")
+  , ("RPRT", "report")
+  , ("SER", "article")
+  , ("SLIDE", "graphic")
+  , ("SOUND", "musical_score")
+  , ("STAND", "report")
+  , ("STAT", "legislation")
+  , ("THES", "thesis")
+  , ("UNBILL", "bill")
+  , ("UNPB", "unpublished")
+  , ("VIDEO", "graphic") ]
diff --git a/test/command/7894.md b/test/command/7894.md
new file mode 100644
index 000000000..f9c7c1e38
--- /dev/null
+++ b/test/command/7894.md
@@ -0,0 +1,108 @@
+```
+% pandoc -f ris -t csljson
+TY  - BOOK
+AU  - Chang, C. C.
+AU  - Keisler, H. Jerome
+PY  - 1990
+ET  - 3
+TI  - Model Theory
+PU  - North-Holland Press
+PP  - Amsterdam
+KW  - model theory
+KW  - logic
+ER  -
+TY  - JOUR
+AU  - Shannon, Claude E.
+PY  - 1948
+DA  - July
+TI  - A Mathematical Theory of Communication
+T2  - Bell System Technical Journal
+SP  - 379
+EP  - 423
+VL  - 27
+ER  -
+TY  - JOUR
+T1  - On computable numbers, with an application to the Entscheidungsproblem
+A1  - Turing, Alan Mathison
+JO  - Proc. of London Mathematical Society
+VL  - 47
+IS  - 1
+KW  - decidability
+KW  - computability
+SP  - 230
+EP  - 265
+Y1  - 1937
+ER  -
+^D
+[
+  {
+    "author": [
+      {
+        "family": "Chang",
+        "given": "C. C."
+      },
+      {
+        "family": "Keisler",
+        "given": "H. Jerome"
+      }
+    ],
+    "edition": "3",
+    "id": "Chang_Keisler_1990",
+    "issued": {
+      "date-parts": [
+        [
+          1990
+        ]
+      ]
+    },
+    "keyword": "model theory, logic",
+    "publisher-place": "Amsterdam",
+    "title": "Model Theory",
+    "type": "book"
+  },
+  {
+    "author": [
+      {
+        "family": "Shannon",
+        "given": "Claude E."
+      }
+    ],
+    "container-title": "Bell System Technical Journal",
+    "id": "Shannon_1948",
+    "issued": {
+      "date-parts": [
+        [
+          1948
+        ]
+      ]
+    },
+    "page": "379-423",
+    "title": "A Mathematical Theory of Communication",
+    "type": "article-journal",
+    "volume": "27"
+  },
+  {
+    "author": [
+      {
+        "family": "Turing",
+        "given": "Alan Mathison"
+      }
+    ],
+    "container-title": "Proc. of London Mathematical Society",
+    "id": "Turing_1937",
+    "issue": "1",
+    "issued": {
+      "date-parts": [
+        [
+          1937
+        ]
+      ]
+    },
+    "keyword": "decidability, computability",
+    "page": "230-265",
+    "title": "On computable numbers, with an application to the Entscheidungsproblem",
+    "type": "article-journal",
+    "volume": "47"
+  }
+]
+```