Add RIS bibliography format reader.

New module, Text.Pandoc.Readers.RIS, exporting readRIS.

New input format `ris`.

Closes #7894.
This commit is contained in:
John MacFarlane 2022-02-04 23:41:59 -08:00
parent 2fca57522b
commit 3da5440858
6 changed files with 365 additions and 0 deletions

View file

@ -260,6 +260,7 @@ header when requesting a document from a URL:
- `odt` ([ODT]) - `odt` ([ODT])
- `opml` ([OPML]) - `opml` ([OPML])
- `org` ([Emacs Org mode]) - `org` ([Emacs Org mode])
- `ris` ([RIS] bibliography)
- `rtf` ([Rich Text Format]) - `rtf` ([Rich Text Format])
- `rst` ([reStructuredText]) - `rst` ([reStructuredText])
- `t2t` ([txt2tags]) - `t2t` ([txt2tags])
@ -487,6 +488,7 @@ header when requesting a document from a URL:
[roff ms]: https://man.cx/groff_ms(7) [roff ms]: https://man.cx/groff_ms(7)
[Haskell]: https://www.haskell.org [Haskell]: https://www.haskell.org
[GNU Texinfo]: https://www.gnu.org/software/texinfo/ [GNU Texinfo]: https://www.gnu.org/software/texinfo/
[RIS]: https://en.wikipedia.org/wiki/RIS_(file_format)
[Emacs Org mode]: https://orgmode.org [Emacs Org mode]: https://orgmode.org
[AsciiDoc]: https://www.methods.co.nz/asciidoc/ [AsciiDoc]: https://www.methods.co.nz/asciidoc/
[AsciiDoctor]: https://asciidoctor.org/ [AsciiDoctor]: https://asciidoctor.org/

View file

@ -547,6 +547,7 @@ library
Text.Pandoc.Readers.Creole, Text.Pandoc.Readers.Creole,
Text.Pandoc.Readers.BibTeX, Text.Pandoc.Readers.BibTeX,
Text.Pandoc.Readers.EndNote, Text.Pandoc.Readers.EndNote,
Text.Pandoc.Readers.RIS,
Text.Pandoc.Readers.CslJson, Text.Pandoc.Readers.CslJson,
Text.Pandoc.Readers.MediaWiki, Text.Pandoc.Readers.MediaWiki,
Text.Pandoc.Readers.Vimwiki, Text.Pandoc.Readers.Vimwiki,

View file

@ -69,6 +69,7 @@ formatFromFilePath x =
".org" -> Just "org" ".org" -> Just "org"
".pdf" -> Just "pdf" -- so we get an "unknown reader" error ".pdf" -> Just "pdf" -- so we get an "unknown reader" error
".pptx" -> Just "pptx" ".pptx" -> Just "pptx"
".ris" -> Just "ris"
".roff" -> Just "ms" ".roff" -> Just "ms"
".rst" -> Just "rst" ".rst" -> Just "rst"
".rtf" -> Just "rtf" ".rtf" -> Just "rtf"

View file

@ -56,6 +56,7 @@ module Text.Pandoc.Readers
, readBibTeX , readBibTeX
, readBibLaTeX , readBibLaTeX
, readEndNoteXML , readEndNoteXML
, readRIS
, readRTF , readRTF
-- * Miscellaneous -- * Miscellaneous
, getReader , getReader
@ -105,6 +106,7 @@ import Text.Pandoc.Readers.CSV
import Text.Pandoc.Readers.CslJson import Text.Pandoc.Readers.CslJson
import Text.Pandoc.Readers.BibTeX import Text.Pandoc.Readers.BibTeX
import Text.Pandoc.Readers.EndNote import Text.Pandoc.Readers.EndNote
import Text.Pandoc.Readers.RIS
import Text.Pandoc.Readers.RTF import Text.Pandoc.Readers.RTF
import qualified Text.Pandoc.UTF8 as UTF8 import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.Sources (ToSources(..), sourcesToText) import Text.Pandoc.Sources (ToSources(..), sourcesToText)
@ -154,6 +156,7 @@ readers = [("native" , TextReader readNative)
,("bibtex" , TextReader readBibTeX) ,("bibtex" , TextReader readBibTeX)
,("biblatex" , TextReader readBibLaTeX) ,("biblatex" , TextReader readBibLaTeX)
,("endnotexml" , TextReader readEndNoteXML) ,("endnotexml" , TextReader readEndNoteXML)
,("ris" , TextReader readRIS)
,("rtf" , TextReader readRTF) ,("rtf" , TextReader readRTF)
] ]

View file

@ -0,0 +1,250 @@
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.RIS
Copyright : Copyright (C) 2022 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Parses RIS bibliographies into a Pandoc document
with empty body and `references` and `nocite` fields
in the metadata. A wildcard `nocite` is used so that
if the document is rendered in another format, the
entire bibliography will be printed.
-}
module Text.Pandoc.Readers.RIS
( readRIS
)
where
import Text.Pandoc.Options
import Text.Pandoc.Definition
import Text.Pandoc.Parsing
import Data.Char (isAsciiUpper, isDigit, isSpace, ord, chr)
import Data.List (foldl')
import Citeproc (Reference(..), ItemId(..), Val(..), Date(..), DateParts(..),
toVariable)
import Text.Pandoc.Builder as B
import Text.Pandoc.Class (PandocMonad)
import Text.Pandoc.Citeproc.MetaValue (referenceToMetaValue)
import Text.Pandoc.Citeproc.BibTeX (toName)
import Control.Monad.Except (throwError)
import qualified Data.Text as T
import Data.Text (Text)
import Data.Maybe (fromMaybe)
import qualified Data.Map as M
import Safe (readMay)
-- | Parse an RIS bibliography and wrap it in a Pandoc document.
-- The resulting document has an empty body.  Its metadata carries a
-- `references` field with one entry per RIS record and a `nocite`
-- field holding the wildcard citation `[@*]`.  A parse failure is
-- rethrown with 'throwError'.
readRIS :: (PandocMonad m, ToSources a)
        => ReaderOptions -> a -> m Pandoc
readRIS _opts inp =
  readWithM risReferences () inp >>= either throwError (return . toDocument)
 where
  -- Metadata-only document built from the parsed references; the
  -- textual fields of each reference are converted with 'text' first.
  toDocument refs =
    setMeta "nocite" (cite [wildcard] (str "[@*]")) $
    setMeta "references" (map (referenceToMetaValue . fmap text) refs) $
    B.doc mempty
  -- Citation of the special id "*".
  wildcard = Citation
    { citationId      = "*"
    , citationPrefix  = []
    , citationSuffix  = []
    , citationMode    = NormalCitation
    , citationNoteNum = 0
    , citationHash    = 0
    }
-- | Parser type used throughout this reader: a 'ParserT' over 'Sources'
-- input with @()@ in the state position, running in the monad @m@.
type RISParser m = ParserT Sources () m
-- | Parse a single RIS field line of the form @XX  - value@.
--
-- The tag is exactly two characters, each an ASCII upper-case letter or
-- a digit.  It is followed by one or more spaces or tabs, a hyphen, and
-- the rest of the line.  Returns the tag together with the value,
-- stripped of surrounding whitespace.
risLine :: PandocMonad m => RISParser m (Text, Text)
risLine = do
  key <- T.pack <$> count 2 (satisfy (\c -> isAsciiUpper c || isDigit c))
  _ <- many1 spaceChar
  _ <- char '-'
  -- Whitespace after the hyphen is optional so that a field with an
  -- empty value (the hyphen immediately followed by the line end) is
  -- accepted instead of aborting the whole parse.
  _ <- many spaceChar
  val <- anyLine
  return (key, T.strip val)
-- | Parse the line that terminates a record: the tag @ER@, one or more
-- spaces or tabs, a hyphen, and whatever else is on that line.
-- Only the match on @ER@ is wrapped in 'try'.
risSeparator :: PandocMonad m => RISParser m ()
risSeparator =
  try (string "ER") *> many1 spaceChar *> char '-' *> anyLine *> pure ()
-- | Parse one record: zero or more field lines followed by the
-- separator line.  The separator is consumed but not returned.
risRecord :: PandocMonad m => RISParser m [(Text, Text)]
risRecord = risLine `manyTill` risSeparator
-- | Convert the (tag, value) pairs of one RIS record into a 'Reference'.
--
-- The pairs are folded from the last line of the record to the first.
-- For single-valued variables this means the earliest occurrence in the
-- record wins.  Repeated fields (authors, editors, keywords) are
-- accumulated by prepending, so the result lists them in the order in
-- which they appear in the record.  Tags without a handler are ignored.
--
-- The reference id is assembled from the leading part of each author
-- name (up to the first comma or whitespace) joined with underscores,
-- followed by @_@ and the value of every PY/Y1 field.
risRecordToReference :: [(Text, Text)] -> Reference Text
risRecordToReference keys = addId $ foldr go defref keys
 where
  -- Reference type of the record, taken from its first TY line.  It is
  -- computed up front because the fold reaches a leading TY line last,
  -- so handlers such as BT cannot rely on 'referenceType' being set.
  recordType =
    case [v | (k, v) <- keys, k == "TY"] of
      (v:_) -> fromMaybe "misc" (M.lookup v risTypes)
      []    -> mempty

  go (key, val) =
    case key of
      "TY" -> \ref -> ref{ referenceType =
                             fromMaybe "misc" (M.lookup val risTypes) }
      "VL" -> addVar "volume" val
      "KW" -> \ref ->
        ref{ referenceVariables =
               M.alter (\x -> case x of
                          Nothing -> Just $ TextVal val
                          -- Already-stored keywords come from later
                          -- lines, so this one goes in front.
                          Just (TextVal kws)
                            -> Just (TextVal (val <> ", " <> kws))
                          _ -> x)
                       "keyword"
                       (referenceVariables ref) }
      "PB" -> addVar "publisher" val
      "PP" -> addVar "publisher-place" val
      "SP" -> \ref ->
        case M.lookup "page" (referenceVariables ref) of
          Nothing -> addVar "page" val ref
          -- An EP line seen earlier in the fold left "-<end>" behind.
          Just (FancyVal eg) -> addVar "page" (val <> eg) ref
          _ -> ref
      "EP" -> \ref ->
        case M.lookup "page" (referenceVariables ref) of
          Nothing -> addVar "page" ("-" <> val) ref
          -- The stored value is the start page; the end page follows it.
          Just (FancyVal eg) -> addVar "page" (eg <> "-" <> val) ref
          _ -> ref
      "AU" -> addName "author" val
      "A1" -> addName "author" val
      "ED" -> addName "editor" val
      "A2" -> addName "editor" val
      "TI" -> addVar "title" val
      "T1" -> addVar "title" val
      "CT" -> addVar "title" val
      "BT" -> \ref ->
        if recordType == "book"
           then addVar "title" val ref
           else addVar "container-title" val ref
      "JO" -> addVar "container-title" val
      "JF" -> addVar "container-title" val
      "T2" -> addVar "container-title" val
      "ET" -> addVar "edition" val
      "NV" -> addVar "number-of-volumes" val
      "AB" -> addVar "abstract" val
      "PY" -> addYear "issued" val
      "Y1" -> addYear "issued" val
      "IS" -> addVar "issue" val
      "SN" -> addVar "ISSN" val
      "LA" -> addVar "language" val
      "UR" -> addVar "url" val
      "LK" -> addVar "url" val
      _    -> id -- TODO

  -- Store a textual variable, replacing any previous value.
  addVar k v r = r{ referenceVariables =
                      M.insert (toVariable k) (FancyVal v)
                               (referenceVariables r) }

  -- Add a name to a names variable.  Names already stored come from
  -- later lines of the record, so the new one is placed in front of
  -- them to preserve the order of the input.
  addName k v r =
    let new = toName [] . B.toList . B.text $ v
        f Nothing              = Just (NamesVal new)
        f (Just (NamesVal ns)) = Just (NamesVal (new ++ ns))
        f (Just x)             = Just x
     in r{ referenceVariables =
             M.alter f k (referenceVariables r) }

  -- Store a date: a single year when the value reads as a number,
  -- otherwise the raw text as a literal date.
  addYear k v r =
    let d = DateVal $
              case readMay (T.unpack v) of
                Nothing ->
                  Date { dateParts   = []
                       , dateCirca   = False
                       , dateSeason  = Nothing
                       , dateLiteral = Just v }
                Just y ->
                  Date { dateParts   = [DateParts [y]]
                       , dateCirca   = False
                       , dateSeason  = Nothing
                       , dateLiteral = Nothing }
     in r{ referenceVariables = M.insert k d (referenceVariables r) }

  defref = Reference
    { referenceId             = mempty
    , referenceType           = mempty
    , referenceDisambiguation = Nothing
    , referenceVariables      = mempty }

  addId ref = ref{ referenceId = ItemId (authors <> pubdate) }

  authors = T.intercalate "_"
    [ T.takeWhile (\c -> c /= ',' && not (isSpace c)) n
    | (k, n) <- keys, k == "AU" || k == "A1" ]

  pubdate = mconcat ["_" <> d | (k, d) <- keys, k == "PY" || k == "Y1"]
-- | Parse a whole RIS file into a list of references.
--
-- Whitespace (including blank lines) before the first record, between
-- records, and after the last record is skipped; anything else left
-- over before end of input is a parse error.  Each record is converted
-- with 'risRecordToReference', and identical ids are then made unique
-- with 'fixDuplicateIds'.
risReferences :: PandocMonad m => RISParser m [Reference Text]
risReferences = do
  spaces
  -- A successful 'risRecord' always consumes its separator line, so
  -- 'many' cannot loop on empty input here.
  recs <- many (risRecord <* spaces)
  eof
  return $ fixDuplicateIds $ map risRecordToReference recs
-- | Make reference ids unique while keeping the references in order.
-- The first reference with a given id is left untouched.  Each later
-- reference with that same original id gets a one-character suffix:
-- @a@ for the second occurrence, then the following characters in
-- code-point order.
fixDuplicateIds :: [Reference Text] -> [Reference Text]
fixDuplicateIds = walk mempty
 where
  -- 'nextSuffix' maps each original id to the code point of the suffix
  -- that its next duplicate will receive.
  walk _ [] = []
  walk nextSuffix (ref : rest) =
    let rid = referenceId ref
     in case M.lookup rid nextSuffix of
          Nothing ->
            ref : walk (M.insert rid (ord 'a') nextSuffix) rest
          Just n ->
            ref{ referenceId = ItemId (unItemId rid <> T.singleton (chr n)) }
              : walk (M.insert rid (n + 1) nextSuffix) rest
-- | Mapping from RIS reference types (the value of the @TY@ field) to
-- the reference type names stored in 'referenceType'.
--
-- @ELEC@ is the standard RIS tag for a web page; @WEB@ is kept
-- alongside it for compatibility.  Film and video material uses the
-- CSL spelling @motion_picture@ (with an underscore).
risTypes :: M.Map Text Text
risTypes = M.fromList
  [ ("ABST", "article")
  , ("ADVS", "motion_picture")
  , ("AGGR", "dataset")
  , ("ANCIENT", "book")
  , ("ART", "graphic")
  , ("BILL", "bill")
  , ("BLOG", "post-weblog")
  , ("BOOK", "book")
  , ("CASE", "legal_case")
  , ("CHAP", "chapter")
  , ("CHART", "graphic")
  , ("CLSWK", "book")
  , ("COMP", "program")
  , ("CONF", "paper-conference")
  , ("CPAPER", "paper-conference")
  , ("CTLG", "catalog")
  , ("DATA", "dataset")
  , ("DBASE", "dataset")
  , ("DICT", "book")
  , ("EBOOK", "book")
  , ("ECHAP", "chapter")
  , ("EDBOOK", "book")
  , ("EJOUR", "article")
  , ("ELEC", "webpage")
  , ("ENCYC", "entry-encyclopedia")
  , ("EQUA", "figure")
  , ("FIGURE", "figure")
  , ("GEN", "entry")
  , ("GOVDOC", "report")
  , ("GRANT", "report")
  , ("HEAR", "report")
  , ("ICOMM", "personal_communication")
  , ("INPR", "article-journal")
  , ("JFULL", "article-journal")
  , ("JOUR", "article-journal")
  , ("LEGAL", "legal_case")
  , ("MANSCPT", "manuscript")
  , ("MAP", "map")
  , ("MGZN", "article-magazine")
  , ("MPCT", "motion_picture")
  , ("MULTI", "webpage")
  , ("MUSIC", "musical_score")
  , ("NEWS", "article-newspaper")
  , ("PAMP", "pamphlet")
  , ("PAT", "patent")
  , ("PCOMM", "personal_communication")
  , ("RPRT", "report")
  , ("SER", "article")
  , ("SLIDE", "graphic")
  , ("SOUND", "musical_score")
  , ("STAND", "report")
  , ("STAT", "legislation")
  , ("THES", "thesis")
  , ("UNBILL", "bill")
  , ("UNPB", "unpublished")
  , ("VIDEO", "graphic")
  , ("WEB", "webpage")
  ]

108
test/command/7894.md Normal file
View file

@ -0,0 +1,108 @@
```
% pandoc -f ris -t csljson
TY - BOOK
AU - Chang, C. C.
AU - Keisler, H. Jerome
PY - 1990
ET - 3
TI - Model Theory
PU - North-Holland Press
PP - Amsterdam
KW - model theory
KW - logic
ER -
TY - JOUR
AU - Shannon, Claude E.
PY - 1948
DA - July
TI - A Mathematical Theory of Communication
T2 - Bell System Technical Journal
SP - 379
EP - 423
VL - 27
ER -
TY - JOUR
T1 - On computable numbers, with an application to the Entscheidungsproblem
A1 - Turing, Alan Mathison
JO - Proc. of London Mathematical Society
VL - 47
IS - 1
KW - decidability
KW - computability
SP - 230
EP - 265
Y1 - 1937
ER -
^D
[
{
"author": [
{
"family": "Keisler",
"given": "H. Jerome"
},
{
"family": "Chang",
"given": "C. C."
}
],
"edition": "3",
"id": "Chang_Keisler_1990",
"issued": {
"date-parts": [
[
1990
]
]
},
"keyword": "logic, model theory",
"publisher-place": "Amsterdam",
"title": "Model Theory",
"type": "book"
},
{
"author": [
{
"family": "Shannon",
"given": "Claude E."
}
],
"container-title": "Bell System Technical Journal",
"id": "Shannon_1948",
"issued": {
"date-parts": [
[
1948
]
]
},
"page": "379-423",
"title": "A Mathematical Theory of Communication",
"type": "article-journal",
"volume": "27"
},
{
"author": [
{
"family": "Turing",
"given": "Alan Mathison"
}
],
"container-title": "Proc. of London Mathematical Society",
"id": "Turing_1937",
"issue": "1",
"issued": {
"date-parts": [
[
1937
]
]
},
"keyword": "computability, decidability",
"page": "230-265",
"title": "On computable numbers, with an application to the Entscheidungsproblem",
"type": "article-journal",
"volume": "47"
}
]
```