Add RIS bibliography format reader.

New module, Text.Pandoc.Readers.RIS, exporting readRIS.

New input format `ris`.

Closes #7894.
This commit is contained in:
John MacFarlane 2022-02-04 23:41:59 -08:00
parent 2fca57522b
commit 3da5440858
6 changed files with 365 additions and 0 deletions

View file

@ -260,6 +260,7 @@ header when requesting a document from a URL:
- `odt` ([ODT])
- `opml` ([OPML])
- `org` ([Emacs Org mode])
- `ris` ([RIS] bibliography)
- `rtf` ([Rich Text Format])
- `rst` ([reStructuredText])
- `t2t` ([txt2tags])
@ -487,6 +488,7 @@ header when requesting a document from a URL:
[roff ms]: https://man.cx/groff_ms(7)
[Haskell]: https://www.haskell.org
[GNU Texinfo]: https://www.gnu.org/software/texinfo/
[RIS]: https://en.wikipedia.org/wiki/RIS_(file_format)
[Emacs Org mode]: https://orgmode.org
[AsciiDoc]: https://www.methods.co.nz/asciidoc/
[AsciiDoctor]: https://asciidoctor.org/

View file

@ -547,6 +547,7 @@ library
Text.Pandoc.Readers.Creole,
Text.Pandoc.Readers.BibTeX,
Text.Pandoc.Readers.EndNote,
Text.Pandoc.Readers.RIS,
Text.Pandoc.Readers.CslJson,
Text.Pandoc.Readers.MediaWiki,
Text.Pandoc.Readers.Vimwiki,

View file

@ -69,6 +69,7 @@ formatFromFilePath x =
".org" -> Just "org"
".pdf" -> Just "pdf" -- so we get an "unknown reader" error
".pptx" -> Just "pptx"
".ris" -> Just "ris"
".roff" -> Just "ms"
".rst" -> Just "rst"
".rtf" -> Just "rtf"

View file

@ -56,6 +56,7 @@ module Text.Pandoc.Readers
, readBibTeX
, readBibLaTeX
, readEndNoteXML
, readRIS
, readRTF
-- * Miscellaneous
, getReader
@ -105,6 +106,7 @@ import Text.Pandoc.Readers.CSV
import Text.Pandoc.Readers.CslJson
import Text.Pandoc.Readers.BibTeX
import Text.Pandoc.Readers.EndNote
import Text.Pandoc.Readers.RIS
import Text.Pandoc.Readers.RTF
import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.Sources (ToSources(..), sourcesToText)
@ -154,6 +156,7 @@ readers = [("native" , TextReader readNative)
,("bibtex" , TextReader readBibTeX)
,("biblatex" , TextReader readBibLaTeX)
,("endnotexml" , TextReader readEndNoteXML)
,("ris" , TextReader readRIS)
,("rtf" , TextReader readRTF)
]

View file

@ -0,0 +1,250 @@
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.RIS
Copyright : Copyright (C) 2022 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Parses RIS bibliographies into a Pandoc document
with empty body and `references` and `nocite` fields
in the metadata. A wildcard `nocite` is used so that
if the document is rendered in another format, the
entire bibliography will be printed.
-}
module Text.Pandoc.Readers.RIS
( readRIS
)
where
import Text.Pandoc.Options
import Text.Pandoc.Definition
import Text.Pandoc.Parsing
import Data.Char (isAsciiUpper, isDigit, isSpace, ord, chr)
import Data.List (foldl')
import Citeproc (Reference(..), ItemId(..), Val(..), Date(..), DateParts(..),
toVariable)
import Text.Pandoc.Builder as B
import Text.Pandoc.Class (PandocMonad)
import Text.Pandoc.Citeproc.MetaValue (referenceToMetaValue)
import Text.Pandoc.Citeproc.BibTeX (toName)
import Control.Monad.Except (throwError)
import qualified Data.Text as T
import Data.Text (Text)
import Data.Maybe (fromMaybe)
import qualified Data.Map as M
import Safe (readMay)
-- | Read RIS from an input string and return a Pandoc document.
-- The document will have only metadata, with an empty body.
-- The metadata will contain a `references` field with the
-- bibliography entries, and a `nocite` field with the wildcard `[@*]`.
readRIS :: (PandocMonad m, ToSources a)
=> ReaderOptions -> a -> m Pandoc
readRIS _opts inp = do
parsed <- readWithM risReferences () inp
case parsed of
Right refs -> do
refs' <- mapM (traverse (return . text)) refs
return $
setMeta "nocite" (cite [Citation {citationId = "*"
, citationPrefix = []
, citationSuffix = []
, citationMode = NormalCitation
, citationNoteNum = 0
, citationHash = 0}] (str "[@*]")) $
setMeta "references" (map referenceToMetaValue refs') $
B.doc mempty
Left e -> throwError e
type RISParser m = ParserT Sources () m
risLine :: PandocMonad m => RISParser m (Text, Text)
risLine = do
key <- T.pack <$> count 2 (satisfy (\c -> isAsciiUpper c || isDigit c))
_ <- many1 spaceChar
char '-'
_ <- many1 spaceChar
val <- anyLine
return (key, T.strip val)
risSeparator :: PandocMonad m => RISParser m ()
risSeparator = do
try $ string "ER"
_ <- many1 spaceChar
char '-'
_ <- anyLine
return ()
risRecord :: PandocMonad m => RISParser m [(Text, Text)]
risRecord = manyTill risLine risSeparator
risRecordToReference :: [(Text, Text)] -> Reference Text
risRecordToReference keys = addId $ foldr go defref keys
where
go (key, val) =
case key of
"TY" -> \ref -> ref{ referenceType =
fromMaybe "misc" (M.lookup val risTypes) }
"VL" -> addVar "volume" val
"KW" -> \ref ->
ref{ referenceVariables =
M.alter (\x -> case x of
Nothing -> Just $ TextVal val
Just (TextVal kws)
-> Just (TextVal (kws <> ", " <> val))
_ -> x)
"keyword"
(referenceVariables ref) }
"PB" -> addVar "publisher" val
"PP" -> addVar "publisher-place" val
"SP" -> \ref ->
case M.lookup "page" (referenceVariables ref) of
Nothing -> addVar "page" val ref
Just (FancyVal eg) -> addVar "page" (val <> eg) ref
_ -> ref
"EP" -> \ref ->
case M.lookup "page" (referenceVariables ref) of
Nothing -> addVar "page" ("-" <> val) ref
Just (FancyVal eg) -> addVar "page" (val <> "-" <> eg) ref
_ -> ref
"AU" -> addName "author" val
"A1" -> addName "author" val
"ED" -> addName "editor" val
"A2" -> addName "editor" val
"TI" -> addVar "title" val
"T1" -> addVar "title" val
"CT" -> addVar "title" val
"BT" -> \ref ->
if referenceType ref == "book"
then addVar "title" val ref
else addVar "container-title" val ref
"JO" -> addVar "container-title" val
"JF" -> addVar "container-title" val
"T2" -> addVar "container-title" val
"ET" -> addVar "edition" val
"NV" -> addVar "number-of-volumes" val
"AB" -> addVar "abstract" val
"PY" -> addYear "issued" val
"Y1" -> addYear "issued" val
"IS" -> addVar "issue" val
"SN" -> addVar "ISSN" val
"LA" -> addVar "language" val
"UR" -> addVar "url" val
"LK" -> addVar "url" val
_ -> id -- TODO
addVar k v r = r{ referenceVariables =
M.insert (toVariable k) (FancyVal v)
(referenceVariables r) }
addName k v r =
let new = toName [] . B.toList . B.text $ v
f Nothing = Just (NamesVal new)
f (Just (NamesVal ns)) = Just (NamesVal (ns ++ new))
f (Just x) = Just x
in r{ referenceVariables =
M.alter f k (referenceVariables r) }
addYear k v r =
let d = DateVal $
case readMay (T.unpack v) of
Nothing ->
Date { dateParts = []
, dateCirca = False
, dateSeason = Nothing
, dateLiteral = Just v }
Just y ->
Date { dateParts = [DateParts [y]]
, dateCirca = False
, dateSeason = Nothing
, dateLiteral = Nothing }
in r{ referenceVariables = M.insert k d (referenceVariables r) }
defref = Reference{
referenceId = mempty
, referenceType = mempty
, referenceDisambiguation = Nothing
, referenceVariables = mempty }
addId rec = rec{ referenceId = ItemId (authors <> pubdate) }
authors = T.intercalate "_" $
[T.takeWhile (\c -> c /= ',' && not (isSpace c)) n
| (k, n) <- keys, k == "AU" || k == "A1"]
pubdate = mconcat ["_" <> d | (k, d) <- keys, k == "PY" || k == "Y1"]
risReferences :: PandocMonad m => RISParser m [Reference Text]
risReferences = do
recs <- many risRecord
spaces
eof
return $ fixDuplicateIds $ map risRecordToReference recs
fixDuplicateIds :: [Reference Text] -> [Reference Text]
fixDuplicateIds = reverse . snd . foldl' go (mempty, [])
where
go (ids_seen, refs) ref =
case M.lookup (referenceId ref) ids_seen of
Nothing -> (M.insert (referenceId ref) (ord 'a') ids_seen, ref:refs)
Just n -> (M.insert (referenceId ref) (n+1) ids_seen,
ref{ referenceId =
ItemId . (<> T.singleton (chr n)) . unItemId $
referenceId ref }
: refs)
risTypes :: M.Map Text Text
risTypes = M.fromList
[ ("ABST", "article")
, ("ADVS", "motion-picture")
, ("AGGR", "dataset")
, ("ANCIENT", "book")
, ("ART", "graphic")
, ("BILL", "bill")
, ("BLOG", "post-weblog")
, ("BOOK", "book")
, ("CASE", "legal_case")
, ("CHAP", "chapter")
, ("CHART", "graphic")
, ("CLSWK", "book")
, ("COMP", "program")
, ("CONF", "paper-conference")
, ("CPAPER", "paper-conference")
, ("CTLG", "catalog")
, ("DATA", "dataset")
, ("DBASE", "dataset")
, ("DICT", "book")
, ("EBOOK", "book")
, ("ECHAP", "chapter")
, ("EDBOOK", "book")
, ("EJOUR", "article")
, ("WEB", "webpage")
, ("ENCYC", "entry-encyclopedia")
, ("EQUA", "figure")
, ("FIGURE", "figure")
, ("GEN", "entry")
, ("GOVDOC", "report")
, ("GRANT", "report")
, ("HEAR", "report")
, ("ICOMM", "personal_communication")
, ("INPR", "article-journal")
, ("JFULL", "article-journal")
, ("JOUR", "article-journal")
, ("LEGAL", "legal_case")
, ("MANSCPT", "manuscript")
, ("MAP", "map")
, ("MGZN", "article-magazine")
, ("MPCT", "motion-picture")
, ("MULTI", "webpage")
, ("MUSIC", "musical_score")
, ("NEWS", "article-newspaper")
, ("PAMP", "pamphlet")
, ("PAT", "patent")
, ("PCOMM", "personal_communication")
, ("RPRT", "report")
, ("SER", "article")
, ("SLIDE", "graphic")
, ("SOUND", "musical_score")
, ("STAND", "report")
, ("STAT", "legislation")
, ("THES", "thesis")
, ("UNBILL", "bill")
, ("UNPB", "unpublished")
, ("VIDEO", "graphic") ]

108
test/command/7894.md Normal file
View file

@ -0,0 +1,108 @@
```
% pandoc -f ris -t csljson
TY - BOOK
AU - Chang, C. C.
AU - Keisler, H. Jerome
PY - 1990
ET - 3
TI - Model Theory
PU - North-Holland Press
PP - Amsterdam
KW - model theory
KW - logic
ER -
TY - JOUR
AU - Shannon, Claude E.
PY - 1948
DA - July
TI - A Mathematical Theory of Communication
T2 - Bell System Technical Journal
SP - 379
EP - 423
VL - 27
ER -
TY - JOUR
T1 - On computable numbers, with an application to the Entscheidungsproblem
A1 - Turing, Alan Mathison
JO - Proc. of London Mathematical Society
VL - 47
IS - 1
KW - decidability
KW - computability
SP - 230
EP - 265
Y1 - 1937
ER -
^D
[
{
"author": [
{
"family": "Keisler",
"given": "H. Jerome"
},
{
"family": "Chang",
"given": "C. C."
}
],
"edition": "3",
"id": "Chang_Keisler_1990",
"issued": {
"date-parts": [
[
1990
]
]
},
"keyword": "logic, model theory",
"publisher-place": "Amsterdam",
"title": "Model Theory",
"type": "book"
},
{
"author": [
{
"family": "Shannon",
"given": "Claude E."
}
],
"container-title": "Bell System Technical Journal",
"id": "Shannon_1948",
"issued": {
"date-parts": [
[
1948
]
]
},
"page": "379-423",
"title": "A Mathematical Theory of Communication",
"type": "article-journal",
"volume": "27"
},
{
"author": [
{
"family": "Turing",
"given": "Alan Mathison"
}
],
"container-title": "Proc. of London Mathematical Society",
"id": "Turing_1937",
"issue": "1",
"issued": {
"date-parts": [
[
1937
]
]
},
"keyword": "computability, decidability",
"page": "230-265",
"title": "On computable numbers, with an application to the Entscheidungsproblem",
"type": "article-journal",
"volume": "47"
}
]
```