From 7a70a46c0319f279fdee3926abff08922be2f02c Mon Sep 17 00:00:00 2001 From: Rowan Rodrik van der Molen <bigsmoke@gmail.com> Date: Fri, 5 Nov 2021 17:22:38 +0000 Subject: [PATCH] Support for <indexterm>s when reading DocBook (#7607) * Support for <indexterm>s when reading DocBook * Update implementation status of `<n-ary>` tags * Remove non-idiomatic parentheses * More complete `<indexterm>` support, with tests Co-authored-by: Rowan Rodrik van der Molen <rowan@ytec.nl> --- src/Text/Pandoc/Readers/DocBook.hs | 41 ++++++- test/docbook-reader.docbook | 12 ++ test/docbook-reader.native | 187 +++++++++++++++++++++++++++++ 3 files changed, 236 insertions(+), 4 deletions(-) diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index 1c13e597b..bdf802925 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -19,7 +19,7 @@ import Data.Foldable (asum) import Data.Generics import Data.List (intersperse,elemIndex) import Data.List.NonEmpty (nonEmpty) -import Data.Maybe (fromMaybe,mapMaybe) +import Data.Maybe (catMaybes,fromMaybe,mapMaybe) import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Lazy as TL @@ -316,7 +316,7 @@ List of all DocBook tags, with [x] indicating implemented, [ ] postcode - A postal code in an address [x] preface - Introductory matter preceding the first chapter of a book [ ] prefaceinfo - Meta-information for a Preface -[ ] primary - The primary word or phrase under which an index term should be +[x] primary - The primary word or phrase under which an index term should be sorted [ ] primaryie - A primary term in an index entry, not in the text [ ] printhistory - The printing history of a document @@ -385,7 +385,7 @@ List of all DocBook tags, with [x] indicating implemented, [o] screeninfo - Information about how a screen shot was produced [ ] screenshot - A representation of what the user sees or might see on a computer screen -[ ] secondary - A secondary 
word or phrase in an index term +[x] secondary - A secondary word or phrase in an index term [ ] secondaryie - A secondary term in an index entry, rather than in the text [x] sect1 - A top-level section of document [x] sect1info - Meta-information for a Sect1 @@ -461,7 +461,7 @@ List of all DocBook tags, with [x] indicating implemented, [x] td - A table entry in an HTML table [x] term - The word or phrase being defined or described in a variable list [ ] termdef - An inline term definition -[ ] tertiary - A tertiary word or phrase in an index term +[x] tertiary - A tertiary word or phrase in an index term [ ] tertiaryie - A tertiary term in an index entry, rather than in the text [ ] textdata - Pointer to external text data [ ] textobject - A wrapper for a text description of an object and its @@ -1080,6 +1080,17 @@ elementToStr :: Content -> Content elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing elementToStr x = x +childElTextAsAttr :: Text -> Element -> Maybe (Text, Text) +childElTextAsAttr n e = case findChild q e of + Nothing -> Nothing + Just childEl -> Just (n, strContentRecursive childEl) + where q = QName n (Just "http://docbook.org/ns/docbook") Nothing + +attrValueAsOptionalAttr :: Text -> Element -> Maybe (Text, Text) +attrValueAsOptionalAttr n e = case attrValue n e of + "" -> Nothing + _ -> Just (n, attrValue n e) + parseInline :: PandocMonad m => Content -> DB m Inlines parseInline (Text (CData _ s _)) = return $ text s parseInline (CRef ref) = @@ -1094,6 +1105,28 @@ parseInline (Elem e) = if ident /= "" || classes /= [] then innerInlines (spanWith (ident,classes,[])) else innerInlines id + "indexterm" -> do + let ident = attrValue "id" e + let classes = T.words $ attrValue "role" e + let attrs = + -- In DocBook, <primary>, <secondary>, <tertiary>, <see>, and <seealso> + -- have mixed content models. 
However, because we're representing these + -- elements in Pandoc's AST as attributes of a phrase, we flatten all + -- the descendant content of these elements. + [ childElTextAsAttr "primary" e + , childElTextAsAttr "secondary" e + , childElTextAsAttr "tertiary" e + , childElTextAsAttr "see" e + , childElTextAsAttr "seealso" e + , attrValueAsOptionalAttr "significance" e + , attrValueAsOptionalAttr "startref" e + , attrValueAsOptionalAttr "scope" e + , attrValueAsOptionalAttr "class" e + -- We don't do anything with the "pagenum" attribute, because these only + -- occur within literal <index> sections, which is not supported by Pandoc, + -- because Pandoc has no concept of pages. + ] + return $ spanWith (ident, ("indexterm" : classes), (catMaybes attrs)) mempty "equation" -> equation e displayMath "informalequation" -> equation e displayMath "inlineequation" -> equation e math diff --git a/test/docbook-reader.docbook b/test/docbook-reader.docbook index c38abda82..00bd84649 100644 --- a/test/docbook-reader.docbook +++ b/test/docbook-reader.docbook @@ -1603,4 +1603,16 @@ or here: &lt;http://example.com/&gt; </step> </procedure> </sect1> +<sect1 id="indexterms"> + <title>Index terms</title> + <para> + In the simplest case, index terms<indexterm><primary>index term</primary></indexterm> consists of just a <code>&lt;primary&gt;</code> element, but <indexterm><primary>index term</primary><secondary>multi-level</secondary></indexterm> they can also consist of a <code>&lt;primary&gt;</code> <emph>and</emph> <code>&lt;secondary&gt;</code> element, and <indexterm><primary>index term</primary><secondary>multi-level</secondary><tertiary>3-level</tertiary></indexterm> can even include a <code>&lt;tertiary&gt;</code> term. 
+ </para> + <para> + Index terms can also refer to other index terms: <indexterm><primary>index cross referencing</primary></indexterm><indexterm><primary>index term</primary><secondary>cross references</secondary><see>index cross referencing</see></indexterm>exclusively, using the <code>&lt;see&gt;</code> tag; or <indexterm><primary>index cross referencing</primary><seealso>cross referencing</seealso></indexterm> as a reference to related terms, using the <code>&lt;seealso&gt;</code> tag. + </para> + <para> + <indexterm><primary>food</primary><secondary>big <foreignphrase>baguette</foreignphrase> <strong>supreme</strong></secondary></indexterm>Nested content in index term elements is flattened. + </para> +</sect1> </article> diff --git a/test/docbook-reader.native b/test/docbook-reader.native index be3819336..7520068b1 100644 --- a/test/docbook-reader.native +++ b/test/docbook-reader.native @@ -2930,4 +2930,191 @@ Pandoc [ Str "A" , Space , Str "Final" , Space , Str "Step" ] ] ] + , Header + 1 + ( "indexterms" , [] , [] ) + [ Str "Index" , Space , Str "terms" ] + , Para + [ Str "In" + , Space + , Str "the" + , Space + , Str "simplest" + , Space + , Str "case," + , Space + , Str "index" + , Space + , Str "terms" + , Span + ( "" , [ "indexterm" ] , [ ( "primary" , "index term" ) ] ) + [] + , Space + , Str "consists" + , Space + , Str "of" + , Space + , Str "just" + , Space + , Str "a" + , Space + , Code ( "" , [] , [] ) "<primary>" + , Space + , Str "element," + , Space + , Str "but" + , Space + , Span + ( "" + , [ "indexterm" ] + , [ ( "primary" , "index term" ) + , ( "secondary" , "multi-level" ) + ] + ) + [] + , Space + , Str "they" + , Space + , Str "can" + , Space + , Str "also" + , Space + , Str "consist" + , Space + , Str "of" + , Space + , Str "a" + , Space + , Code ( "" , [] , [] ) "<primary>" + , Space + , Str "and" + , Space + , Code ( "" , [] , [] ) "<secondary>" + , Space + , Str "element," + , Space + , Str "and" + , Space + , Span + ( "" + , [ "indexterm" ] + , [ 
( "primary" , "index term" ) + , ( "secondary" , "multi-level" ) + , ( "tertiary" , "3-level" ) + ] + ) + [] + , Space + , Str "can" + , Space + , Str "even" + , Space + , Str "include" + , Space + , Str "a" + , Space + , Code ( "" , [] , [] ) "<tertiary>" + , Space + , Str "term." + ] + , Para + [ Str "Index" + , Space + , Str "terms" + , Space + , Str "can" + , Space + , Str "also" + , Space + , Str "refer" + , Space + , Str "to" + , Space + , Str "other" + , Space + , Str "index" + , Space + , Str "terms:" + , Space + , Span + ( "" + , [ "indexterm" ] + , [ ( "primary" , "index cross referencing" ) ] + ) + [] + , Span + ( "" + , [ "indexterm" ] + , [ ( "primary" , "index term" ) + , ( "secondary" , "cross references" ) + , ( "see" , "index cross referencing" ) + ] + ) + [] + , Str "exclusively," + , Space + , Str "using" + , Space + , Str "the" + , Space + , Code ( "" , [] , [] ) "<see>" + , Space + , Str "tag;" + , Space + , Str "or" + , Space + , Span + ( "" + , [ "indexterm" ] + , [ ( "primary" , "index cross referencing" ) + , ( "seealso" , "cross referencing" ) + ] + ) + [] + , Space + , Str "as" + , Space + , Str "a" + , Space + , Str "reference" + , Space + , Str "to" + , Space + , Str "related" + , Space + , Str "terms," + , Space + , Str "using" + , Space + , Str "the" + , Space + , Code ( "" , [] , [] ) "<seealso>" + , Space + , Str "tag." + ] + , Para + [ Span + ( "" + , [ "indexterm" ] + , [ ( "primary" , "food" ) + , ( "secondary" , "big baguette supreme" ) + ] + ) + [] + , Str "Nested" + , Space + , Str "content" + , Space + , Str "in" + , Space + , Str "index" + , Space + , Str "term" + , Space + , Str "elements" + , Space + , Str "is" + , Space + , Str "flattened." + ] ]