Support for <indexterm>s when reading DocBook (#7607)

* Support for <indexterm>s when reading DocBook
* Update implementation status of `<n-ary>` tags
* Remove non-idiomatic parentheses
* More complete `<indexterm>` support, with tests

Co-authored-by: Rowan Rodrik van der Molen <rowan@ytec.nl>
This commit is contained in:
Rowan Rodrik van der Molen 2021-11-05 17:22:38 +00:00 committed by GitHub
parent 5750f60442
commit 7a70a46c03
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 236 additions and 4 deletions

View file

@ -19,7 +19,7 @@ import Data.Foldable (asum)
import Data.Generics
import Data.List (intersperse,elemIndex)
import Data.List.NonEmpty (nonEmpty)
import Data.Maybe (fromMaybe,mapMaybe)
import Data.Maybe (catMaybes,fromMaybe,mapMaybe)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
@ -316,7 +316,7 @@ List of all DocBook tags, with [x] indicating implemented,
[ ] postcode - A postal code in an address
[x] preface - Introductory matter preceding the first chapter of a book
[ ] prefaceinfo - Meta-information for a Preface
[ ] primary - The primary word or phrase under which an index term should be
[x] primary - The primary word or phrase under which an index term should be
sorted
[ ] primaryie - A primary term in an index entry, not in the text
[ ] printhistory - The printing history of a document
@ -385,7 +385,7 @@ List of all DocBook tags, with [x] indicating implemented,
[o] screeninfo - Information about how a screen shot was produced
[ ] screenshot - A representation of what the user sees or might see on a
computer screen
[ ] secondary - A secondary word or phrase in an index term
[x] secondary - A secondary word or phrase in an index term
[ ] secondaryie - A secondary term in an index entry, rather than in the text
[x] sect1 - A top-level section of document
[x] sect1info - Meta-information for a Sect1
@ -461,7 +461,7 @@ List of all DocBook tags, with [x] indicating implemented,
[x] td - A table entry in an HTML table
[x] term - The word or phrase being defined or described in a variable list
[ ] termdef - An inline term definition
[ ] tertiary - A tertiary word or phrase in an index term
[x] tertiary - A tertiary word or phrase in an index term
[ ] tertiaryie - A tertiary term in an index entry, rather than in the text
[ ] textdata - Pointer to external text data
[ ] textobject - A wrapper for a text description of an object and its
@ -1080,6 +1080,17 @@ elementToStr :: Content -> Content
elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing
elementToStr x = x
-- | Look for a child element of @e@ named @n@ in the DocBook namespace.
-- When one is found, pair the name @n@ with that child's flattened text
-- (as produced by 'strContentRecursive'); otherwise yield 'Nothing'.
childElTextAsAttr :: Text -> Element -> Maybe (Text, Text)
childElTextAsAttr n e = toAttr <$> findChild docbookName e
  where
    -- The lookup is namespace-qualified and carries no prefix.
    docbookName = QName n (Just "http://docbook.org/ns/docbook") Nothing
    toAttr child = (n, strContentRecursive child)
-- | Read attribute @n@ of element @e@ via 'attrValue'.  An empty result is
-- reported as 'Nothing'; any other value is returned paired with the
-- attribute name.
attrValueAsOptionalAttr :: Text -> Element -> Maybe (Text, Text)
attrValueAsOptionalAttr n e
  | T.null val = Nothing
  | otherwise  = Just (n, val)
  where
    val = attrValue n e
parseInline :: PandocMonad m => Content -> DB m Inlines
parseInline (Text (CData _ s _)) = return $ text s
parseInline (CRef ref) =
@ -1094,6 +1105,28 @@ parseInline (Elem e) =
if ident /= "" || classes /= []
then innerInlines (spanWith (ident,classes,[]))
else innerInlines id
"indexterm" -> do
let ident = attrValue "id" e
let classes = T.words $ attrValue "role" e
let attrs =
-- In DocBook, <primary>, <secondary>, <tertiary>, <see>, and <seealso>
-- have mixed content models. However, because we're representing these
-- elements in Pandoc's AST as attributes of a phrase, we flatten all
-- the descendant content of these elements.
[ childElTextAsAttr "primary" e
, childElTextAsAttr "secondary" e
, childElTextAsAttr "tertiary" e
, childElTextAsAttr "see" e
, childElTextAsAttr "seealso" e
, attrValueAsOptionalAttr "significance" e
, attrValueAsOptionalAttr "startref" e
, attrValueAsOptionalAttr "scope" e
, attrValueAsOptionalAttr "class" e
-- We don't do anything with the "pagenum" attribute, because it only
-- occurs within literal <index> sections, which are not supported by Pandoc,
-- because Pandoc has no concept of pages.
]
return $ spanWith (ident, ("indexterm" : classes), (catMaybes attrs)) mempty
"equation" -> equation e displayMath
"informalequation" -> equation e displayMath
"inlineequation" -> equation e math

View file

@ -1603,4 +1603,16 @@ or here: &lt;http://example.com/&gt;
</step>
</procedure>
</sect1>
<sect1 id="indexterms">
<title>Index terms</title>
<para>
In the simplest case, index terms<indexterm><primary>index term</primary></indexterm> consists of just a <code>&lt;primary&gt;</code> element, but <indexterm><primary>index term</primary><secondary>multi-level</secondary></indexterm> they can also consist of a <code>&lt;primary&gt;</code> <emph>and</emph> <code>&lt;secondary&gt;</code> element, and <indexterm><primary>index term</primary><secondary>multi-level</secondary><tertiary>3-level</tertiary></indexterm> can even include a <code>&lt;tertiary&gt;</code> term.
</para>
<para>
Index terms can also refer to other index terms: <indexterm><primary>index cross referencing</primary></indexterm><indexterm><primary>index term</primary><secondary>cross references</secondary><see>index cross referencing</see></indexterm>exclusively, using the <code>&lt;see&gt;</code> tag; or <indexterm><primary>index cross referencing</primary><seealso>cross referencing</seealso></indexterm> as a reference to related terms, using the <code>&lt;seealso&gt;</code> tag.
</para>
<para>
<indexterm><primary>food</primary><secondary>big <foreignphrase>baguette</foreignphrase> <strong>supreme</strong></secondary></indexterm>Nested content in index term elements is flattened.
</para>
</sect1>
</article>

View file

@ -2930,4 +2930,191 @@ Pandoc
[ Str "A" , Space , Str "Final" , Space , Str "Step" ]
]
]
, Header
1
( "indexterms" , [] , [] )
[ Str "Index" , Space , Str "terms" ]
, Para
[ Str "In"
, Space
, Str "the"
, Space
, Str "simplest"
, Space
, Str "case,"
, Space
, Str "index"
, Space
, Str "terms"
, Span
( "" , [ "indexterm" ] , [ ( "primary" , "index term" ) ] )
[]
, Space
, Str "consists"
, Space
, Str "of"
, Space
, Str "just"
, Space
, Str "a"
, Space
, Code ( "" , [] , [] ) "<primary>"
, Space
, Str "element,"
, Space
, Str "but"
, Space
, Span
( ""
, [ "indexterm" ]
, [ ( "primary" , "index term" )
, ( "secondary" , "multi-level" )
]
)
[]
, Space
, Str "they"
, Space
, Str "can"
, Space
, Str "also"
, Space
, Str "consist"
, Space
, Str "of"
, Space
, Str "a"
, Space
, Code ( "" , [] , [] ) "<primary>"
, Space
, Str "and"
, Space
, Code ( "" , [] , [] ) "<secondary>"
, Space
, Str "element,"
, Space
, Str "and"
, Space
, Span
( ""
, [ "indexterm" ]
, [ ( "primary" , "index term" )
, ( "secondary" , "multi-level" )
, ( "tertiary" , "3-level" )
]
)
[]
, Space
, Str "can"
, Space
, Str "even"
, Space
, Str "include"
, Space
, Str "a"
, Space
, Code ( "" , [] , [] ) "<tertiary>"
, Space
, Str "term."
]
, Para
[ Str "Index"
, Space
, Str "terms"
, Space
, Str "can"
, Space
, Str "also"
, Space
, Str "refer"
, Space
, Str "to"
, Space
, Str "other"
, Space
, Str "index"
, Space
, Str "terms:"
, Space
, Span
( ""
, [ "indexterm" ]
, [ ( "primary" , "index cross referencing" ) ]
)
[]
, Span
( ""
, [ "indexterm" ]
, [ ( "primary" , "index term" )
, ( "secondary" , "cross references" )
, ( "see" , "index cross referencing" )
]
)
[]
, Str "exclusively,"
, Space
, Str "using"
, Space
, Str "the"
, Space
, Code ( "" , [] , [] ) "<see>"
, Space
, Str "tag;"
, Space
, Str "or"
, Space
, Span
( ""
, [ "indexterm" ]
, [ ( "primary" , "index cross referencing" )
, ( "seealso" , "cross referencing" )
]
)
[]
, Space
, Str "as"
, Space
, Str "a"
, Space
, Str "reference"
, Space
, Str "to"
, Space
, Str "related"
, Space
, Str "terms,"
, Space
, Str "using"
, Space
, Str "the"
, Space
, Code ( "" , [] , [] ) "<seealso>"
, Space
, Str "tag."
]
, Para
[ Span
( ""
, [ "indexterm" ]
, [ ( "primary" , "food" )
, ( "secondary" , "big baguette supreme" )
]
)
[]
, Str "Nested"
, Space
, Str "content"
, Space
, Str "in"
, Space
, Str "index"
, Space
, Str "term"
, Space
, Str "elements"
, Space
, Str "is"
, Space
, Str "flattened."
]
]