HTML writer: wrap sections in divs. Resolves Issue #70.

+ hierarchicalize has been rationalized; it builds a hierarchical representation of the document from the headers, and simultaneously gives each section a unique identifier based on the heading title. + Identifiers are now attached to the divs rather than to the headers themselves. + Table of content backlinks go to the beginning of the table, rather than to the section reference that was clicked. This seems better. + Code for constructing identifiers has been moved to Text.Pandoc.Shared from the HTML writer, since it is now consumed only by hierarchicalize. + In --strict mode, pandoc just prints bare headings, as before (unless --toc has been specified). + In s5 output, it does not wrap sections in divs, as that seems to confuse the s5 javascript. + Test suite updated accordingly. git-svn-id: https://pandoc.googlecode.com/svn/trunk@1562 788f1e2b-df1e-0410-8736-df70ead52e1b
2009-04-25 00:29:58 +00:00 · 2009-04-25 00:29:58 +00:00 · df5244fd48
commit df5244fd48
parent 2ab7640df6
10 changed files with 1212 additions and 1151 deletions
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@ -1,4 +1,4 @@
-{-# LANGUAGE CPP #-}
+{-# LANGUAGE CPP, DeriveDataTypeable #-}
 {-
 Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu>

@ -112,13 +112,16 @@ import Text.ParserCombinators.Parsec
 import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest )
 import qualified Text.PrettyPrint.HughesPJ as PP
 import Text.Pandoc.CharacterReferences ( characterReference )
-import Data.Char ( toLower, toUpper, ord, isLower, isUpper )
+import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha,
+                   isPunctuation )
 import Data.List ( find, isPrefixOf, intercalate )
-import Control.Monad ( join )
 import Network.URI ( parseURI, URI (..), isAllowedInURI )
 import System.Directory
 import Prelude hiding ( putStrLn, writeFile, readFile, getContents )
 import System.IO.UTF8
+import Data.Generics
+import qualified Control.Monad.State as S
+import Control.Monad (join)

 --
 -- List processing
@ -878,22 +881,74 @@ endsWithPlain blocks =

 -- | Data structure for defining hierarchical Pandoc documents
 data Element = Blk Block 
-             | Sec [Inline] [Element] deriving (Eq, Read, Show)
+             | Sec Int String [Inline] [Element]
+             --    lvl  ident  label    contents
+             deriving (Eq, Read, Show, Typeable, Data)

-- | Returns @True@ on Header block with at least the specified level
-headerAtLeast :: Int -> Block -> Bool
-headerAtLeast level (Header x _) = x <= level
-headerAtLeast _ _ = False
+-- | Convert Pandoc inline list to plain text identifier.
+inlineListToIdentifier :: [Inline] -> String
+inlineListToIdentifier = dropWhile (not . isAlpha) . inlineListToIdentifier'
+
+inlineListToIdentifier' :: [Inline] -> [Char]
+inlineListToIdentifier' [] = ""
+inlineListToIdentifier' (x:xs) =
+  xAsText ++ inlineListToIdentifier' xs
+  where xAsText = case x of
+          Str s          -> filter (\c -> c == '-' || not (isPunctuation c)) $
+                            intercalate "-" $ words $ map toLower s
+          Emph lst       -> inlineListToIdentifier' lst
+          Strikeout lst  -> inlineListToIdentifier' lst
+          Superscript lst -> inlineListToIdentifier' lst
+          SmallCaps   lst -> inlineListToIdentifier' lst
+          Subscript lst  -> inlineListToIdentifier' lst
+          Strong lst     -> inlineListToIdentifier' lst
+          Quoted _ lst   -> inlineListToIdentifier' lst
+          Cite   _ lst   -> inlineListToIdentifier' lst
+          Code s         -> s
+          Space          -> "-"
+          EmDash         -> "-"
+          EnDash         -> "-"
+          Apostrophe     -> ""
+          Ellipses       -> ""
+          LineBreak      -> "-"
+          Math _ _       -> ""
+          TeX _          -> ""
+          HtmlInline _   -> ""
+          Link lst _     -> inlineListToIdentifier' lst
+          Image lst _    -> inlineListToIdentifier' lst
+          Note _         -> ""

 -- | Convert list of Pandoc blocks into (hierarchical) list of Elements
 hierarchicalize :: [Block] -> [Element]
-hierarchicalize [] = []
-hierarchicalize (block:rest) = 
-  case block of
-    (Header level title) -> 
-         let (thisSection, rest') = break (headerAtLeast level) rest
-         in  (Sec title (hierarchicalize thisSection)):(hierarchicalize rest') 
-    x -> (Blk x):(hierarchicalize rest)
+hierarchicalize blocks = S.evalState (hierarchicalizeWithIds blocks) []
+
+hierarchicalizeWithIds :: [Block] -> S.State [String] [Element]
+hierarchicalizeWithIds [] = return []
+hierarchicalizeWithIds ((Header level title'):xs) = do
+  usedIdents <- S.get
+  let ident = uniqueIdent title' usedIdents
+  S.modify (ident :)
+  let (sectionContents, rest) = break (headerLtEq level) xs
+  sectionContents' <- hierarchicalizeWithIds sectionContents
+  rest' <- hierarchicalizeWithIds rest
+  return $ Sec level ident title' sectionContents' : rest'
+hierarchicalizeWithIds (x:rest) = do
+  rest' <- hierarchicalizeWithIds rest
+  return $ (Blk x) : rest'
+
+headerLtEq :: Int -> Block -> Bool
+headerLtEq level (Header l _) = l <= level
+headerLtEq _ _ = False
+
+uniqueIdent :: [Inline] -> [String] -> String
+uniqueIdent title' usedIdents =
+  let baseIdent = inlineListToIdentifier title'
+      numIdent n = baseIdent ++ "-" ++ show n
+  in  if baseIdent `elem` usedIdents
+        then case find (\x -> numIdent x `notElem` usedIdents) ([1..60000] :: [Int]) of
+                  Just x  -> numIdent x
+                  Nothing -> baseIdent   -- if we have more than 60,000, allow repeats
+        else baseIdent

 -- | True if block is a Header block.
 isHeaderBlock :: Block -> Bool
--- a/src/Text/Pandoc/Writers/Docbook.hs
+++ b/src/Text/Pandoc/Writers/Docbook.hs
@ -80,7 +80,7 @@ writeDocbook opts (Pandoc (Meta title authors date) blocks) =
 -- | Convert an Element to Docbook.
 elementToDocbook :: WriterOptions -> Element -> Doc
 elementToDocbook opts (Blk block) = blockToDocbook opts block 
-elementToDocbook opts (Sec title elements) =
+elementToDocbook opts (Sec _ _ title elements) =
  -- Docbook doesn't allow sections with no content, so insert some if needed
  let elements' = if null elements
                    then [Blk (Para [])]
--- a/src/Text/Pandoc/Writers/HTML.hs
+++ b/src/Text/Pandoc/Writers/HTML.hs
@ -36,22 +36,21 @@ import Text.Pandoc.Shared
 import Text.Pandoc.Readers.TeXMath
 import Text.Pandoc.Highlighting ( highlightHtml, defaultHighlightingCss )
 import Numeric ( showHex )
-import Data.Char ( ord, toLower, isAlpha )
+import Data.Char ( ord, toLower )
 import Data.List ( isPrefixOf, intercalate )
+import Data.Maybe ( catMaybes )
 import qualified Data.Set as S
 import Control.Monad.State
 import Text.XHtml.Transitional hiding ( stringToHtml )

 data WriterState = WriterState
    { stNotes            :: [Html]       -- ^ List of notes
-    , stIds              :: [String]     -- ^ List of header identifiers
    , stMath             :: Bool         -- ^ Math is used in document
    , stCSS              :: S.Set String -- ^ CSS to include in header
    } deriving Show

 defaultWriterState :: WriterState
-defaultWriterState = WriterState {stNotes= [], stIds = [], 
-                                  stMath = False, stCSS = S.empty}
+defaultWriterState = WriterState {stNotes= [], stMath = False, stCSS = S.empty}

 -- Helpers to render HTML with the appropriate function.

@ -107,15 +106,13 @@ writeHtml opts (Pandoc (Meta tit authors date) blocks) =
                    not (writerS5 opts)
                        then h1 ! [theclass "title"] $ topTitle
                        else noHtml
-      headerBlocks = filter isHeaderBlock blocks
-      ids          = uniqueIdentifiers $ 
-                     map (\(Header _ lst) -> lst) headerBlocks
+      sects        = hierarchicalize blocks
      toc          = if writerTableOfContents opts 
-                        then tableOfContents opts headerBlocks ids
+                        then evalState (tableOfContents opts sects) defaultWriterState
                        else noHtml
-      (blocks', newstate) = 
-                     runState (blockListToHtml opts blocks)
-                     (defaultWriterState {stIds = ids})
+      (blocks', newstate) = runState 
+                            (mapM (elementToHtml opts) sects >>= return . toHtmlFromList)
+                            defaultWriterState
      cssLines     = stCSS newstate
      css          = if S.null cssLines
                        then noHtml
@ -146,35 +143,36 @@ writeHtml opts (Pandoc (Meta tit authors date) blocks) =
         then head' +++ body thebody
         else thebody

-- | Construct table of contents from list of header blocks and identifiers.
-- Assumes there are as many identifiers as header blocks.
-tableOfContents :: WriterOptions -> [Block] -> [String] -> Html
-tableOfContents _ [] _ = noHtml
-tableOfContents opts headers ids =
+-- | Construct table of contents from list of elements.
+tableOfContents :: WriterOptions -> [Element] -> State WriterState Html
+tableOfContents _ [] = return noHtml
+tableOfContents opts sects = do
  let opts'        = opts { writerIgnoreNotes = True }
-      contentsTree = hierarchicalize headers
-      contents     = evalState (mapM (elementToListItem opts') contentsTree) 
-                     (defaultWriterState {stIds = ids})
-  in  thediv ! [identifier "toc"] $ unordList contents
+  contents  <- mapM (elementToListItem opts') sects
+  return $ thediv ! [identifier "TOC"] $ unordList $ catMaybes contents

 -- | Converts an Element to a list item for a table of contents,
 -- retrieving the appropriate identifier from state.
-elementToListItem :: WriterOptions -> Element -> State WriterState Html
-elementToListItem _ (Blk _) = return noHtml
-elementToListItem opts (Sec headerText subsecs) = do
-  st <- get
-  let ids = stIds st
-  let (id', rest) = if null ids
-                      then ("", [])
-                      else (head ids, tail ids)
-  put $ st {stIds = rest}
+elementToListItem :: WriterOptions -> Element -> State WriterState (Maybe Html)
+elementToListItem _ (Blk _) = return Nothing
+elementToListItem opts (Sec _ id' headerText subsecs) = do
  txt <- inlineListToHtml opts headerText
-  subHeads <- mapM (elementToListItem opts) subsecs
+  subHeads <- mapM (elementToListItem opts) subsecs >>= return . catMaybes
  let subList = if null subHeads
                   then noHtml
-                   else unordList subHeads 
-  return $ (anchor ! [href ("#" ++ id'), identifier ("TOC-" ++ id')] $ txt) +++ 
-           subList
+                   else unordList subHeads
+  return $ Just $ (anchor ! [href ("#" ++ id')] $ txt) +++ subList
+
+-- | Convert an Element to Html.
+elementToHtml :: WriterOptions -> Element -> State WriterState Html
+elementToHtml opts (Blk block) = blockToHtml opts block 
+elementToHtml opts (Sec level id' title' elements) = do
+  innerContents <- mapM (elementToHtml opts) elements
+  header' <- blockToHtml opts (Header level title')
+  return $ if writerS5 opts || (writerStrictMarkdown opts && not (writerTableOfContents opts))
+              -- S5 gets confused by the extra divs around sections
+              then toHtmlFromList (header' : innerContents)
+              else thediv ! [identifier id'] << (header' : innerContents)

 -- | Convert list of Note blocks to a footnote <div>.
 -- Assumes notes are sorted.
@ -236,15 +234,6 @@ obfuscateChar char =
 obfuscateString :: String -> String
 obfuscateString = concatMap obfuscateChar . decodeCharacterReferences

-- | True if character is a punctuation character (unicode).
-isPunctuation :: Char -> Bool
-isPunctuation c =
-  let c' = ord c
-  in  if c `elem` "!\"'()*,-./:;<>?[\\]`{|}~" || c' >= 0x2000 && c' <= 0x206F ||
-         c' >= 0xE000 && c' <= 0xE0FF
-         then True
-         else False
-
 -- | Add CSS for document header.
 addToCSS :: String -> State WriterState ()
 addToCSS item = do
@ -252,50 +241,6 @@ addToCSS item = do
  let current = stCSS st
  put $ st {stCSS = S.insert item current}

-- | Convert Pandoc inline list to plain text identifier.
-inlineListToIdentifier :: [Inline] -> String
-inlineListToIdentifier = dropWhile (not . isAlpha) . inlineListToIdentifier'
-
-inlineListToIdentifier' :: [Inline] -> [Char]
-inlineListToIdentifier' [] = ""
-inlineListToIdentifier' (x:xs) = 
-  xAsText ++ inlineListToIdentifier' xs
-  where xAsText = case x of
-          Str s          -> filter (\c -> c == '-' || not (isPunctuation c)) $
-                            intercalate "-" $ words $ map toLower s
-          Emph lst       -> inlineListToIdentifier' lst
-          Strikeout lst  -> inlineListToIdentifier' lst
-          Superscript lst -> inlineListToIdentifier' lst
-          SmallCaps   lst -> inlineListToIdentifier' lst
-          Subscript lst  -> inlineListToIdentifier' lst
-          Strong lst     -> inlineListToIdentifier' lst
-          Quoted _ lst   -> inlineListToIdentifier' lst
-          Cite   _ lst   -> inlineListToIdentifier' lst
-          Code s         -> s
-          Space          -> "-"
-          EmDash         -> "-"
-          EnDash         -> "-"
-          Apostrophe     -> ""
-          Ellipses       -> ""
-          LineBreak      -> "-"
-          Math _ _       -> ""
-          TeX _          -> ""
-          HtmlInline _   -> ""
-          Link lst _     -> inlineListToIdentifier' lst
-          Image lst _    -> inlineListToIdentifier' lst
-          Note _         -> ""
-
-- | Return unique identifiers for list of inline lists.
-uniqueIdentifiers :: [[Inline]] -> [String]
-uniqueIdentifiers ls =
-  let addIdentifier (nonuniqueIds, uniqueIds) l =
-        let new = inlineListToIdentifier l
-            matches = length $ filter (== new) nonuniqueIds
-            new' = (if null new then "section" else new) ++ 
-                   if matches > 0 then ("-" ++ show matches) else ""
-        in  (new:nonuniqueIds, new':uniqueIds)
-  in  reverse $ snd $ foldl addIdentifier ([],[]) ls
-
 -- | Convert Pandoc block element to HTML.
 blockToHtml :: WriterOptions -> Block -> State WriterState Html
 blockToHtml _ Null = return $ noHtml 
@ -335,26 +280,17 @@ blockToHtml opts (BlockQuote blocks) =
     else blockListToHtml opts blocks >>= (return . blockquote)
 blockToHtml opts (Header level lst) = do 
  contents <- inlineListToHtml opts lst
-  st <- get
-  let ids = stIds st
-  let (id', rest) = if null ids
-                      then ("", [])
-                      else (head ids, tail ids)
-  put $ st {stIds = rest}
-  let attribs = if writerStrictMarkdown opts && not (writerTableOfContents opts)
-                   then []
-                   else [identifier id']
  let contents'  = if writerTableOfContents opts
-                      then anchor ! [href ("#TOC-" ++ id')] $ contents
+                      then anchor ! [href "#TOC"] $ contents
                      else contents
  return $ case level of
-              1 -> h1 contents' ! attribs
-              2 -> h2 contents' ! attribs
-              3 -> h3 contents' ! attribs
-              4 -> h4 contents' ! attribs
-              5 -> h5 contents' ! attribs
-              6 -> h6 contents' ! attribs
-              _ -> paragraph contents' ! attribs
+              1 -> h1 contents'
+              2 -> h2 contents'
+              3 -> h3 contents'
+              4 -> h4 contents'
+              5 -> h5 contents'
+              6 -> h6 contents'
+              _ -> paragraph contents'
 blockToHtml opts (BulletList lst) = do
  contents <- mapM (blockListToHtml opts) lst
  let attribs = if writerIncremental opts
@ -492,7 +428,7 @@ inlineToHtml opts inline =
                                  return $ primHtml $ "<EQ>" ++ str ++ "</EQ>"
                               PlainMath -> 
                                  inlineListToHtml opts (readTeXMath str) >>=
-                                  return . (thespan ! [theclass "math"])) 
+                                  return . (thespan ! [theclass "math"]) ) 
    (TeX str)        -> case writerHTMLMathMethod opts of
                              LaTeXMathML _ -> do modify (\st -> st {stMath = True})
                                                  return $ primHtml str
--- a/src/Text/Pandoc/Writers/Markdown.hs
+++ b/src/Text/Pandoc/Writers/Markdown.hs
@ -138,7 +138,7 @@ tableOfContents opts headers =
 -- | Converts an Element to a list item for a table of contents,
 elementToListItem :: Element -> [Block]
 elementToListItem (Blk _) = []
-elementToListItem (Sec headerText subsecs) = [Plain headerText] ++ 
+elementToListItem (Sec _ _ headerText subsecs) = [Plain headerText] ++ 
  if null subsecs
     then []
     else [BulletList $ map elementToListItem subsecs]
--- a/src/Text/Pandoc/Writers/RTF.hs
+++ b/src/Text/Pandoc/Writers/RTF.hs
@ -59,7 +59,7 @@ tableOfContents headers =

 elementToListItem :: Element -> [Block]
 elementToListItem (Blk _) = []
-elementToListItem (Sec sectext subsecs) = [Plain sectext] ++
+elementToListItem (Sec _ _ sectext subsecs) = [Plain sectext] ++
  if null subsecs
     then []
     else [BulletList (map elementToListItem subsecs)]
--- a/tests/s5.basic.html
+++ b/tests/s5.basic.html
@ -276,24 +276,24 @@ window.onload=startup;window.onresize=function(){setTimeout('fontScale()',50);}
 <div id="currentSlide"></div>
 <div id="header"></div>
 <div id="footer">
-<h1 id="july-15-2006"
+<h1
    >July 15, 2006</h1
-    ><h2 id="my-s5-document"
+    ><h2
    >My S5 Document</h2
    ></div>
 </div>
 <div class="presentation">

 <div class="slide">
-<h1 id="my-s5-document-1"
+<h1
    >My S5 Document</h1
-    ><h3 id="sam-smith-jen-jones"
+    ><h3
    >Sam Smith, Jen Jones</h3
-    ><h4 id="july-15-2006-1"
+    ><h4
    >July 15, 2006</h4
    ></div>
 <div class="slide">
-<h1 id="first-slide"
+<h1
    >First slide</h1
    ><ul
    ><li
@ -303,7 +303,7 @@ window.onload=startup;window.onresize=function(){setTimeout('fontScale()',50);}
      ></ul
    ></div>
 <div class="slide">
-<h1 id="math"
+<h1
    >Math</h1
    ><ul
    ><li
--- a/tests/s5.fancy.html
+++ b/tests/s5.fancy.html
@ -484,24 +484,24 @@ window.onload=startup;window.onresize=function(){setTimeout('fontScale()',50);}
 <div id="currentSlide"></div>
 <div id="header"></div>
 <div id="footer">
-<h1 id="july-15-2006"
+<h1
    >July 15, 2006</h1
-    ><h2 id="my-s5-document"
+    ><h2
    >My S5 Document</h2
    ></div>
 </div>
 <div class="presentation">

 <div class="slide">
-<h1 id="my-s5-document-1"
+<h1
    >My S5 Document</h1
-    ><h3 id="sam-smith-jen-jones"
+    ><h3
    >Sam Smith, Jen Jones</h3
-    ><h4 id="july-15-2006-1"
+    ><h4
    >July 15, 2006</h4
    ></div>
 <div class="slide">
-<h1 id="first-slide"
+<h1
    >First slide</h1
    ><ul class="incremental"
    ><li
@ -511,7 +511,7 @@ window.onload=startup;window.onresize=function(){setTimeout('fontScale()',50);}
      ></ul
    ></div>
 <div class="slide">
-<h1 id="math"
+<h1
    >Math</h1
    ><ul class="incremental"
    ><li
--- a/tests/s5.fragment.html
+++ b/tests/s5.fragment.html
@ -1,41 +1,45 @@
-<h1 id="first-slide"
->First slide</h1
-><ul
-><li
-  >first bullet</li
+<div id="first-slide"
+><h1
+  >First slide</h1
+  ><ul
  ><li
-  >second bullet</li
-  ></ul
-><h1 id="math"
->Math</h1
-><ul
-><li
-  ><span class="math"
-    >\frac{<em
-      >d</em
-      >}{<em
-      >dx</em
-      >}<em
-      >f</em
-      >(<em
-      >x</em
-      >)=\lim<sub
-      ><em
-	>h</em
-	>&#8201;&#8594;&#8201;0</sub
+    >first bullet</li
+    ><li
+    >second bullet</li
+    ></ul
+  ></div
+><div id="math"
+><h1
+  >Math</h1
+  ><ul
+  ><li
+    ><span class="math"
      >\frac{<em
-      >f</em
-      >(<em
-      >x</em
-      >+<em
-      >h</em
-      >)-<em
-      >f</em
-      >(<em
-      >x</em
-      >)}{<em
-      >h</em
-      >}</span
-    ></li
-  ></ul
+	>d</em
+	>}{<em
+	>dx</em
+	>}<em
+	>f</em
+	>(<em
+	>x</em
+	>)=\lim<sub
+	><em
+	  >h</em
+	  >&#8201;&#8594;&#8201;0</sub
+	>\frac{<em
+	>f</em
+	>(<em
+	>x</em
+	>+<em
+	>h</em
+	>)-<em
+	>f</em
+	>(<em
+	>x</em
+	>)}{<em
+	>h</em
+	>}</span
+      ></li
+    ></ul
+  ></div
 >
--- a/tests/s5.inserts.html
+++ b/tests/s5.inserts.html
@ -15,46 +15,50 @@ STUFF INSERTED
  >STUFF INSERTED
 <h1 class="title"
    >My S5 Document</h1
-    ><h1 id="first-slide"
-    >First slide</h1
-    ><ul
-    ><li
-      >first bullet</li
+    ><div id="first-slide"
+    ><h1
+      >First slide</h1
+      ><ul
      ><li
-      >second bullet</li
-      ></ul
-    ><h1 id="math"
-    >Math</h1
-    ><ul
-    ><li
-      ><span class="math"
-	>\frac{<em
-	  >d</em
-	  >}{<em
-	  >dx</em
-	  >}<em
-	  >f</em
-	  >(<em
-	  >x</em
-	  >)=\lim<sub
-	  ><em
-	    >h</em
-	    >&#8201;&#8594;&#8201;0</sub
+	>first bullet</li
+	><li
+	>second bullet</li
+	></ul
+      ></div
+    ><div id="math"
+    ><h1
+      >Math</h1
+      ><ul
+      ><li
+	><span class="math"
 	  >\frac{<em
-	  >f</em
-	  >(<em
-	  >x</em
-	  >+<em
-	  >h</em
-	  >)-<em
-	  >f</em
-	  >(<em
-	  >x</em
-	  >)}{<em
-	  >h</em
-	  >}</span
-	></li
-      ></ul
+	    >d</em
+	    >}{<em
+	    >dx</em
+	    >}<em
+	    >f</em
+	    >(<em
+	    >x</em
+	    >)=\lim<sub
+	    ><em
+	      >h</em
+	      >&#8201;&#8594;&#8201;0</sub
+	    >\frac{<em
+	    >f</em
+	    >(<em
+	    >x</em
+	    >+<em
+	    >h</em
+	    >)-<em
+	    >f</em
+	    >(<em
+	    >x</em
+	    >)}{<em
+	    >h</em
+	    >}</span
+	  ></li
+	></ul
+      ></div
    >STUFF INSERTED
 </body
  ></html
--- a/tests/writer.html
+++ b/tests/writer.html