From 48f2cc5600bd26c60ffa1d5531ba2d9aeead129d Mon Sep 17 00:00:00 2001
From: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>
Date: Fri, 21 Dec 2007 19:25:54 +0000
Subject: [PATCH] Modified rules for HTML header identifiers to ensure legal
 identifiers. + Modified inlineListToIdentifier and uniqueIdentifiers in HTML
 writer to ensure that identifiers begin with an alphabetic character. + The
 new rules are described in README. + Resolves Issue #33.

git-svn-id: https://pandoc.googlecode.com/svn/trunk@1150 788f1e2b-df1e-0410-8736-df70ead52e1b
---
 README                      | 17 ++++++++++++-----
 Text/Pandoc/Writers/HTML.hs | 29 ++++++++++++++++-------------
 2 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/README b/README
index a8fd9e673..cd6b30cda 100644
--- a/README
+++ b/README
@@ -790,12 +790,19 @@ derive the identifier from the header text,
   - Remove all punctuation, except dashes and hyphens.
   - Replace all spaces, dashes, newlines, and hyphens with hyphens.
   - Convert all alphabetic characters to lowercase.
+  - Remove everything before the first letter (identifiers may
+    not begin with a number or punctuation mark).
+  - If nothing is left after this, use the identifier `section`.
 
-Thus, for example, a heading 'Header identifiers in HTML' will get
-the identifier `header-identifiers-in-html`, a heading
-'*Dogs*?--in *my* house?' will get the identifier `dogs--in-my-house`,
-and a heading '[HTML], [S5], or [RTF]?' will get the identifier
-`html-s5-or-rtf`.
+Thus, for example,
+
+  Header                                  Identifier
+  -------------------------------------   ---------------------------
+  Header identifiers in HTML              `header-identifiers-in-html`
+  *Dogs*?--in *my* house?                 `dogs--in-my-house`
+  [HTML], [S5], or [RTF]?                 `html-s5-or-rtf`
+  3. Applications                         `applications`
+  33                                      `section`
 
 These rules should, in most cases, allow one to determine the identifier
 from the header text. The exception is when several headers have the
diff --git a/Text/Pandoc/Writers/HTML.hs b/Text/Pandoc/Writers/HTML.hs
index 660bf652e..0061420d0 100644
--- a/Text/Pandoc/Writers/HTML.hs
+++ b/Text/Pandoc/Writers/HTML.hs
@@ -35,7 +35,7 @@ import Text.Pandoc.Shared
 import Text.Pandoc.Readers.TeXMath
 import Text.Regex ( mkRegex, matchRegex )
 import Numeric ( showHex )
-import Data.Char ( ord, toLower )
+import Data.Char ( ord, toLower, isAlpha )
 import Data.List ( isPrefixOf, intersperse )
 import qualified Data.Set as S
 import Control.Monad.State
@@ -215,18 +215,20 @@ addToCSS item = do
 
 -- | Convert Pandoc inline list to plain text identifier.
 inlineListToIdentifier :: [Inline] -> String
-inlineListToIdentifier [] = ""
-inlineListToIdentifier (x:xs) = 
-  xAsText ++ inlineListToIdentifier xs
+inlineListToIdentifier = dropWhile (not . isAlpha) . inlineListToIdentifier'
+
+inlineListToIdentifier' [] = ""
+inlineListToIdentifier' (x:xs) = 
+  xAsText ++ inlineListToIdentifier' xs
   where xAsText = case x of
           Str s          -> filter (\c -> c == '-' || not (isPunctuation c)) $
                             concat $ intersperse "-" $ words $ map toLower s
-          Emph lst       -> inlineListToIdentifier lst
-          Strikeout lst  -> inlineListToIdentifier lst
-          Superscript lst -> inlineListToIdentifier lst
-          Subscript lst  -> inlineListToIdentifier lst
-          Strong lst     -> inlineListToIdentifier lst
-          Quoted _ lst   -> inlineListToIdentifier lst
+          Emph lst       -> inlineListToIdentifier' lst
+          Strikeout lst  -> inlineListToIdentifier' lst
+          Superscript lst -> inlineListToIdentifier' lst
+          Subscript lst  -> inlineListToIdentifier' lst
+          Strong lst     -> inlineListToIdentifier' lst
+          Quoted _ lst   -> inlineListToIdentifier' lst
           Code s         -> s
           Space          -> "-"
           EmDash         -> "-"
@@ -237,8 +239,8 @@ inlineListToIdentifier (x:xs) =
           Math _         -> ""
           TeX _          -> ""
           HtmlInline _   -> ""
-          Link lst _     -> inlineListToIdentifier lst
-          Image lst _    -> inlineListToIdentifier lst
+          Link lst _     -> inlineListToIdentifier' lst
+          Image lst _    -> inlineListToIdentifier' lst
           Note _         -> ""
 
 -- | Return unique identifiers for list of inline lists.
@@ -247,7 +249,8 @@ uniqueIdentifiers ls =
   let addIdentifier (nonuniqueIds, uniqueIds) l =
         let new = inlineListToIdentifier l
             matches = length $ filter (== new) nonuniqueIds
-            new' = new ++ if matches > 0 then ("-" ++ show matches) else ""
+            new' = (if null new then "section" else new) ++ 
+                   if matches > 0 then ("-" ++ show matches) else ""
         in  (new:nonuniqueIds, new':uniqueIds)
   in  reverse $ snd $ foldl addIdentifier ([],[]) ls