From dc6925542c6aa60078c370e7e356b42ea216b1b7 Mon Sep 17 00:00:00 2001
From: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>
Date: Sun, 28 Jan 2007 00:04:43 +0000
Subject: [PATCH] + Simplified entity handling by removing stringToSGML from
 Entities.hs.   It is no longer needed now that all entities are processed in
 the markdown   and HTML readers.  All calls to stringToSGML have been
 replaced by calls   to encodeEntities. + Since inTag's attribute handling
 already encodes entities,   calls to encodeEntities are no longer needed for
 attribute values, so   they've been removed. + The HTML and Markdown readers
 now call decodeEntities on all raw   strings (e.g. authors, dates, link
 titles), to ensure that no unprocessed   entities are included in the native
 representation of the document.   (In the HTML reader, most of this work is
 done by a change in   extractAttributeName.) + The result is a small speed
 improvement (around 5% on my benchmark)   and cleaner code.

git-svn-id: https://pandoc.googlecode.com/svn/trunk@519 788f1e2b-df1e-0410-8736-df70ead52e1b
---
 src/Text/Pandoc/Entities.hs         | 18 ------------------
 src/Text/Pandoc/Readers/HTML.hs     |  4 +++-
 src/Text/Pandoc/Readers/Markdown.hs |  8 ++++----
 src/Text/Pandoc/Shared.hs           |  6 +++---
 src/Text/Pandoc/Writers/Docbook.hs  | 14 +++++++-------
 src/Text/Pandoc/Writers/HTML.hs     | 18 ++++++++----------
 6 files changed, 25 insertions(+), 43 deletions(-)

diff --git a/src/Text/Pandoc/Entities.hs b/src/Text/Pandoc/Entities.hs
index 696f943a6..e91cf3864 100644
--- a/src/Text/Pandoc/Entities.hs
+++ b/src/Text/Pandoc/Entities.hs
@@ -34,7 +34,6 @@ module Text.Pandoc.Entities (
                      encodeEntities,
                      decodeEntities,
                      escapeSGMLChar,
-                     stringToSGML,
                      characterEntity
                     ) where
 import Data.Char ( chr, ord )
@@ -115,23 +114,6 @@ decodeEntities str =
 	Left err        -> error $ "\nError: " ++ show err
 	Right result    -> result
 
--- | Escape string for SGML, preserving entity references.
-stringToSGML :: String -> String
-stringToSGML str = 
-  let regular   = do
-                    str <- many1 (satisfy (not . needsEscaping))
-                    return str 
-      special   = do
-                    notFollowedBy characterEntity
-                    c <- anyChar
-                    return $ escapeSGMLChar c 
-      entity    = do
-                    ent <- manyTill anyChar (char ';')
-                    return (ent ++ ";") in
-  case parse (many (regular <|> special <|> entity)) str str of
-    Left err       -> error $ "\nError: " ++ show err
-    Right result   -> concat result
-
 entityTable :: [(String, Char)]
 entityTable =  [
 	("&quot;", chr 34),
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index fc06b657e..3fcb33698 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -445,7 +445,9 @@ extractAttribute name [] = Nothing
 extractAttribute name ((attrName, contents):rest) = 
   let name' = map toLower name 
       attrName' = map toLower attrName in
-  if (attrName' == name') then Just contents else extractAttribute name rest
+  if (attrName' == name')
+     then Just (decodeEntities contents)
+     else extractAttribute name rest
 
 link = try (do
   (tag, attributes) <- htmlTag "a"  
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 0f1ef348d..a7456426f 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -42,7 +42,7 @@ import Text.Pandoc.Readers.HTML ( rawHtmlBlock,
                                   anyHtmlTag, anyHtmlEndTag,
                                   htmlEndTag, extractTagType,
                                   htmlBlockElement )
-import Text.Pandoc.Entities ( characterEntity )
+import Text.Pandoc.Entities ( characterEntity, decodeEntities )
 import Text.ParserCombinators.Parsec
 
 -- | Read markdown from an input string and return a Pandoc document.
@@ -144,14 +144,14 @@ authorsLine = try (do
   skipSpaces
   authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
   newline
-  return (map removeLeadingTrailingSpace authors))
+  return (map (decodeEntities . removeLeadingTrailingSpace) authors))
 
 dateLine = try (do
   char '%'
   skipSpaces
   date <- many (noneOf "\n")
   newline
-  return (removeTrailingSpace date))
+  return (decodeEntities $ removeTrailingSpace date))
 
 titleBlock = try (do
   failIfStrict
@@ -894,7 +894,7 @@ titleWith startChar endChar = try (do
                                   char endChar
                                   skipSpaces
                                   notFollowedBy (noneOf ")\n")))
-  return tit)
+  return $ decodeEntities tit)
 
 title = choice [ titleWith '(' ')', 
                  titleWith '"' '"', 
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index 02f8782b2..83b64b4fb 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -72,7 +72,7 @@ module Text.Pandoc.Shared (
                     ) where
 import Text.Pandoc.Definition
 import Text.ParserCombinators.Parsec as Parsec
-import Text.Pandoc.Entities ( decodeEntities, encodeEntities, stringToSGML )
+import Text.Pandoc.Entities ( decodeEntities, encodeEntities )
 import Text.PrettyPrint.HughesPJ as PP ( text, char, (<>), 
                                          ($$), nest, Doc, isEmpty )
 import Data.Char ( toLower, ord )
@@ -524,8 +524,8 @@ replaceRefLinksInline keytable other = other
 -- | Return a text object with a string of formatted SGML attributes. 
 attributeList :: [(String, String)] -> Doc
 attributeList = text .  concatMap 
-  (\(a, b) -> " " ++ stringToSGML a ++ "=\"" ++ 
-  stringToSGML b ++ "\"") 
+  (\(a, b) -> " " ++ encodeEntities a ++ "=\"" ++ 
+  encodeEntities b ++ "\"") 
 
 -- | Put the supplied contents between start and end tags of tagType,
 --   with specified attributes and (if specified) indentation.
diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs
index 7e50f8ede..405b2978a 100644
--- a/src/Text/Pandoc/Writers/Docbook.hs
+++ b/src/Text/Pandoc/Writers/Docbook.hs
@@ -32,7 +32,7 @@ module Text.Pandoc.Writers.Docbook (
                                    ) where
 import Text.Pandoc.Definition
 import Text.Pandoc.Shared
-import Text.Pandoc.Entities ( encodeEntities, stringToSGML )
+import Text.Pandoc.Entities ( encodeEntities )
 import Data.Char ( toLower, ord )
 import Data.List ( isPrefixOf, partition, drop )
 import Text.PrettyPrint.HughesPJ hiding ( Str )
@@ -64,8 +64,8 @@ authorToDocbook name = inTagsIndented "author" $
     then -- last name first
       let (lastname, rest) = break (==',') name 
           firstname = removeLeadingSpace rest in
-      inTagsSimple "firstname" (text $ stringToSGML firstname) <> 
-      inTagsSimple "surname" (text $ stringToSGML lastname) 
+      inTagsSimple "firstname" (text $ encodeEntities firstname) <> 
+      inTagsSimple "surname" (text $ encodeEntities lastname) 
     else -- last name last
       let namewords = words name
           lengthname = length namewords 
@@ -73,8 +73,8 @@ authorToDocbook name = inTagsIndented "author" $
             0  -> ("","") 
             1  -> ("", name)
             n  -> (joinWithSep " " (take (n-1) namewords), last namewords) in
-       inTagsSimple "firstname" (text $ stringToSGML firstname) $$ 
-       inTagsSimple "surname" (text $ stringToSGML lastname) 
+       inTagsSimple "firstname" (text $ encodeEntities firstname) $$ 
+       inTagsSimple "surname" (text $ encodeEntities lastname) 
 
 -- | Convert Pandoc document to string in Docbook format.
 writeDocbook :: WriterOptions -> Pandoc -> String
@@ -86,7 +86,7 @@ writeDocbook opts (Pandoc (Meta title authors date) blocks) =
                 then inTagsIndented "articleinfo" $
                      (inTagsSimple "title" (wrap opts title)) $$ 
                      (vcat (map authorToDocbook authors)) $$ 
-                     (inTagsSimple "date" (text $ stringToSGML date)) 
+                     (inTagsSimple "date" (text $ encodeEntities date)) 
                 else empty
       blocks' = replaceReferenceLinks blocks
       (noteBlocks, blocks'') = partition isNoteBlock blocks' 
@@ -227,7 +227,7 @@ inlineToDocbook opts (Image alt (Src src tit)) =
                    then empty
                    else inTagsIndented "objectinfo" $
                         inTagsIndented "title" 
-                        (text $ stringToSGML tit) in
+                        (text $ encodeEntities tit) in
   inTagsIndented "inlinemediaobject" $ 
   inTagsIndented "imageobject" $
   titleDoc $$ selfClosingTag "imagedata" [("fileref", src)] 
diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs
index 0f2a2b5dc..8a654e3c9 100644
--- a/src/Text/Pandoc/Writers/HTML.hs
+++ b/src/Text/Pandoc/Writers/HTML.hs
@@ -32,7 +32,7 @@ module Text.Pandoc.Writers.HTML (
                                 ) where
 import Text.Pandoc.Definition
 import Text.Pandoc.Shared
-import Text.Pandoc.Entities ( encodeEntities, stringToSGML )
+import Text.Pandoc.Entities ( encodeEntities )
 import Text.Regex ( mkRegex, matchRegex )
 import Numeric ( showHex )
 import Data.Char ( ord, toLower )
@@ -127,11 +127,11 @@ htmlHeader opts (Meta title authors date) =
                       then empty 
                       else selfClosingTag "meta" [("name", "author"), 
                            ("content", 
-                            joinWithSep ", " (map stringToSGML authors))]  
+                            joinWithSep ", " (map encodeEntities authors))]  
       datetext = if (date == "")
                     then empty 
                     else selfClosingTag "meta" [("name", "date"),
-                         ("content", stringToSGML date)] in
+                         ("content", encodeEntities date)] in
   text (writerHeader opts) $$ authortext $$ datetext $$ titletext $$ 
   text "</head>\n<body>"
 
@@ -248,20 +248,18 @@ inlineToHtml opts (TeX str) = text $ encodeEntities str
 inlineToHtml opts (HtmlInline str) = text str
 inlineToHtml opts (LineBreak) = selfClosingTag "br" []
 inlineToHtml opts Space = space
-inlineToHtml opts (Link txt (Src src tit)) = 
-  let title = stringToSGML tit in
+inlineToHtml opts (Link txt (Src src title)) = 
   if (isPrefixOf "mailto:" src)
      then obfuscateLink opts txt src 
-     else inTags False "a" ([("href", encodeEntities src)] ++ 
-          if null tit then [] else [("title", title)]) 
+     else inTags False "a" ([("href", src)] ++ 
+          if null title then [] else [("title", title)]) 
           (inlineListToHtml opts txt)
 inlineToHtml opts (Link txt (Ref ref)) = 
   char '[' <> (inlineListToHtml opts txt) <> text "][" <> 
   (inlineListToHtml opts ref) <> char ']'
   -- this is what markdown does, for better or worse
-inlineToHtml opts (Image alt (Src source tit)) = 
-  let title = stringToSGML tit
-      alternate = render $ inlineListToHtml opts alt in 
+inlineToHtml opts (Image alt (Src source title)) = 
+  let alternate = render $ inlineListToHtml opts alt in 
   selfClosingTag "img" $ [("src", source)] ++
   (if null alternate then [] else [("alt", alternate)]) ++
   [("title", title)]  -- note:  null title is included, as in Markdown.pl