+ Simplified entity handling by removing stringToSGML from Entities.hs.

It is no longer needed now that all entities are processed in the markdown 
  and HTML readers.  All calls to stringToSGML have been replaced by calls
  to encodeEntities.
+ Since inTag's attribute handling already encodes entities, 
  calls to encodeEntities are no longer needed for attribute values, so
  they've been removed.
+ The HTML and Markdown readers now call decodeEntities on all raw
  strings (e.g. authors, dates, link titles), to ensure that no unprocessed
  entities are included in the native representation of the document. 
  (In the HTML reader, most of this work is done by a change in
  extractAttributeName.)
+ The result is a small speed improvement (around 5% on my benchmark)
  and cleaner code.


git-svn-id: https://pandoc.googlecode.com/svn/trunk@519 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
fiddlosopher 2007-01-28 00:04:43 +00:00
parent 21484713c6
commit dc6925542c
6 changed files with 25 additions and 43 deletions

View file

@ -34,7 +34,6 @@ module Text.Pandoc.Entities (
encodeEntities,
decodeEntities,
escapeSGMLChar,
stringToSGML,
characterEntity
) where
import Data.Char ( chr, ord )
@ -115,23 +114,6 @@ decodeEntities str =
Left err -> error $ "\nError: " ++ show err
Right result -> result
-- | Escape string for SGML, preserving entity references.
stringToSGML :: String -> String
stringToSGML str =
let regular = do
str <- many1 (satisfy (not . needsEscaping))
return str
special = do
notFollowedBy characterEntity
c <- anyChar
return $ escapeSGMLChar c
entity = do
ent <- manyTill anyChar (char ';')
return (ent ++ ";") in
case parse (many (regular <|> special <|> entity)) str str of
Left err -> error $ "\nError: " ++ show err
Right result -> concat result
entityTable :: [(String, Char)]
entityTable = [
("&quot;", chr 34),

View file

@ -445,7 +445,9 @@ extractAttribute name [] = Nothing
extractAttribute name ((attrName, contents):rest) =
let name' = map toLower name
attrName' = map toLower attrName in
if (attrName' == name') then Just contents else extractAttribute name rest
if (attrName' == name')
then Just (decodeEntities contents)
else extractAttribute name rest
link = try (do
(tag, attributes) <- htmlTag "a"

View file

@ -42,7 +42,7 @@ import Text.Pandoc.Readers.HTML ( rawHtmlBlock,
anyHtmlTag, anyHtmlEndTag,
htmlEndTag, extractTagType,
htmlBlockElement )
import Text.Pandoc.Entities ( characterEntity )
import Text.Pandoc.Entities ( characterEntity, decodeEntities )
import Text.ParserCombinators.Parsec
-- | Read markdown from an input string and return a Pandoc document.
@ -144,14 +144,14 @@ authorsLine = try (do
skipSpaces
authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
newline
return (map removeLeadingTrailingSpace authors))
return (map (decodeEntities . removeLeadingTrailingSpace) authors))
dateLine = try (do
char '%'
skipSpaces
date <- many (noneOf "\n")
newline
return (removeTrailingSpace date))
return (decodeEntities $ removeTrailingSpace date))
titleBlock = try (do
failIfStrict
@ -894,7 +894,7 @@ titleWith startChar endChar = try (do
char endChar
skipSpaces
notFollowedBy (noneOf ")\n")))
return tit)
return $ decodeEntities tit)
title = choice [ titleWith '(' ')',
titleWith '"' '"',

View file

@ -72,7 +72,7 @@ module Text.Pandoc.Shared (
) where
import Text.Pandoc.Definition
import Text.ParserCombinators.Parsec as Parsec
import Text.Pandoc.Entities ( decodeEntities, encodeEntities, stringToSGML )
import Text.Pandoc.Entities ( decodeEntities, encodeEntities )
import Text.PrettyPrint.HughesPJ as PP ( text, char, (<>),
($$), nest, Doc, isEmpty )
import Data.Char ( toLower, ord )
@ -524,8 +524,8 @@ replaceRefLinksInline keytable other = other
-- | Return a text object with a string of formatted SGML attributes.
attributeList :: [(String, String)] -> Doc
attributeList = text . concatMap
(\(a, b) -> " " ++ stringToSGML a ++ "=\"" ++
stringToSGML b ++ "\"")
(\(a, b) -> " " ++ encodeEntities a ++ "=\"" ++
encodeEntities b ++ "\"")
-- | Put the supplied contents between start and end tags of tagType,
-- with specified attributes and (if specified) indentation.

View file

@ -32,7 +32,7 @@ module Text.Pandoc.Writers.Docbook (
) where
import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Text.Pandoc.Entities ( encodeEntities, stringToSGML )
import Text.Pandoc.Entities ( encodeEntities )
import Data.Char ( toLower, ord )
import Data.List ( isPrefixOf, partition, drop )
import Text.PrettyPrint.HughesPJ hiding ( Str )
@ -64,8 +64,8 @@ authorToDocbook name = inTagsIndented "author" $
then -- last name first
let (lastname, rest) = break (==',') name
firstname = removeLeadingSpace rest in
inTagsSimple "firstname" (text $ stringToSGML firstname) <>
inTagsSimple "surname" (text $ stringToSGML lastname)
inTagsSimple "firstname" (text $ encodeEntities firstname) <>
inTagsSimple "surname" (text $ encodeEntities lastname)
else -- last name last
let namewords = words name
lengthname = length namewords
@ -73,8 +73,8 @@ authorToDocbook name = inTagsIndented "author" $
0 -> ("","")
1 -> ("", name)
n -> (joinWithSep " " (take (n-1) namewords), last namewords) in
inTagsSimple "firstname" (text $ stringToSGML firstname) $$
inTagsSimple "surname" (text $ stringToSGML lastname)
inTagsSimple "firstname" (text $ encodeEntities firstname) $$
inTagsSimple "surname" (text $ encodeEntities lastname)
-- | Convert Pandoc document to string in Docbook format.
writeDocbook :: WriterOptions -> Pandoc -> String
@ -86,7 +86,7 @@ writeDocbook opts (Pandoc (Meta title authors date) blocks) =
then inTagsIndented "articleinfo" $
(inTagsSimple "title" (wrap opts title)) $$
(vcat (map authorToDocbook authors)) $$
(inTagsSimple "date" (text $ stringToSGML date))
(inTagsSimple "date" (text $ encodeEntities date))
else empty
blocks' = replaceReferenceLinks blocks
(noteBlocks, blocks'') = partition isNoteBlock blocks'
@ -227,7 +227,7 @@ inlineToDocbook opts (Image alt (Src src tit)) =
then empty
else inTagsIndented "objectinfo" $
inTagsIndented "title"
(text $ stringToSGML tit) in
(text $ encodeEntities tit) in
inTagsIndented "inlinemediaobject" $
inTagsIndented "imageobject" $
titleDoc $$ selfClosingTag "imagedata" [("fileref", src)]

View file

@ -32,7 +32,7 @@ module Text.Pandoc.Writers.HTML (
) where
import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Text.Pandoc.Entities ( encodeEntities, stringToSGML )
import Text.Pandoc.Entities ( encodeEntities )
import Text.Regex ( mkRegex, matchRegex )
import Numeric ( showHex )
import Data.Char ( ord, toLower )
@ -127,11 +127,11 @@ htmlHeader opts (Meta title authors date) =
then empty
else selfClosingTag "meta" [("name", "author"),
("content",
joinWithSep ", " (map stringToSGML authors))]
joinWithSep ", " (map encodeEntities authors))]
datetext = if (date == "")
then empty
else selfClosingTag "meta" [("name", "date"),
("content", stringToSGML date)] in
("content", encodeEntities date)] in
text (writerHeader opts) $$ authortext $$ datetext $$ titletext $$
text "</head>\n<body>"
@ -248,20 +248,18 @@ inlineToHtml opts (TeX str) = text $ encodeEntities str
inlineToHtml opts (HtmlInline str) = text str
inlineToHtml opts (LineBreak) = selfClosingTag "br" []
inlineToHtml opts Space = space
inlineToHtml opts (Link txt (Src src tit)) =
let title = stringToSGML tit in
inlineToHtml opts (Link txt (Src src title)) =
if (isPrefixOf "mailto:" src)
then obfuscateLink opts txt src
else inTags False "a" ([("href", encodeEntities src)] ++
if null tit then [] else [("title", title)])
else inTags False "a" ([("href", src)] ++
if null title then [] else [("title", title)])
(inlineListToHtml opts txt)
inlineToHtml opts (Link txt (Ref ref)) =
char '[' <> (inlineListToHtml opts txt) <> text "][" <>
(inlineListToHtml opts ref) <> char ']'
-- this is what markdown does, for better or worse
inlineToHtml opts (Image alt (Src source tit)) =
let title = stringToSGML tit
alternate = render $ inlineListToHtml opts alt in
inlineToHtml opts (Image alt (Src source title)) =
let alternate = render $ inlineListToHtml opts alt in
selfClosingTag "img" $ [("src", source)] ++
(if null alternate then [] else [("alt", alternate)]) ++
[("title", title)] -- note: null title is included, as in Markdown.pl