Fixed regression in CSS parsing with --self-contained.

In 1b44acf0c5 we replaced some
hackish CSS parsing with css-text, which I thought was a complete
CSS parser.  It turns out that it is very buggy, which results
in lots of things being silently dropped from CSS when
`--self-contained` is used (#2224).

This commit replaces the use of css-text with a small but
more principled css preprocessor, which only removes whitespace
and replaces URLs with base 64 data when possible.

Closes #2224.
This commit is contained in:
John MacFarlane 2015-06-28 11:51:35 -07:00
parent 2768d1c2d2
commit ed9a118b54
2 changed files with 44 additions and 37 deletions

View file

@ -262,7 +262,6 @@ Library
temporary >= 1.1 && < 1.3,
blaze-html >= 0.5 && < 0.9,
blaze-markup >= 0.5.1 && < 0.8,
css-text >= 0.1.2 && < 0.3,
yaml >= 0.8.8.2 && < 0.9,
scientific >= 0.2 && < 0.4,
vector >= 0.10 && < 0.11,

View file

@ -40,20 +40,16 @@ import System.FilePath (takeExtension, takeDirectory, (</>))
import Data.Char (toLower, isAscii, isAlphaNum)
import Codec.Compression.GZip as Gzip
import qualified Data.ByteString.Lazy as L
import Text.Pandoc.Shared (renderTags', err, fetchItem')
import Text.Pandoc.Shared (renderTags', err, fetchItem', warn, trim)
import Text.Pandoc.MediaBag (MediaBag)
import Text.Pandoc.MIME (MimeType)
import Text.Pandoc.UTF8 (toString)
import Text.Pandoc.Options (WriterOptions(..))
import Data.List (isPrefixOf)
import Control.Applicative
import Text.CSS.Parse (parseNestedBlocks, NestedBlock(..))
import Text.CSS.Render (renderNestedBlocks)
import Data.Text.Encoding (decodeUtf8, encodeUtf8)
import qualified Data.Text as T
import Data.Text (Text)
import Data.Text.Lazy (toStrict)
import Data.Text.Lazy.Builder (toLazyText)
import Text.Parsec (runParserT, ParsecT)
import qualified Text.Parsec as P
import Control.Monad.Trans (lift)
isOk :: Char -> Bool
isOk c = isAscii c && isAlphaNum c
@ -94,40 +90,52 @@ convertTag media sourceURL t@(TagOpen "link" as) =
return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"])
convertTag _ _ t = return t
-- NOTE: This is really crude, it doesn't respect CSS comments.
cssURLs :: MediaBag -> Maybe String -> FilePath -> ByteString
-> IO ByteString
cssURLs media sourceURL d orig = do
case parseNestedBlocks (decodeUtf8 orig) of
Left _err -> return orig
Right bs -> (encodeUtf8 . toStrict . toLazyText . renderNestedBlocks)
<$> mapM (handleCSSUrls media sourceURL d) bs
res <- runParserT (parseCSSUrls media sourceURL d) () "css" orig
case res of
Left e -> warn ("Could not parse CSS: " ++ show e) >> return orig
Right bs -> return bs
handleCSSUrls :: MediaBag -> Maybe String -> FilePath -> NestedBlock
-> IO NestedBlock
handleCSSUrls media sourceURL d (NestedBlock t bs) =
NestedBlock t <$> mapM (handleCSSUrls media sourceURL d) bs
handleCSSUrls media sourceURL d (LeafBlock (selector, attrs)) = do
attrs' <- mapM (handleCSSAttr media sourceURL d) attrs
return (LeafBlock (selector, attrs'))
parseCSSUrls :: MediaBag -> Maybe String -> FilePath
-> ParsecT ByteString () IO ByteString
parseCSSUrls media sourceURL d = B.concat <$> P.many
(pCSSWhite <|> pCSSComment <|> pCSSUrl media sourceURL d <|> pCSSOther)
handleCSSAttr :: MediaBag -> Maybe String -> FilePath -> (Text, Text)
-> IO (Text, Text)
handleCSSAttr media sourceURL d (key, val) =
if "url(" `T.isPrefixOf` val
then do
let url = T.unpack $ dropParens $ T.drop 3 val
case url of
'#':_ -> return (key, val)
'd':'a':'t':'a':':':_ -> return (key, val)
_ -> do
let url' = if isURI url then url else d </> url
enc <- getDataURI media sourceURL "" url'
return (key, T.pack enc)
else return (key, val)
pCSSWhite :: ParsecT ByteString () IO ByteString
pCSSWhite = P.space >> P.spaces >> return B.empty
pCSSComment :: ParsecT ByteString () IO ByteString
pCSSComment = P.try $ do
P.string "/*"
P.manyTill P.anyChar (P.try (P.string "*/"))
return B.empty
pCSSOther :: ParsecT ByteString () IO ByteString
pCSSOther = do
(B.pack <$> P.many1 (P.noneOf "u/ \n\r\t")) <|>
(B.singleton <$> P.char 'u') <|>
(B.singleton <$> P.char '/')
pCSSUrl :: MediaBag -> Maybe String -> FilePath
-> ParsecT ByteString () IO ByteString
pCSSUrl media sourceURL d = P.try $ do
P.string "url("
P.spaces
quote <- P.option Nothing (Just <$> P.oneOf "\"'")
url <- P.manyTill P.anyChar (maybe (P.lookAhead (P.char ')')) P.char quote)
P.spaces
P.char ')'
let fallback = B.pack ("url(" ++ maybe "" (:[]) quote ++ trim url ++
maybe "" (:[]) quote ++ ")")
case trim url of
'#':_ -> return fallback
'd':'a':'t':'a':':':_ -> return fallback
u -> do let url' = if isURI u then u else d </> u
enc <- lift $ getDataURI media sourceURL "" url'
return (B.pack enc)
dropParens :: Text -> Text
dropParens = T.dropAround (`elem` ['(',')','"','\'',' ','\t','\n','\r'])
getDataURI :: MediaBag -> Maybe String -> MimeType -> String
-> IO String