HTML reader: Support base tag.

We only support the href attribute, as there's no place for
"target" in the Pandoc document model for links.

Added HTML reader test module, with tests for this feature.

Closes #1751.
This commit is contained in:
John MacFarlane 2015-05-13 20:39:01 -07:00
parent 75cfa7b462
commit e06810499e
4 changed files with 58 additions and 7 deletions

View file

@ -481,6 +481,7 @@ Test-Suite test-pandoc
Tests.Shared
Tests.Walk
Tests.Readers.LaTeX
Tests.Readers.HTML
Tests.Readers.Markdown
Tests.Readers.Org
Tests.Readers.RST

View file

@ -62,7 +62,7 @@ import Debug.Trace (trace)
import Text.TeXMath (readMathML, writeTeX)
import Data.Default (Default (..), def)
import Control.Monad.Reader (Reader,ask, asks, local, runReader)
import Network.URI (isURI)
import Text.Pandoc.Error
import Text.Parsec.Error
@ -74,7 +74,8 @@ readHtml :: ReaderOptions -- ^ Reader options
-> Either PandocError Pandoc
readHtml opts inp =
mapLeft (ParseFailure . getError) . flip runReader def $
runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags
runParserT parseDoc (HTMLState def{ stateOptions = opts } [] Nothing)
"source" tags
where tags = stripPrefixes . canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
parseDoc = do
@ -98,7 +99,8 @@ replaceNotes' x = return x
data HTMLState =
HTMLState
{ parserState :: ParserState,
noteTable :: [(String, Blocks)]
noteTable :: [(String, Blocks)],
baseHref :: Maybe String
}
data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
@ -120,7 +122,7 @@ pBody :: TagParser Blocks
pBody = pInTags "body" block
pHead :: TagParser Blocks
pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag)
pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag)
where pTitle = pInTags "title" inline >>= setTitle . trimInlines
setTitle t = mempty <$ (updateState $ B.setMeta "title" t)
pMetaTag = do
@ -132,6 +134,17 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag)
let content = fromAttrib "content" mt
updateState $ B.setMeta name (B.text content)
return mempty
pBaseTag = do
bt <- pSatisfy (~== TagOpen "base" [])
let baseH = fromAttrib "href" bt
if null baseH
then return mempty
else do
let baseH' = case reverse baseH of
'/':_ -> baseH
_ -> baseH ++ "/"
updateState $ \st -> st{ baseHref = Just baseH' }
return mempty
block :: TagParser Blocks
block = do
@ -566,7 +579,11 @@ pAnchor = try $ do
pRelLink :: TagParser Inlines
pRelLink = try $ do
tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
let url = fromAttrib "href" tag
mbBaseHref <- baseHref <$> getState
let url' = fromAttrib "href" tag
let url = case (isURI url', mbBaseHref) of
(False, Just h) -> h ++ url'
_ -> url'
let title = fromAttrib "title" tag
let uid = fromAttrib "id" tag
let spanC = case uid of
@ -578,7 +595,11 @@ pRelLink = try $ do
pImage :: TagParser Inlines
pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
let url = fromAttrib "src" tag
mbBaseHref <- baseHref <$> getState
let url' = fromAttrib "src" tag
let url = case (isURI url', mbBaseHref) of
(False, Just h) -> h ++ url'
_ -> url'
let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag
return $ B.image (escapeURI url) title (B.text alt)
@ -945,7 +966,7 @@ instance HasReaderOptions HTMLState where
extractReaderOptions = extractReaderOptions . parserState
instance Default HTMLState where
def = HTMLState def []
def = HTMLState def [] Nothing
instance HasMeta HTMLState where
setMeta s b st = st {parserState = setMeta s b $ parserState st}

View file

@ -0,0 +1,27 @@
{-# LANGUAGE OverloadedStrings #-}
module Tests.Readers.HTML (tests) where
import Text.Pandoc.Definition
import Test.Framework
import Tests.Helpers
import Tests.Arbitrary()
import Text.Pandoc.Builder
import Text.Pandoc
import Text.Pandoc.Error
html :: String -> Pandoc
html = handleError . readHtml def
tests :: [Test]
tests = [ testGroup "base tag"
[ test html "simple" $
"<head><base href=\"http://www.w3schools.com/images\" ></head><body><img src=\"stickman.gif\" alt=\"Stickman\"></head>" =?>
plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman"))
, test html "slash at end of base" $
"<head><base href=\"http://www.w3schools.com/images/\" ></head><body><img src=\"stickman.gif\" alt=\"Stickman\"></head>" =?>
plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman"))
, test html "absolute URL" $
"<head><base href=\"http://www.w3schools.com/images/\" ></head><body><img src=\"http://example.com/stickman.gif\" alt=\"Stickman\"></head>" =?>
plain (image "http://example.com/stickman.gif" "" (text "Stickman"))
]
]

View file

@ -8,6 +8,7 @@ import qualified Tests.Old
import qualified Tests.Readers.LaTeX
import qualified Tests.Readers.Markdown
import qualified Tests.Readers.Org
import qualified Tests.Readers.HTML
import qualified Tests.Readers.RST
import qualified Tests.Readers.Docx
import qualified Tests.Readers.Txt2Tags
@ -46,6 +47,7 @@ tests = [ testGroup "Old" Tests.Old.tests
, testGroup "Readers"
[ testGroup "LaTeX" Tests.Readers.LaTeX.tests
, testGroup "Markdown" Tests.Readers.Markdown.tests
, testGroup "HTML" Tests.Readers.HTML.tests
, testGroup "Org" Tests.Readers.Org.tests
, testGroup "RST" Tests.Readers.RST.tests
, testGroup "Docx" Tests.Readers.Docx.tests