HTML reader: Support base tag.

We only support the href attribute, as there's no place for
"target" in the Pandoc document model for links.

Added HTML reader test module, with tests for this feature.

Closes #1751.
This commit is contained in:
John MacFarlane 2015-05-13 20:39:01 -07:00
parent 75cfa7b462
commit e06810499e
4 changed files with 58 additions and 7 deletions

View file

@ -481,6 +481,7 @@ Test-Suite test-pandoc
Tests.Shared Tests.Shared
Tests.Walk Tests.Walk
Tests.Readers.LaTeX Tests.Readers.LaTeX
Tests.Readers.HTML
Tests.Readers.Markdown Tests.Readers.Markdown
Tests.Readers.Org Tests.Readers.Org
Tests.Readers.RST Tests.Readers.RST

View file

@ -62,7 +62,7 @@ import Debug.Trace (trace)
import Text.TeXMath (readMathML, writeTeX) import Text.TeXMath (readMathML, writeTeX)
import Data.Default (Default (..), def) import Data.Default (Default (..), def)
import Control.Monad.Reader (Reader,ask, asks, local, runReader) import Control.Monad.Reader (Reader,ask, asks, local, runReader)
import Network.URI (isURI)
import Text.Pandoc.Error import Text.Pandoc.Error
import Text.Parsec.Error import Text.Parsec.Error
@ -74,7 +74,8 @@ readHtml :: ReaderOptions -- ^ Reader options
-> Either PandocError Pandoc -> Either PandocError Pandoc
readHtml opts inp = readHtml opts inp =
mapLeft (ParseFailure . getError) . flip runReader def $ mapLeft (ParseFailure . getError) . flip runReader def $
runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags runParserT parseDoc (HTMLState def{ stateOptions = opts } [] Nothing)
"source" tags
where tags = stripPrefixes . canonicalizeTags $ where tags = stripPrefixes . canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp parseTagsOptions parseOptions{ optTagPosition = True } inp
parseDoc = do parseDoc = do
@ -98,7 +99,8 @@ replaceNotes' x = return x
data HTMLState = data HTMLState =
HTMLState HTMLState
{ parserState :: ParserState, { parserState :: ParserState,
noteTable :: [(String, Blocks)] noteTable :: [(String, Blocks)],
baseHref :: Maybe String
} }
data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
@ -120,7 +122,7 @@ pBody :: TagParser Blocks
pBody = pInTags "body" block pBody = pInTags "body" block
pHead :: TagParser Blocks pHead :: TagParser Blocks
pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag) pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag)
where pTitle = pInTags "title" inline >>= setTitle . trimInlines where pTitle = pInTags "title" inline >>= setTitle . trimInlines
setTitle t = mempty <$ (updateState $ B.setMeta "title" t) setTitle t = mempty <$ (updateState $ B.setMeta "title" t)
pMetaTag = do pMetaTag = do
@ -132,6 +134,17 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag)
let content = fromAttrib "content" mt let content = fromAttrib "content" mt
updateState $ B.setMeta name (B.text content) updateState $ B.setMeta name (B.text content)
return mempty return mempty
pBaseTag = do
bt <- pSatisfy (~== TagOpen "base" [])
let baseH = fromAttrib "href" bt
if null baseH
then return mempty
else do
let baseH' = case reverse baseH of
'/':_ -> baseH
_ -> baseH ++ "/"
updateState $ \st -> st{ baseHref = Just baseH' }
return mempty
block :: TagParser Blocks block :: TagParser Blocks
block = do block = do
@ -566,7 +579,11 @@ pAnchor = try $ do
pRelLink :: TagParser Inlines pRelLink :: TagParser Inlines
pRelLink = try $ do pRelLink = try $ do
tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href")) tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
let url = fromAttrib "href" tag mbBaseHref <- baseHref <$> getState
let url' = fromAttrib "href" tag
let url = case (isURI url', mbBaseHref) of
(False, Just h) -> h ++ url'
_ -> url'
let title = fromAttrib "title" tag let title = fromAttrib "title" tag
let uid = fromAttrib "id" tag let uid = fromAttrib "id" tag
let spanC = case uid of let spanC = case uid of
@ -578,7 +595,11 @@ pRelLink = try $ do
pImage :: TagParser Inlines pImage :: TagParser Inlines
pImage = do pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src") tag <- pSelfClosing (=="img") (isJust . lookup "src")
let url = fromAttrib "src" tag mbBaseHref <- baseHref <$> getState
let url' = fromAttrib "src" tag
let url = case (isURI url', mbBaseHref) of
(False, Just h) -> h ++ url'
_ -> url'
let title = fromAttrib "title" tag let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag let alt = fromAttrib "alt" tag
return $ B.image (escapeURI url) title (B.text alt) return $ B.image (escapeURI url) title (B.text alt)
@ -945,7 +966,7 @@ instance HasReaderOptions HTMLState where
extractReaderOptions = extractReaderOptions . parserState extractReaderOptions = extractReaderOptions . parserState
instance Default HTMLState where instance Default HTMLState where
def = HTMLState def [] def = HTMLState def [] Nothing
instance HasMeta HTMLState where instance HasMeta HTMLState where
setMeta s b st = st {parserState = setMeta s b $ parserState st} setMeta s b st = st {parserState = setMeta s b $ parserState st}

View file

@ -0,0 +1,27 @@
{-# LANGUAGE OverloadedStrings #-}
module Tests.Readers.HTML (tests) where
import Text.Pandoc.Definition
import Test.Framework
import Tests.Helpers
import Tests.Arbitrary()
import Text.Pandoc.Builder
import Text.Pandoc
import Text.Pandoc.Error
html :: String -> Pandoc
html = handleError . readHtml def
tests :: [Test]
tests = [ testGroup "base tag"
[ test html "simple" $
"<head><base href=\"http://www.w3schools.com/images\" ></head><body><img src=\"stickman.gif\" alt=\"Stickman\"></head>" =?>
plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman"))
, test html "slash at end of base" $
"<head><base href=\"http://www.w3schools.com/images/\" ></head><body><img src=\"stickman.gif\" alt=\"Stickman\"></head>" =?>
plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman"))
, test html "absolute URL" $
"<head><base href=\"http://www.w3schools.com/images/\" ></head><body><img src=\"http://example.com/stickman.gif\" alt=\"Stickman\"></head>" =?>
plain (image "http://example.com/stickman.gif" "" (text "Stickman"))
]
]

View file

@ -8,6 +8,7 @@ import qualified Tests.Old
import qualified Tests.Readers.LaTeX import qualified Tests.Readers.LaTeX
import qualified Tests.Readers.Markdown import qualified Tests.Readers.Markdown
import qualified Tests.Readers.Org import qualified Tests.Readers.Org
import qualified Tests.Readers.HTML
import qualified Tests.Readers.RST import qualified Tests.Readers.RST
import qualified Tests.Readers.Docx import qualified Tests.Readers.Docx
import qualified Tests.Readers.Txt2Tags import qualified Tests.Readers.Txt2Tags
@ -46,6 +47,7 @@ tests = [ testGroup "Old" Tests.Old.tests
, testGroup "Readers" , testGroup "Readers"
[ testGroup "LaTeX" Tests.Readers.LaTeX.tests [ testGroup "LaTeX" Tests.Readers.LaTeX.tests
, testGroup "Markdown" Tests.Readers.Markdown.tests , testGroup "Markdown" Tests.Readers.Markdown.tests
, testGroup "HTML" Tests.Readers.HTML.tests
, testGroup "Org" Tests.Readers.Org.tests , testGroup "Org" Tests.Readers.Org.tests
, testGroup "RST" Tests.Readers.RST.tests , testGroup "RST" Tests.Readers.RST.tests
, testGroup "Docx" Tests.Readers.Docx.tests , testGroup "Docx" Tests.Readers.Docx.tests