HTML reader: Support base tag.
We only support the href attribute, as there's no place for "target" in the Pandoc document model for links. Added HTML reader test module, with tests for this feature. Closes #1751.
This commit is contained in:
parent
75cfa7b462
commit
e06810499e
4 changed files with 58 additions and 7 deletions
|
@ -481,6 +481,7 @@ Test-Suite test-pandoc
|
|||
Tests.Shared
|
||||
Tests.Walk
|
||||
Tests.Readers.LaTeX
|
||||
Tests.Readers.HTML
|
||||
Tests.Readers.Markdown
|
||||
Tests.Readers.Org
|
||||
Tests.Readers.RST
|
||||
|
|
|
@ -62,7 +62,7 @@ import Debug.Trace (trace)
|
|||
import Text.TeXMath (readMathML, writeTeX)
|
||||
import Data.Default (Default (..), def)
|
||||
import Control.Monad.Reader (Reader,ask, asks, local, runReader)
|
||||
|
||||
import Network.URI (isURI)
|
||||
import Text.Pandoc.Error
|
||||
|
||||
import Text.Parsec.Error
|
||||
|
@ -74,7 +74,8 @@ readHtml :: ReaderOptions -- ^ Reader options
|
|||
-> Either PandocError Pandoc
|
||||
readHtml opts inp =
|
||||
mapLeft (ParseFailure . getError) . flip runReader def $
|
||||
runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags
|
||||
runParserT parseDoc (HTMLState def{ stateOptions = opts } [] Nothing)
|
||||
"source" tags
|
||||
where tags = stripPrefixes . canonicalizeTags $
|
||||
parseTagsOptions parseOptions{ optTagPosition = True } inp
|
||||
parseDoc = do
|
||||
|
@ -98,7 +99,8 @@ replaceNotes' x = return x
|
|||
data HTMLState =
|
||||
HTMLState
|
||||
{ parserState :: ParserState,
|
||||
noteTable :: [(String, Blocks)]
|
||||
noteTable :: [(String, Blocks)],
|
||||
baseHref :: Maybe String
|
||||
}
|
||||
|
||||
data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
|
||||
|
@ -120,7 +122,7 @@ pBody :: TagParser Blocks
|
|||
pBody = pInTags "body" block
|
||||
|
||||
pHead :: TagParser Blocks
|
||||
pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag)
|
||||
pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag)
|
||||
where pTitle = pInTags "title" inline >>= setTitle . trimInlines
|
||||
setTitle t = mempty <$ (updateState $ B.setMeta "title" t)
|
||||
pMetaTag = do
|
||||
|
@ -132,6 +134,17 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag)
|
|||
let content = fromAttrib "content" mt
|
||||
updateState $ B.setMeta name (B.text content)
|
||||
return mempty
|
||||
pBaseTag = do
|
||||
bt <- pSatisfy (~== TagOpen "base" [])
|
||||
let baseH = fromAttrib "href" bt
|
||||
if null baseH
|
||||
then return mempty
|
||||
else do
|
||||
let baseH' = case reverse baseH of
|
||||
'/':_ -> baseH
|
||||
_ -> baseH ++ "/"
|
||||
updateState $ \st -> st{ baseHref = Just baseH' }
|
||||
return mempty
|
||||
|
||||
block :: TagParser Blocks
|
||||
block = do
|
||||
|
@ -566,7 +579,11 @@ pAnchor = try $ do
|
|||
pRelLink :: TagParser Inlines
|
||||
pRelLink = try $ do
|
||||
tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
|
||||
let url = fromAttrib "href" tag
|
||||
mbBaseHref <- baseHref <$> getState
|
||||
let url' = fromAttrib "href" tag
|
||||
let url = case (isURI url', mbBaseHref) of
|
||||
(False, Just h) -> h ++ url'
|
||||
_ -> url'
|
||||
let title = fromAttrib "title" tag
|
||||
let uid = fromAttrib "id" tag
|
||||
let spanC = case uid of
|
||||
|
@ -578,7 +595,11 @@ pRelLink = try $ do
|
|||
pImage :: TagParser Inlines
|
||||
pImage = do
|
||||
tag <- pSelfClosing (=="img") (isJust . lookup "src")
|
||||
let url = fromAttrib "src" tag
|
||||
mbBaseHref <- baseHref <$> getState
|
||||
let url' = fromAttrib "src" tag
|
||||
let url = case (isURI url', mbBaseHref) of
|
||||
(False, Just h) -> h ++ url'
|
||||
_ -> url'
|
||||
let title = fromAttrib "title" tag
|
||||
let alt = fromAttrib "alt" tag
|
||||
return $ B.image (escapeURI url) title (B.text alt)
|
||||
|
@ -945,7 +966,7 @@ instance HasReaderOptions HTMLState where
|
|||
extractReaderOptions = extractReaderOptions . parserState
|
||||
|
||||
instance Default HTMLState where
|
||||
def = HTMLState def []
|
||||
def = HTMLState def [] Nothing
|
||||
|
||||
instance HasMeta HTMLState where
|
||||
setMeta s b st = st {parserState = setMeta s b $ parserState st}
|
||||
|
|
27
tests/Tests/Readers/HTML.hs
Normal file
27
tests/Tests/Readers/HTML.hs
Normal file
|
@ -0,0 +1,27 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
module Tests.Readers.HTML (tests) where
|
||||
|
||||
import Text.Pandoc.Definition
|
||||
import Test.Framework
|
||||
import Tests.Helpers
|
||||
import Tests.Arbitrary()
|
||||
import Text.Pandoc.Builder
|
||||
import Text.Pandoc
|
||||
import Text.Pandoc.Error
|
||||
|
||||
html :: String -> Pandoc
|
||||
html = handleError . readHtml def
|
||||
|
||||
tests :: [Test]
|
||||
tests = [ testGroup "base tag"
|
||||
[ test html "simple" $
|
||||
"<head><base href=\"http://www.w3schools.com/images\" ></head><body><img src=\"stickman.gif\" alt=\"Stickman\"></head>" =?>
|
||||
plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman"))
|
||||
, test html "slash at end of base" $
|
||||
"<head><base href=\"http://www.w3schools.com/images/\" ></head><body><img src=\"stickman.gif\" alt=\"Stickman\"></head>" =?>
|
||||
plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman"))
|
||||
, test html "absolute URL" $
|
||||
"<head><base href=\"http://www.w3schools.com/images/\" ></head><body><img src=\"http://example.com/stickman.gif\" alt=\"Stickman\"></head>" =?>
|
||||
plain (image "http://example.com/stickman.gif" "" (text "Stickman"))
|
||||
]
|
||||
]
|
|
@ -8,6 +8,7 @@ import qualified Tests.Old
|
|||
import qualified Tests.Readers.LaTeX
|
||||
import qualified Tests.Readers.Markdown
|
||||
import qualified Tests.Readers.Org
|
||||
import qualified Tests.Readers.HTML
|
||||
import qualified Tests.Readers.RST
|
||||
import qualified Tests.Readers.Docx
|
||||
import qualified Tests.Readers.Txt2Tags
|
||||
|
@ -46,6 +47,7 @@ tests = [ testGroup "Old" Tests.Old.tests
|
|||
, testGroup "Readers"
|
||||
[ testGroup "LaTeX" Tests.Readers.LaTeX.tests
|
||||
, testGroup "Markdown" Tests.Readers.Markdown.tests
|
||||
, testGroup "HTML" Tests.Readers.HTML.tests
|
||||
, testGroup "Org" Tests.Readers.Org.tests
|
||||
, testGroup "RST" Tests.Readers.RST.tests
|
||||
, testGroup "Docx" Tests.Readers.Docx.tests
|
||||
|
|
Loading…
Reference in a new issue