MediaWiki reader: Improved efficiency with raw html tags.
Parse one tag, then use a case statement.
This commit is contained in:
parent
880af86556
commit
e3abc2595f
1 changed files with 59 additions and 78 deletions
|
@ -30,8 +30,6 @@ Conversion of mediawiki text to 'Pandoc' document.
|
||||||
-}
|
-}
|
||||||
{-
|
{-
|
||||||
TODO:
|
TODO:
|
||||||
_ make tag parsers more efficient by parsing one HTML tag, checking it,
|
|
||||||
then...
|
|
||||||
_ support HTML lists
|
_ support HTML lists
|
||||||
_ support list style attributes and start values in ol lists, also
|
_ support list style attributes and start values in ol lists, also
|
||||||
value attribute on li
|
value attribute on li
|
||||||
|
@ -85,23 +83,27 @@ spaceChars = " \n\t"
|
||||||
sym :: String -> MWParser ()
|
sym :: String -> MWParser ()
|
||||||
sym s = () <$ try (string s)
|
sym s = () <$ try (string s)
|
||||||
|
|
||||||
|
isBlockTag' :: Tag String -> Bool
|
||||||
|
isBlockTag' tag@(TagOpen t _) = isBlockTag tag ||
|
||||||
|
t == "haskell" || t == "syntaxhighlight"
|
||||||
|
isBlockTag' tag@(TagClose t) = isBlockTag tag ||
|
||||||
|
t == "haskell" || t == "syntaxhighlight"
|
||||||
|
isBlockTag' tag = isBlockTag tag
|
||||||
|
|
||||||
htmlComment :: MWParser ()
|
htmlComment :: MWParser ()
|
||||||
htmlComment = () <$ htmlTag isCommentTag
|
htmlComment = () <$ htmlTag isCommentTag
|
||||||
|
|
||||||
inlinesInTags :: String -> MWParser Inlines
|
inlinesInTags :: String -> MWParser Inlines
|
||||||
inlinesInTags tag = trimInlines . mconcat <$> try
|
inlinesInTags tag = trimInlines . mconcat <$> try
|
||||||
(htmlTag (~== TagOpen tag []) *>
|
(manyTill inline (htmlTag (~== TagClose tag)))
|
||||||
manyTill inline (htmlTag (~== TagClose tag)))
|
|
||||||
|
|
||||||
blocksInTags :: String -> MWParser Blocks
|
blocksInTags :: String -> MWParser Blocks
|
||||||
blocksInTags tag = mconcat <$> try
|
blocksInTags tag = mconcat <$> try
|
||||||
(htmlTag (~== TagOpen tag []) *>
|
(manyTill block (htmlTag (~== TagClose tag)))
|
||||||
manyTill block (htmlTag (~== TagClose tag)))
|
|
||||||
|
|
||||||
charsInTags :: String -> MWParser [Char]
|
charsInTags :: String -> MWParser [Char]
|
||||||
charsInTags tag = innerText . parseTags <$> try
|
charsInTags tag = innerText . parseTags <$> try
|
||||||
(htmlTag (~== TagOpen tag []) *>
|
(manyTill anyChar (htmlTag (~== TagClose tag)))
|
||||||
manyTill anyChar (htmlTag (~== TagClose tag)))
|
|
||||||
|
|
||||||
--
|
--
|
||||||
-- main parser
|
-- main parser
|
||||||
|
@ -119,18 +121,15 @@ parseMediaWiki = do
|
||||||
--
|
--
|
||||||
|
|
||||||
block :: MWParser Blocks
|
block :: MWParser Blocks
|
||||||
block = header
|
block = mempty <$ skipMany1 blankline
|
||||||
|
<|> header
|
||||||
<|> hrule
|
<|> hrule
|
||||||
<|> bulletList
|
<|> bulletList
|
||||||
<|> orderedList
|
<|> orderedList
|
||||||
<|> definitionList
|
<|> definitionList
|
||||||
<|> mempty <$ try (spaces *> htmlComment)
|
<|> mempty <$ try (spaces *> htmlComment)
|
||||||
<|> preformatted
|
<|> preformatted
|
||||||
<|> blockquote
|
<|> blockTag
|
||||||
<|> codeblock
|
|
||||||
<|> syntaxhighlight
|
|
||||||
<|> haskell
|
|
||||||
<|> mempty <$ skipMany1 blankline
|
|
||||||
<|> pTag
|
<|> pTag
|
||||||
<|> blockHtml
|
<|> blockHtml
|
||||||
<|> para
|
<|> para
|
||||||
|
@ -138,6 +137,32 @@ block = header
|
||||||
para :: MWParser Blocks
|
para :: MWParser Blocks
|
||||||
para = B.para . trimInlines . mconcat <$> many1 inline
|
para = B.para . trimInlines . mconcat <$> many1 inline
|
||||||
|
|
||||||
|
blockTag :: MWParser Blocks
|
||||||
|
blockTag = do
|
||||||
|
(TagOpen t attrs, raw) <- htmlTag (\x -> isBlockTag' x && isTagOpen x)
|
||||||
|
case t of
|
||||||
|
"blockquote" -> B.blockQuote <$> blocksInTags "blockquote"
|
||||||
|
"pre" -> B.codeBlock . trimCode <$> charsInTags "pre"
|
||||||
|
"syntaxhighlight" -> syntaxhighlight attrs
|
||||||
|
"haskell" -> B.codeBlockWith ("",["haskell"],[]) . trimCode <$>
|
||||||
|
charsInTags "haskell"
|
||||||
|
"p" -> return mempty
|
||||||
|
_ -> return $ B.rawBlock "html" raw
|
||||||
|
|
||||||
|
trimCode :: String -> String
|
||||||
|
trimCode ('\n':xs) = stripTrailingNewlines xs
|
||||||
|
trimCode xs = stripTrailingNewlines xs
|
||||||
|
|
||||||
|
syntaxhighlight :: [Attribute String] -> MWParser Blocks
|
||||||
|
syntaxhighlight attrs = try $ do
|
||||||
|
let mblang = lookup "lang" attrs
|
||||||
|
let mbstart = lookup "start" attrs
|
||||||
|
let mbline = lookup "line" attrs
|
||||||
|
let classes = maybe [] (:[]) mblang ++ maybe [] (const ["numberLines"]) mbline
|
||||||
|
let kvs = maybe [] (\x -> [("startFrom",x)]) mbstart
|
||||||
|
contents <- charsInTags "syntaxhighlight"
|
||||||
|
return $ B.codeBlockWith ("",classes,kvs) $ trimCode contents
|
||||||
|
|
||||||
-- We can just skip pTags, as contents will be treated as paragraphs
|
-- We can just skip pTags, as contents will be treated as paragraphs
|
||||||
pTag :: MWParser Blocks
|
pTag :: MWParser Blocks
|
||||||
pTag = mempty <$ (htmlTag (\t -> t ~== TagOpen "p" [] || t ~== TagClose "p"))
|
pTag = mempty <$ (htmlTag (\t -> t ~== TagOpen "p" [] || t ~== TagClose "p"))
|
||||||
|
@ -164,32 +189,6 @@ preformatted = do
|
||||||
strToCode x = x
|
strToCode x = x
|
||||||
B.para . bottomUp strToCode . mconcat <$> many1 inline'
|
B.para . bottomUp strToCode . mconcat <$> many1 inline'
|
||||||
|
|
||||||
blockquote :: MWParser Blocks
|
|
||||||
blockquote = B.blockQuote <$> blocksInTags "blockquote"
|
|
||||||
|
|
||||||
codeblock :: MWParser Blocks
|
|
||||||
codeblock = B.codeBlock . trimCode <$> charsInTags "pre"
|
|
||||||
|
|
||||||
trimCode :: String -> String
|
|
||||||
trimCode ('\n':xs) = stripTrailingNewlines xs
|
|
||||||
trimCode xs = stripTrailingNewlines xs
|
|
||||||
|
|
||||||
syntaxhighlight :: MWParser Blocks
|
|
||||||
syntaxhighlight = try $ do
|
|
||||||
(TagOpen _ attrs, _) <- lookAhead
|
|
||||||
$ htmlTag (~== TagOpen "syntaxhighlight" [])
|
|
||||||
let mblang = lookup "lang" attrs
|
|
||||||
let mbstart = lookup "start" attrs
|
|
||||||
let mbline = lookup "line" attrs
|
|
||||||
let classes = maybe [] (:[]) mblang ++ maybe [] (const ["numberLines"]) mbline
|
|
||||||
let kvs = maybe [] (\x -> [("startFrom",x)]) mbstart
|
|
||||||
contents <- charsInTags "syntaxhighlight"
|
|
||||||
return $ B.codeBlockWith ("",classes,kvs) $ trimCode contents
|
|
||||||
|
|
||||||
haskell :: MWParser Blocks
|
|
||||||
haskell = B.codeBlockWith ("",["haskell"],[]) . trimCode <$>
|
|
||||||
charsInTags "haskell"
|
|
||||||
|
|
||||||
header :: MWParser Blocks
|
header :: MWParser Blocks
|
||||||
header = try $ do
|
header = try $ do
|
||||||
col <- sourceColumn <$> getPosition
|
col <- sourceColumn <$> getPosition
|
||||||
|
@ -271,15 +270,8 @@ inline = whitespace
|
||||||
<|> str
|
<|> str
|
||||||
<|> strong
|
<|> strong
|
||||||
<|> emph
|
<|> emph
|
||||||
<|> nowiki
|
|
||||||
<|> linebreak
|
|
||||||
<|> externalLink
|
<|> externalLink
|
||||||
<|> strikeout
|
<|> inlineTag
|
||||||
<|> subscript
|
|
||||||
<|> superscript
|
|
||||||
<|> math
|
|
||||||
<|> code
|
|
||||||
<|> hask
|
|
||||||
<|> B.singleton <$> charRef
|
<|> B.singleton <$> charRef
|
||||||
<|> inlineHtml
|
<|> inlineHtml
|
||||||
<|> special
|
<|> special
|
||||||
|
@ -287,8 +279,26 @@ inline = whitespace
|
||||||
str :: MWParser Inlines
|
str :: MWParser Inlines
|
||||||
str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars)
|
str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars)
|
||||||
|
|
||||||
|
inlineTag :: MWParser Inlines
|
||||||
|
inlineTag = do
|
||||||
|
(TagOpen t _, raw) <- htmlTag (\x -> isInlineTag x && isTagOpen x)
|
||||||
|
case t of
|
||||||
|
"nowiki" -> B.text . fromEntities <$> try
|
||||||
|
(manyTill anyChar (htmlTag (~== TagClose "nowiki")))
|
||||||
|
"br" -> B.linebreak <$
|
||||||
|
(optional (htmlTag (~== TagClose "br")) *> optional blankline)
|
||||||
|
"strike" -> B.strikeout <$> inlinesInTags "strike"
|
||||||
|
"del" -> B.strikeout <$> inlinesInTags "del"
|
||||||
|
"sub" -> B.subscript <$> inlinesInTags "sub"
|
||||||
|
"sup" -> B.superscript <$> inlinesInTags "sup"
|
||||||
|
"math" -> B.math <$> charsInTags "math"
|
||||||
|
"code" -> B.code <$> charsInTags "code"
|
||||||
|
"tt" -> B.code <$> charsInTags "tt"
|
||||||
|
"hask" -> B.codeWith ("",["haskell"],[]) <$> charsInTags "hask"
|
||||||
|
_ -> return $ B.rawInline "html" raw
|
||||||
|
|
||||||
special :: MWParser Inlines
|
special :: MWParser Inlines
|
||||||
special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag) *>
|
special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag') *>
|
||||||
oneOf specialChars)
|
oneOf specialChars)
|
||||||
|
|
||||||
inlineHtml :: MWParser Inlines
|
inlineHtml :: MWParser Inlines
|
||||||
|
@ -303,12 +313,6 @@ endline = () <$ try (newline <*
|
||||||
notFollowedBy' hrule <*
|
notFollowedBy' hrule <*
|
||||||
notFollowedBy anyListStart)
|
notFollowedBy anyListStart)
|
||||||
|
|
||||||
linebreak :: MWParser Inlines
|
|
||||||
linebreak = B.linebreak <$
|
|
||||||
(htmlTag (~== TagOpen "br" []) *>
|
|
||||||
optional (htmlTag (~== TagClose "br")) *>
|
|
||||||
optional blankline)
|
|
||||||
|
|
||||||
externalLink :: MWParser Inlines
|
externalLink :: MWParser Inlines
|
||||||
externalLink = try $ do
|
externalLink = try $ do
|
||||||
char '['
|
char '['
|
||||||
|
@ -325,29 +329,6 @@ url = do
|
||||||
(orig, src) <- uri
|
(orig, src) <- uri
|
||||||
return $ B.link src "" (B.str orig)
|
return $ B.link src "" (B.str orig)
|
||||||
|
|
||||||
nowiki :: MWParser Inlines
|
|
||||||
nowiki = B.text . fromEntities <$> try
|
|
||||||
(htmlTag (~== TagOpen "nowiki" []) *>
|
|
||||||
manyTill anyChar (htmlTag (~== TagClose "nowiki")))
|
|
||||||
|
|
||||||
strikeout :: MWParser Inlines
|
|
||||||
strikeout = B.strikeout <$> (inlinesInTags "strike" <|> inlinesInTags "del")
|
|
||||||
|
|
||||||
superscript :: MWParser Inlines
|
|
||||||
superscript = B.superscript <$> inlinesInTags "sup"
|
|
||||||
|
|
||||||
subscript :: MWParser Inlines
|
|
||||||
subscript = B.subscript <$> inlinesInTags "sub"
|
|
||||||
|
|
||||||
math :: MWParser Inlines
|
|
||||||
math = B.math <$> charsInTags "math"
|
|
||||||
|
|
||||||
code :: MWParser Inlines
|
|
||||||
code = B.code <$> (charsInTags "code" <|> charsInTags "tt")
|
|
||||||
|
|
||||||
hask :: MWParser Inlines
|
|
||||||
hask = B.codeWith ("",["haskell"],[]) <$> charsInTags "hask"
|
|
||||||
|
|
||||||
-- | Parses a list of inlines between start and end delimiters.
|
-- | Parses a list of inlines between start and end delimiters.
|
||||||
inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines
|
inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines
|
||||||
inlinesBetween start end =
|
inlinesBetween start end =
|
||||||
|
|
Loading…
Reference in a new issue