Added txt2tags reader

http://txt2tags.org/

There are two points which currently do not match the official
implementation.

1. In the official implementation lists can not be nested like the
following but the reader would interpret this as a bullet list with the
first item being a numbered list.

```
  - + This is not a list
```

2. The specification describes how URIs automatically becomes links.
Unfortunately as is often the case, their definitiong of URI is not
clear. I tried three solutions but was unsure about which to adopt.

* Using isURI from Network.URI, this matches far too many strings and is
therefore unsuitable
* Using uri from Text.Pandoc.Shared, this doesn't match all strings that
the reference implementation matches
* Try to simulate the regex which is used in the native code

I went with the third approach but it is not perfect, for example
trailing punctuation is captured in Urls.
This commit is contained in:
Matthew Pickering 2014-07-11 09:16:54 +01:00
parent 18f4490482
commit 7d04d383a6
2 changed files with 509 additions and 1 deletions

View file

@ -328,7 +328,8 @@ Library
Text.Pandoc.Templates,
Text.Pandoc.XML,
Text.Pandoc.SelfContained,
Text.Pandoc.Process
Text.Pandoc.Process,
Text.Pandoc.Readers.Txt2Tags
Other-Modules: Text.Pandoc.Readers.Docx.Lists,
Text.Pandoc.Readers.Docx.Reducible,
Text.Pandoc.Readers.Docx.Parse,

View file

@ -0,0 +1,507 @@
{-# LANGUAGE ViewPatterns #-}
module Text.Pandoc.Readers.Txt2Tags ( readTxt2Tags
, getT2TMeta
, T2TMeta (..)
, readTxt2TagsNoMacros)
where
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Builder ( Inlines, Blocks, (<>)
, trimInlines )
import Text.Pandoc.Definition
import Text.Pandoc.Options
import Text.Pandoc.Shared (escapeURI,compactify', compactify'DL)
import Text.Pandoc.Parsing hiding (space, spaces, uri)
import Control.Applicative ((<$>), (<$), (<*>), (<*), (*>))
import Data.Char (toLower)
import Data.List (transpose, intersperse, intercalate)
import Data.Maybe (fromMaybe)
import Data.Monoid (Monoid, mconcat, mempty, mappend)
--import Network.URI (isURI) -- Not sure whether to use this function
import Control.Monad (void, guard, when)
import Data.Default
import Control.Monad.Reader (Reader, runReader, asks)
type T2T = Parser String ParserState
type T2T = ParserT String ParserState (Reader T2TMeta)
-- | An object for the T2T macros meta information
-- the contents of each field is simply substituted verbatim into the file
data T2TMeta = T2TMeta {
date :: String -- ^ Current date
, mtime :: String -- ^ Last modification time of infile
, infile :: FilePath -- ^ Input file
, outfile :: FilePath -- ^ Output file
} deriving Show
instance Default T2TMeta where
def = T2TMeta "" "" "" ""
-- | Get the meta information required by Txt2Tags macros
getT2TMeta :: FilePath -> FilePath -> IO T2TMeta
getT2TMeta inp out = do
curDate <- formatTime defaultTimeLocale "%F" <$> getZonedTime
let getModTime = formatTime defaultTimeLocale "%F"
<$> getModificationTime inp
curMtime <- catchIOError getModTime (const (return ""))
return $ T2TMeta curDate curMtime inp out
-- | Read Txt2Tags from an input string returning a Pandoc document
readTxt2Tags :: T2TMeta -> ReaderOptions -> String -> Pandoc
readTxt2Tags t opts s = flip runReader t $ readWithM parseT2T (def {stateOptions = opts}) (s ++ "\n\n")
-- | Read Txt2Tags (ignoring all macros) from an input string returning
-- a Pandoc document
readTxt2TagsNoMacros :: ReaderOptions -> String -> Pandoc
readTxt2TagsNoMacros = readTxt2Tags def
parseT2T :: T2T Pandoc
parseT2T = do
_ <- (Nothing <$ try blankline) <|> (Just <$> (count 3 anyLine))
config <- manyTill setting (notFollowedBy setting)
-- TODO: Handle settings better
let settings = foldr (\(k,v) -> B.setMeta k (MetaString v)) nullMeta config
updateState (\s -> s {stateMeta = settings})
body <- mconcat <$> manyTill block eof
return $ Pandoc mempty (B.toList body)
type Keyword = String
type Value = String
setting :: T2T (Keyword, Value)
setting = do
string "%!"
keyword <- ignoreSpacesCap (many1 alphaNum)
char ':'
value <- ignoreSpacesCap (manyTill anyChar (newline))
return (keyword, value)
-- Blocks
parseBlocks :: T2T Blocks
parseBlocks = mconcat <$> manyTill block eof
block :: T2T Blocks
block = do
choice
[ mempty <$ blanklines
, quote
, hrule -- hrule must go above title
, title
, commentBlock
, verbatim
, rawBlock
, taggedBlock
, list
, table
, para
]
title :: T2T Blocks
title = try $ balancedTitle '+' <|> balancedTitle '='
balancedTitle :: Char -> T2T Blocks
balancedTitle c = try $ do
spaces
level <- length <$> many1 (char c)
guard (level <= 5) -- Max header level 5
heading <- manyTill (noneOf "\n\r") (count level (char c))
label <- optionMaybe (enclosed (char '[') (char ']') (alphaNum <|> oneOf "_-"))
many spaceChar *> newline
let attr = maybe nullAttr (\x -> (x, [], [])) label
return $ B.headerWith attr level (trimInlines $ B.text heading)
para :: T2T Blocks
para = try $ do
ils <- parseInlines
nl <- option False (True <$ newline)
option (B.plain ils) (guard nl >> notFollowedBy listStart >> return (B.para ils))
where
listStart = try bulletListStart <|> orderedListStart
commentBlock :: T2T Blocks
commentBlock = try (blockMarkupArea (anyLine) (const mempty) "%%%") <|> comment
-- Seperator and Strong line treated the same
hrule :: T2T Blocks
hrule = try $ do
spaces
line <- many1 (oneOf "=-_")
guard (length line >= 20)
B.horizontalRule <$ blankline
quote :: T2T Blocks
quote = try $ do
lookAhead tab
rawQuote <- many1 (tab *> optional spaces *> anyLine)
contents <- parseFromString parseBlocks (intercalate "\n" rawQuote ++ "\n\n")
return $ B.blockQuote contents
commentLine :: T2T Inlines
commentLine = comment
-- List Parsing code from Org Reader
list :: T2T Blocks
list = choice [bulletList, orderedList, definitionList]
bulletList :: T2T Blocks
bulletList = B.bulletList . compactify'
<$> many1 (listItem bulletListStart parseBlocks)
orderedList :: T2T Blocks
orderedList = B.orderedList . compactify'
<$> many1 (listItem orderedListStart parseBlocks)
definitionList :: T2T Blocks
definitionList = try $ do
B.definitionList . compactify'DL <$>
many1 (listItem definitionListStart definitionListEnd)
definitionListEnd :: T2T (Inlines, [Blocks])
definitionListEnd = (,) <$> (mconcat <$> manyTill inline newline) <*> ((:[]) <$> parseBlocks)
genericListStart :: T2T Char
-> T2T Int
genericListStart listMarker = try $
(2+) <$> (length <$> many spaceChar
<* listMarker <* space <* notFollowedBy space)
-- parses bullet list \start and returns its length (excl. following whitespace)
bulletListStart :: T2T Int
bulletListStart = genericListStart (char '-')
orderedListStart :: T2T Int
orderedListStart = genericListStart (char '+' )
definitionListStart :: T2T Int
definitionListStart = genericListStart (char ':')
-- parse raw text for one list item, excluding start marker and continuations
listItem :: T2T Int
-> T2T a
-> T2T a
listItem start end = try $ do
markerLength <- try start
firstLine <- anyLineNewline
blank <- option "" ("\n" <$ blankline)
rest <- concat <$> many (listContinuation markerLength)
parseFromString end $ firstLine ++ blank ++ rest
-- continuation of a list item - indented and separated by blankline or endline.
-- Note: nested lists are parsed as continuations.
listContinuation :: Int
-> T2T String
listContinuation markerLength = try $
notFollowedBy' (blankline >> blankline)
*> (mappend <$> (concat <$> many1 listLine)
<*> many blankline)
where listLine = try $ indentWith markerLength *> anyLineNewline
anyLineNewline :: T2T String
anyLineNewline = (++ "\n") <$> anyLine
indentWith :: Int -> T2T String
indentWith n = count n space
-- Table
table :: T2T Blocks
table = try $ do
header <- fmap snd <$> option mempty (try headerRow)
rows <- many1 (many commentLine *> tableRow)
let columns = transpose rows
let ncolumns = length columns
let aligns = map (foldr1 findAlign) (map (map fst) columns)
let rows' = map (map snd) rows
let size = maximum (map length rows')
let rowsPadded = map (pad size) rows'
let headerPadded = if (not (null header)) then pad size header else mempty
return $ B.table mempty
(zip aligns (replicate ncolumns 0.0))
headerPadded rowsPadded
pad :: (Show a, Monoid a) => Int -> [a] -> [a]
pad n xs = xs ++ (replicate (n - length xs) mempty)
findAlign :: Alignment -> Alignment -> Alignment
findAlign x y
| x == y = x
| otherwise = AlignDefault
headerRow :: T2T [(Alignment, Blocks)]
headerRow = genericRow (string "||")
tableRow :: T2T [(Alignment, Blocks)]
tableRow = genericRow (char '|')
genericRow :: T2T a -> T2T [(Alignment, Blocks)]
genericRow start = try $ do
spaces *> start
manyTill tableCell newline <?> "genericRow"
tableCell :: T2T (Alignment, Blocks)
tableCell = try $ do
leftSpaces <- length <$> lookAhead (many1 space) -- Case of empty cell means we must lookAhead
content <- (manyTill inline (try $ lookAhead (cellEnd)))
rightSpaces <- length <$> many space
let align =
case compare leftSpaces rightSpaces of
LT -> AlignLeft
EQ -> AlignCenter
GT -> AlignRight
endOfCell
return $ (align, B.plain (B.trimInlines $ mconcat content))
where
cellEnd = (void newline <|> (many1 space *> endOfCell))
endOfCell :: T2T ()
endOfCell = try (skipMany1 $ char '|') <|> ( () <$ lookAhead newline)
-- Raw area
verbatim :: T2T Blocks
verbatim = genericBlock anyLineNewline B.codeBlock "```"
rawBlock :: T2T Blocks
rawBlock = genericBlock anyLineNewline (B.para . B.str) "\"\"\""
taggedBlock :: T2T Blocks
taggedBlock = do
target <- getTarget
genericBlock anyLineNewline (B.rawBlock target) "'''"
-- Generic
genericBlock :: Monoid a => T2T a -> (a -> Blocks) -> String -> T2T Blocks
genericBlock p f s = blockMarkupArea p f s <|> blockMarkupLine p f s
blockMarkupArea :: Monoid a => (T2T a) -> (a -> Blocks) -> String -> T2T Blocks
blockMarkupArea p f s = try $ (do
string s *> blankline
f . mconcat <$> (manyTill p (eof <|> void (string s *> blankline))))
blockMarkupLine :: T2T a -> (a -> Blocks) -> String -> T2T Blocks
blockMarkupLine p f s = try (f <$> (string s *> space *> p))
-- Can be in either block or inline position
comment :: Monoid a => T2T a
comment = try $ do
atStart
mempty <$ (char '%' *> anyLine)
-- Inline
parseInlines :: T2T Inlines
parseInlines = trimInlines . mconcat <$> many1 inline
inline :: T2T Inlines
inline = do
choice
[ endline
, commentLine
, whitespace
, url
, link
, image
, bold
, underline
, code
, raw
, tagged
, strike
, italic
, code
, str
, symbol
]
bold :: T2T Inlines
bold = inlineMarkup inline B.strong '*' (B.str)
underline :: T2T Inlines
underline = inlineMarkup inline B.emph '_' (B.str)
strike :: T2T Inlines
strike = inlineMarkup inline B.strikeout '-' (B.str)
italic :: T2T Inlines
italic = inlineMarkup inline B.emph '/' (B.str)
code :: T2T Inlines
code = inlineMarkup ((:[]) <$> anyChar) B.code '`' id
raw :: T2T Inlines
raw = inlineMarkup ((:[]) <$> anyChar) B.text '"' id
tagged :: T2T Inlines
tagged = do
target <- getTarget
inlineMarkup ((:[]) <$> anyChar) (B.rawInline target) '\'' id
-- Parser for markup indicated by a double character.
-- Inline markup is greedy and glued
-- Greedy meaning ***a*** = Bold [Str "*a*"]
-- Glued meaning that markup must be tight to content
-- Markup can't pass newlines
inlineMarkup :: Monoid a
=> (T2T a) -- Content parser
-> (a -> Inlines) -- Constructor
-> Char -- Fence
-> (String -> a) -- Special Case to handle ******
-> T2T Inlines
inlineMarkup p f c special = try $ do
start <- many1 (char c)
let l = length start
guard (l >= 2)
when (l == 2) (void $ notFollowedBy space)
-- We must make sure that there is no space before the start of the
-- closing tags
body <- optionMaybe (try $ manyTill (noneOf "\n\r") $
(try $ lookAhead (noneOf " " >> string [c,c] )))
case body of
Just middle -> do
lastChar <- anyChar
end <- many1 (char c)
let parser inp = parseFromString (mconcat <$> many p) inp
let start' = special (drop 2 start)
body' <- parser (middle ++ [lastChar])
let end' = special (drop 2 end)
return $ f (start' <> body' <> end')
Nothing -> do -- Either bad or case such as *****
guard (l >= 5)
let body' = (replicate (l - 4) c)
return $ f (special body')
link :: T2T Inlines
link = try imageLink <|> titleLink
-- Link with title
titleLink :: T2T Inlines
titleLink = try $ do
char '['
notFollowedBy space
tokens <- sepBy1 (many $ noneOf " ]") space
guard (length tokens >= 2)
char ']'
let link' = last tokens
guard (length link' > 0)
let tit = concat (intersperse " " (init tokens))
return $ B.link link' "" (B.text tit)
-- Link with image
imageLink :: T2T Inlines
imageLink = try $ do
char '['
body <- image
many1 space
l <- manyTill (noneOf "\n\r ") (char ']')
return (B.link l "" body)
-- raw URLs in text are automatically linked
url :: T2T Inlines
url = try $ do
(rawUrl, escapedUrl) <- (try uri <|> emailAddress)
return $ B.link rawUrl "" (B.str escapedUrl)
uri :: T2T (String, String)
uri = try $ do
address <- t2tURI
return (address, escapeURI address)
-- The definition of a URI in the T2T source differs from the
-- actual definition. This is a transcription of the definition in
-- the source of v2.6
--isT2TURI :: String -> Bool
--isT2TURI (parse t2tURI "" -> Right _) = True
--isT2TURI _ = False
t2tURI :: T2T String
t2tURI = do
start <- try ((++) <$> proto <*> urlLogin) <|> guess
domain <- many1 chars
sep <- many (char '/')
form' <- option mempty ((:) <$> char '?' <*> many1 form)
anchor' <- option mempty ((:) <$> char '#' <*> many anchor)
return (start ++ domain ++ sep ++ form' ++ anchor')
where
protos = ["http", "https", "ftp", "telnet", "gopher", "wais"]
proto = (++) <$> oneOfStrings protos <*> string "://"
guess = (++) <$> (((++) <$> stringAnyCase "www" <*> option mempty ((:[]) <$> oneOf "23"))
<|> stringAnyCase "ftp") <*> ((:[]) <$> char '.')
login = alphaNum <|> oneOf "_.-"
pass = many (noneOf " @")
chars = alphaNum <|> oneOf "%._/~:,=$@&+-"
anchor = alphaNum <|> oneOf "%._0"
form = chars <|> oneOf ";*"
urlLogin = option mempty $ try ((\x y z -> x ++ y ++ [z]) <$> many1 login <*> option mempty ((:) <$> char ':' <*> pass) <*> char '@')
image :: T2T Inlines
image = try $ do
-- List taken from txt2tags source
let extensions = [".jpg", ".jpeg", ".gif", ".png", ".eps", ".bmp"]
char '['
path <- manyTill (noneOf "\n\t\r ") (try $ lookAhead (oneOfStrings extensions))
ext <- oneOfStrings extensions
char ']'
return $ B.image (path ++ ext) "" mempty
-- Characters used in markup
specialChars :: String
specialChars = "%*-_/|:+"
tab :: T2T Char
tab = char '\t'
space :: T2T Char
space = char ' '
spaces :: T2T String
spaces = many space
endline :: T2T Inlines
endline = try $ do
newline
notFollowedBy blankline
notFollowedBy hrule
notFollowedBy title
notFollowedBy verbatim
notFollowedBy rawBlock
notFollowedBy taggedBlock
notFollowedBy quote
notFollowedBy list
notFollowedBy table
return $ B.space
str :: T2T Inlines
str = try $ do
B.str <$> many1 (noneOf $ specialChars ++ "\n\r ")
whitespace :: T2T Inlines
whitespace = try $ B.space <$ spaceChar
symbol :: T2T Inlines
symbol = B.str . (:[]) <$> oneOf specialChars
-- Utility
getTarget :: T2T String
getTarget = do
mv <- lookupMeta "target" . stateMeta <$> getState
let MetaString target = fromMaybe (MetaString "html") mv
return target
atStart :: T2T ()
atStart = (sourceColumn <$> getPosition) >>= guard . (== 1)
ignoreSpacesCap :: T2T String -> T2T String
ignoreSpacesCap p = map toLower <$> (spaces *> p <* spaces)