Implemented implicit reference-style links to section headers in markdown.

For example, if you have a header '# Supported architectures', you can
link to it with '[Supported architectures]'.  If there are multiple
headers with this label, the link will point to the first of them.
Implicit references are always overridden by explicitly specified references.
Addresses Issue #20.

+ Moved isPunctuation, uniqueIdentifiers, and inlineListToIdentifier from
  Text.Pandoc.Writers.HTML to Text.Pandoc.Shared.

+ Added stHeaders to ParserState.   This holds a list of header texts
  used in the document, and is used to construct implicit header references.

+ In Text.Pandoc.Readers.Markdown, added call to headerReference
  parser in initial parsing pass, constructing a list of section header
  labels. This is then passed to uniqueIdentifiers to produce
  identifiers, and a list of implicit references is constructed. This is
  added to the end of the explicitly specified references, so it will be
  overridden by explicitly specified references. All of this processing
  is skipped if --strict was specified.

+ Modified documentation in README.



git-svn-id: https://pandoc.googlecode.com/svn/trunk@1086 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
fiddlosopher 2007-11-22 17:14:21 +00:00
parent 8d334b84cc
commit f7b705b44c
4 changed files with 170 additions and 127 deletions

26
README
View file

@ -299,7 +299,7 @@ For further documentation, see the `pandoc(1)` man page.
: includes *string* as a prefix at the beginning of the title that : includes *string* as a prefix at the beginning of the title that
appears in the HTML header (but not in the title as it appears at appears in the HTML header (but not in the title as it appears at
the beginning of the HTML body). (See below on the beginning of the HTML body). (See below on
[Title Blocks](#title-blocks).) [Title Blocks].)
`-S` or `--smart` `-S` or `--smart`
: causes `pandoc` to produce typographically correct output, along the : causes `pandoc` to produce typographically correct output, along the
@ -797,8 +797,28 @@ another. A link to this section, for example, might look like this:
See the section on [header identifiers](#header-identifiers-in-html). See the section on [header identifiers](#header-identifiers-in-html).
Note, however, that this method of providing links to sections works Note that this method of providing links to sections works only in
only in HTML. HTML.
Shortcut links to section headers
---------------------------------
You may use the following shortcut to link to a section header:
See the section on [Header identifiers in HTML].
Pandoc will behave as if the document contains
[Header identifiers in HTML]: #header-identifiers-in-html
If there are multiple headers labeled "Header identifiers in HTML",
this will link to the first one. Note that implicit references are always
overridden by explicit ones, so the link in the following text
will be to `/url`, not `#header-identifiers-in-html`:
See the section on [Header identifiers in HTML].
[Header identifiers in HTML]: /url
Box-style blockquotes Box-style blockquotes
--------------------- ---------------------

View file

@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- | {- |
Module : Text.Pandoc.Readers.Markdown Module : Text.Pandoc.Readers.Markdown
Copyright : Copyright (C) 2006-7 John MacFarlane Copyright : Copyright (C) 2006-7 John MacFarlane
License : GNU GPL, version 2 or above License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu> Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha Stability : alpha
@ -27,8 +27,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Conversion of markdown-formatted plain text to 'Pandoc' document. Conversion of markdown-formatted plain text to 'Pandoc' document.
-} -}
module Text.Pandoc.Readers.Markdown ( module Text.Pandoc.Readers.Markdown (
readMarkdown readMarkdown
) where ) where
import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy ) import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy )
@ -36,9 +36,9 @@ import Data.Ord ( comparing )
import Data.Char ( isAlphaNum ) import Data.Char ( isAlphaNum )
import Network.URI ( isURI ) import Network.URI ( isURI )
import Text.Pandoc.Definition import Text.Pandoc.Definition
import Text.Pandoc.Shared import Text.Pandoc.Shared
import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment ) import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment )
import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
anyHtmlInlineTag, anyHtmlTag, anyHtmlInlineTag, anyHtmlTag,
anyHtmlEndTag, htmlEndTag, extractTagType, anyHtmlEndTag, htmlEndTag, extractTagType,
htmlBlockElement ) htmlBlockElement )
@ -68,14 +68,14 @@ specialChars = "\\[]*_~`<>$!^-.&'\"\8216\8217\8220\8221"
indentSpaces = try $ do indentSpaces = try $ do
state <- getState state <- getState
let tabStop = stateTabStop state let tabStop = stateTabStop state
try (count tabStop (char ' ')) <|> try (count tabStop (char ' ')) <|>
(many (char ' ') >> string "\t") <?> "indentation" (many (char ' ') >> string "\t") <?> "indentation"
nonindentSpaces = do nonindentSpaces = do
state <- getState state <- getState
let tabStop = stateTabStop state let tabStop = stateTabStop state
sps <- many (char ' ') sps <- many (char ' ')
if length sps < tabStop if length sps < tabStop
then return sps then return sps
else unexpected "indented line" else unexpected "indented line"
@ -90,8 +90,8 @@ failUnlessSmart = do
if stateSmart state then return () else fail "Smart typography feature" if stateSmart state then return () else fail "Smart typography feature"
-- | Parse an inline Str element with a given content. -- | Parse an inline Str element with a given content.
inlineString str = try $ do inlineString str = try $ do
(Str res) <- inline (Str res) <- inline
if res == str then return res else fail $ "unexpected Str content" if res == str then return res else fail $ "unexpected Str content"
-- | Parse a sequence of inline elements between a string -- | Parse a sequence of inline elements between a string
@ -102,9 +102,9 @@ inlinesInBalanced opener closer = try $ do
string opener string opener
result <- manyTill ( (do lookAhead (inlineString opener) result <- manyTill ( (do lookAhead (inlineString opener)
-- because it might be a link... -- because it might be a link...
bal <- inlinesInBalanced opener closer bal <- inlinesInBalanced opener closer
return $ [Str opener] ++ bal ++ [Str closer]) return $ [Str opener] ++ bal ++ [Str closer])
<|> (count 1 inline)) <|> (count 1 inline))
(try (string closer)) (try (string closer))
return $ concat result return $ concat result
@ -114,7 +114,7 @@ inlinesInBalanced opener closer = try $ do
titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline
authorsLine = try $ do authorsLine = try $ do
char '%' char '%'
skipSpaces skipSpaces
authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;") authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
@ -142,15 +142,24 @@ parseMarkdown = do
startPos <- getPosition startPos <- getPosition
-- go through once just to get list of reference keys -- go through once just to get list of reference keys
-- docMinusKeys is the raw document with blanks where the keys were... -- docMinusKeys is the raw document with blanks where the keys were...
docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= docMinusKeys <- manyTill (referenceKey <|> headerReference <|>
return . concat lineClump) eof >>= return . concat
setInput docMinusKeys setInput docMinusKeys
setPosition startPos setPosition startPos
st <- getState st <- getState
-- get headers and construct implicit references unless strict
if stateStrict st
then return ()
else do let oldkeys = stateKeys st
let headers = reverse $ stateHeaders st
let idents = uniqueIdentifiers headers
let implicitRefs = zipWith (\hd ident -> (hd, ("#" ++ ident, "")))
headers idents
updateState $ \st -> st { stateKeys = oldkeys ++ implicitRefs }
-- go through again for notes unless strict... -- go through again for notes unless strict...
if stateStrict st if stateStrict st
then return () then return ()
else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>= else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>=
return . concat return . concat
st <- getState st <- getState
let reversedNotes = stateNotes st let reversedNotes = stateNotes st
@ -159,10 +168,10 @@ parseMarkdown = do
setPosition startPos setPosition startPos
-- now parse it for real... -- now parse it for real...
(title, author, date) <- option ([],[],"") titleBlock (title, author, date) <- option ([],[],"") titleBlock
blocks <- parseBlocks blocks <- parseBlocks
return $ Pandoc (Meta title author date) $ filter (/= Null) blocks return $ Pandoc (Meta title author date) $ filter (/= Null) blocks
-- --
-- initial pass for references and notes -- initial pass for references and notes
-- --
@ -185,7 +194,17 @@ referenceKey = try $ do
-- return blanks so line count isn't affected -- return blanks so line count isn't affected
return $ replicate (sourceLine endPos - sourceLine startPos) '\n' return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
referenceTitle = try $ do headerReference = try $ do
failIfStrict
startPos <- getPosition
(Header level text) <- lookAhead $ atxHeader <|> setextHeader
st <- getState
let headers = stateHeaders st
updateState $ \st -> st { stateHeaders = text:headers }
endPos <- getPosition
lineClump -- return the raw header, because we need to parse it later
referenceTitle = try $ do
(many1 spaceChar >> option '\n' newline) <|> newline (many1 spaceChar >> option '\n' newline) <|> newline
skipSpaces skipSpaces
tit <- (charsInBalanced '(' ')' >>= return . unwords . words) tit <- (charsInBalanced '(' ')' >>= return . unwords . words)
@ -200,7 +219,7 @@ rawLine = do
notFollowedBy blankline notFollowedBy blankline
notFollowedBy' noteMarker notFollowedBy' noteMarker
contents <- many1 nonEndline contents <- many1 nonEndline
end <- option "" (newline >> optional indentSpaces >> return "\n") end <- option "" (newline >> optional indentSpaces >> return "\n")
return $ contents ++ end return $ contents ++ end
rawLines = many1 rawLine >>= return . concat rawLines = many1 rawLine >>= return . concat
@ -229,7 +248,7 @@ noteBlock = try $ do
parseBlocks = manyTill block eof parseBlocks = manyTill block eof
block = choice [ header block = choice [ header
, table , table
, codeBlock , codeBlock
, hrule , hrule
@ -260,7 +279,7 @@ setextHeader = try $ do
-- first, see if this block has any chance of being a setextHeader: -- first, see if this block has any chance of being a setextHeader:
lookAhead (anyLine >> oneOf setextHChars) lookAhead (anyLine >> oneOf setextHChars)
text <- many1Till inline newline >>= return . normalizeSpaces text <- many1Till inline newline >>= return . normalizeSpaces
level <- choice $ zipWith level <- choice $ zipWith
(\ch lev -> try (many1 $ char ch) >> blanklines >> return lev) (\ch lev -> try (many1 $ char ch) >> blanklines >> return lev)
setextHChars [1..(length setextHChars)] setextHChars [1..(length setextHChars)]
return $ Header level text return $ Header level text
@ -285,7 +304,7 @@ hrule = try $ do
indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n") indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n")
codeBlock = do codeBlock = do
contents <- many1 (indentedLine <|> contents <- many1 (indentedLine <|>
try (do b <- blanklines try (do b <- blanklines
l <- indentedLine l <- indentedLine
return $ b ++ l)) return $ b ++ l))
@ -300,7 +319,7 @@ emacsBoxQuote = try $ do
failIfStrict failIfStrict
string ",----" string ",----"
manyTill anyChar newline manyTill anyChar newline
raw <- manyTill raw <- manyTill
(try (char '|' >> optional (char ' ') >> manyTill anyChar newline)) (try (char '|' >> optional (char ' ') >> manyTill anyChar newline))
(try (string "`----")) (try (string "`----"))
blanklines blanklines
@ -310,7 +329,7 @@ emailBlockQuoteStart = try $ nonindentSpaces >> char '>' >>~ optional (char ' ')
emailBlockQuote = try $ do emailBlockQuote = try $ do
emailBlockQuoteStart emailBlockQuoteStart
raw <- sepBy (many (nonEndline <|> raw <- sepBy (many (nonEndline <|>
(try (endline >> notFollowedBy emailBlockQuoteStart >> (try (endline >> notFollowedBy emailBlockQuoteStart >>
return '\n')))) return '\n'))))
(try (newline >> emailBlockQuoteStart)) (try (newline >> emailBlockQuoteStart))
@ -318,12 +337,12 @@ emailBlockQuote = try $ do
optional blanklines optional blanklines
return raw return raw
blockQuote = do blockQuote = do
raw <- emailBlockQuote <|> emacsBoxQuote raw <- emailBlockQuote <|> emacsBoxQuote
-- parse the extracted block, which may contain various block elements: -- parse the extracted block, which may contain various block elements:
contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n" contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n"
return $ BlockQuote contents return $ BlockQuote contents
-- --
-- list blocks -- list blocks
-- --
@ -358,7 +377,7 @@ orderedListStart style delim = try $ do
then do many1 digit then do many1 digit
char '.' char '.'
return 1 return 1
else orderedListMarker style delim else orderedListMarker style delim
if delim == Period && (style == UpperAlpha || (style == UpperRoman && if delim == Period && (style == UpperAlpha || (style == UpperRoman &&
num `elem` [1, 5, 10, 50, 100, 500, 1000])) num `elem` [1, 5, 10, 50, 100, 500, 1000]))
then char '\t' <|> (spaceChar >> spaceChar) then char '\t' <|> (spaceChar >> spaceChar)
@ -382,7 +401,7 @@ rawListItem start = try $ do
blanks <- many blankline blanks <- many blankline
return $ concat result ++ blanks return $ concat result ++ blanks
-- continuation of a list item - indented and separated by blankline -- continuation of a list item - indented and separated by blankline
-- or (in compact lists) endline. -- or (in compact lists) endline.
-- note: nested lists are parsed as continuations -- note: nested lists are parsed as continuations
listContinuation start = try $ do listContinuation start = try $ do
@ -398,7 +417,7 @@ listContinuationLine start = try $ do
result <- manyTill anyChar newline result <- manyTill anyChar newline
return $ result ++ "\n" return $ result ++ "\n"
listItem start = try $ do listItem start = try $ do
first <- rawListItem start first <- rawListItem start
continuations <- many (listContinuation start) continuations <- many (listContinuation start)
-- parsing with ListItemState forces markers at beginning of lines to -- parsing with ListItemState forces markers at beginning of lines to
@ -418,7 +437,7 @@ orderedList = try $ do
items <- many1 (listItem (orderedListStart style delim)) items <- many1 (listItem (orderedListStart style delim))
return $ OrderedList (start, style, delim) $ compactify items return $ OrderedList (start, style, delim) $ compactify items
bulletList = many1 (listItem bulletListStart) >>= bulletList = many1 (listItem bulletListStart) >>=
return . BulletList . compactify return . BulletList . compactify
-- definition lists -- definition lists
@ -459,7 +478,7 @@ definitionList = do
-- paragraph block -- paragraph block
-- --
para = try $ do para = try $ do
result <- many1 inline result <- many1 inline
newline newline
blanklines <|> do st <- getState blanklines <|> do st <- getState
@ -468,9 +487,9 @@ para = try $ do
else lookAhead emacsBoxQuote >> return "" else lookAhead emacsBoxQuote >> return ""
return $ Para $ normalizeSpaces result return $ Para $ normalizeSpaces result
plain = many1 inline >>= return . Plain . normalizeSpaces plain = many1 inline >>= return . Plain . normalizeSpaces
-- --
-- raw html -- raw html
-- --
@ -487,25 +506,25 @@ htmlBlock = do
else rawHtmlBlocks else rawHtmlBlocks
-- True if tag is self-closing -- True if tag is self-closing
isSelfClosing tag = isSelfClosing tag =
isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag
strictHtmlBlock = try $ do strictHtmlBlock = try $ do
tag <- anyHtmlBlockTag tag <- anyHtmlBlockTag
let tag' = extractTagType tag let tag' = extractTagType tag
if isSelfClosing tag || tag' == "hr" if isSelfClosing tag || tag' == "hr"
then return tag then return tag
else do contents <- many (notFollowedBy' (htmlEndTag tag') >> else do contents <- many (notFollowedBy' (htmlEndTag tag') >>
(htmlElement <|> (count 1 anyChar))) (htmlElement <|> (count 1 anyChar)))
end <- htmlEndTag tag' end <- htmlEndTag tag'
return $ tag ++ concat contents ++ end return $ tag ++ concat contents ++ end
rawHtmlBlocks = do rawHtmlBlocks = do
htmlBlocks <- many1 rawHtmlBlock htmlBlocks <- many1 rawHtmlBlock
let combined = concatMap (\(RawHtml str) -> str) htmlBlocks let combined = concatMap (\(RawHtml str) -> str) htmlBlocks
let combined' = if not (null combined) && last combined == '\n' let combined' = if not (null combined) && last combined == '\n'
then init combined -- strip extra newline then init combined -- strip extra newline
else combined else combined
return $ RawHtml combined' return $ RawHtml combined'
-- --
@ -516,7 +535,7 @@ rawLaTeXEnvironment' = failIfStrict >> rawLaTeXEnvironment
-- --
-- Tables -- Tables
-- --
-- Parse a dashed line with optional trailing spaces; return its length -- Parse a dashed line with optional trailing spaces; return its length
-- and the length including trailing space. -- and the length including trailing space.
@ -525,7 +544,7 @@ dashedLine ch = do
sp <- many spaceChar sp <- many spaceChar
return $ (length dashes, length $ dashes ++ sp) return $ (length dashes, length $ dashes ++ sp)
-- Parse a table header with dashed lines of '-' preceded by -- Parse a table header with dashed lines of '-' preceded by
-- one line of text. -- one line of text.
simpleTableHeader = try $ do simpleTableHeader = try $ do
rawContent <- anyLine rawContent <- anyLine
@ -548,7 +567,7 @@ tableSep = try $ nonindentSpaces >> many1 (dashedLine '-') >> string "\n"
rawTableLine indices = do rawTableLine indices = do
notFollowedBy' (blanklines <|> tableFooter) notFollowedBy' (blanklines <|> tableFooter)
line <- many1Till anyChar newline line <- many1Till anyChar newline
return $ map removeLeadingTrailingSpace $ tail $ return $ map removeLeadingTrailingSpace $ tail $
splitByIndices (init indices) line splitByIndices (init indices) line
-- Parse a table line and return a list of lists of blocks (columns). -- Parse a table line and return a list of lists of blocks (columns).
@ -565,8 +584,8 @@ multilineRow indices = do
widthsFromIndices :: Int -- Number of columns on terminal widthsFromIndices :: Int -- Number of columns on terminal
-> [Int] -- Indices -> [Int] -- Indices
-> [Float] -- Fractional relative sizes of columns -> [Float] -- Fractional relative sizes of columns
widthsFromIndices _ [] = [] widthsFromIndices _ [] = []
widthsFromIndices numColumns indices = widthsFromIndices numColumns indices =
let lengths = zipWith (-) indices (0:indices) let lengths = zipWith (-) indices (0:indices)
totLength = sum lengths totLength = sum lengths
quotient = if totLength > numColumns quotient = if totLength > numColumns
@ -605,14 +624,14 @@ simpleTable = tableWith simpleTableHeader tableLine blanklines
multilineTable = tableWith multilineTableHeader multilineRow tableFooter multilineTable = tableWith multilineTableHeader multilineRow tableFooter
multilineTableHeader = try $ do multilineTableHeader = try $ do
tableSep tableSep
rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline) rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline)
initSp <- nonindentSpaces initSp <- nonindentSpaces
dashes <- many1 (dashedLine '-') dashes <- many1 (dashedLine '-')
newline newline
let (lengths, lines) = unzip dashes let (lengths, lines) = unzip dashes
let indices = scanl (+) (length initSp) lines let indices = scanl (+) (length initSp) lines
let rawHeadsList = transpose $ map let rawHeadsList = transpose $ map
(\ln -> tail $ splitByIndices (init indices) ln) (\ln -> tail $ splitByIndices (init indices) ln)
rawContent rawContent
let rawHeads = map (joinWithSep " ") rawHeadsList let rawHeads = map (joinWithSep " ") rawHeadsList
@ -625,7 +644,7 @@ multilineTableHeader = try $ do
alignType :: [String] -> Int -> Alignment alignType :: [String] -> Int -> Alignment
alignType [] len = AlignDefault alignType [] len = AlignDefault
alignType strLst len = alignType strLst len =
let str = head $ sortBy (comparing length) $ let str = head $ sortBy (comparing length) $
map removeTrailingSpace strLst map removeTrailingSpace strLst
leftSpace = if null str then False else (str !! 0) `elem` " \t" leftSpace = if null str then False else (str !! 0) `elem` " \t"
rightSpace = length str < len || (str !! (len - 1)) `elem` " \t" rightSpace = length str < len || (str !! (len - 1)) `elem` " \t"
@ -637,7 +656,7 @@ alignType strLst len =
table = failIfStrict >> (simpleTable <|> multilineTable) <?> "table" table = failIfStrict >> (simpleTable <|> multilineTable) <?> "table"
-- --
-- inline -- inline
-- --
@ -667,7 +686,7 @@ inline = choice [ str
escapedChar = do escapedChar = do
char '\\' char '\\'
state <- getState state <- getState
result <- option '\\' $ if stateStrict state result <- option '\\' $ if stateStrict state
then oneOf "\\`*_{}[]()>#+-.!~" then oneOf "\\`*_{}[]()>#+-.!~"
else satisfy (not . isAlphaNum) else satisfy (not . isAlphaNum)
return $ Str [result] return $ Str [result]
@ -681,17 +700,17 @@ ltSign = do
specialCharsMinusLt = filter (/= '<') specialChars specialCharsMinusLt = filter (/= '<') specialChars
symbol = do symbol = do
result <- oneOf specialCharsMinusLt result <- oneOf specialCharsMinusLt
return $ Str [result] return $ Str [result]
-- parses inline code, between n `s and n `s -- parses inline code, between n `s and n `s
code = try $ do code = try $ do
starts <- many1 (char '`') starts <- many1 (char '`')
skipSpaces skipSpaces
result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|> result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|>
(char '\n' >> return " ")) (char '\n' >> return " "))
(try (skipSpaces >> count (length starts) (char '`') >> (try (skipSpaces >> count (length starts) (char '`') >>
notFollowedBy (char '`'))) notFollowedBy (char '`')))
return $ Code $ removeLeadingTrailingSpace $ concat result return $ Code $ removeLeadingTrailingSpace $ concat result
@ -707,30 +726,30 @@ math = try $ do
return $ TeX ("$" ++ (joinWithSep " " words) ++ "$") return $ TeX ("$" ++ (joinWithSep " " words) ++ "$")
emph = ((enclosed (char '*') (char '*') inline) <|> emph = ((enclosed (char '*') (char '*') inline) <|>
(enclosed (char '_') (char '_' >> notFollowedBy alphaNum) inline)) >>= (enclosed (char '_') (char '_' >> notFollowedBy alphaNum) inline)) >>=
return . Emph . normalizeSpaces return . Emph . normalizeSpaces
strong = ((enclosed (string "**") (try $ string "**") inline) <|> strong = ((enclosed (string "**") (try $ string "**") inline) <|>
(enclosed (string "__") (try $ string "__") inline)) >>= (enclosed (string "__") (try $ string "__") inline)) >>=
return . Strong . normalizeSpaces return . Strong . normalizeSpaces
strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>= strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>=
return . Strikeout . normalizeSpaces return . Strikeout . normalizeSpaces
superscript = failIfStrict >> enclosed (char '^') (char '^') superscript = failIfStrict >> enclosed (char '^') (char '^')
(notFollowedBy' whitespace >> inline) >>= -- may not contain Space (notFollowedBy' whitespace >> inline) >>= -- may not contain Space
return . Superscript return . Superscript
subscript = failIfStrict >> enclosed (char '~') (char '~') subscript = failIfStrict >> enclosed (char '~') (char '~')
(notFollowedBy' whitespace >> inline) >>= -- may not contain Space (notFollowedBy' whitespace >> inline) >>= -- may not contain Space
return . Subscript return . Subscript
smartPunctuation = failUnlessSmart >> smartPunctuation = failUnlessSmart >>
choice [ quoted, apostrophe, dash, ellipses ] choice [ quoted, apostrophe, dash, ellipses ]
apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe
quoted = doubleQuoted <|> singleQuoted quoted = doubleQuoted <|> singleQuoted
withQuoteContext context parser = do withQuoteContext context parser = do
oldState <- getState oldState <- getState
@ -746,7 +765,7 @@ singleQuoted = try $ do
withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>= withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>=
return . Quoted SingleQuote . normalizeSpaces return . Quoted SingleQuote . normalizeSpaces
doubleQuoted = try $ do doubleQuoted = try $ do
doubleQuoteStart doubleQuoteStart
withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>= withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>=
return . Quoted DoubleQuote . normalizeSpaces return . Quoted DoubleQuote . normalizeSpaces
@ -757,13 +776,13 @@ failIfInQuoteContext context = do
then fail "already inside quotes" then fail "already inside quotes"
else return () else return ()
singleQuoteStart = do singleQuoteStart = do
failIfInQuoteContext InSingleQuote failIfInQuoteContext InSingleQuote
char '\8216' <|> char '\8216' <|>
(try $ do char '\'' (try $ do char '\''
notFollowedBy (oneOf ")!],.;:-? \t\n") notFollowedBy (oneOf ")!],.;:-? \t\n")
notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >>
satisfy (not . isAlphaNum))) satisfy (not . isAlphaNum)))
-- possess/contraction -- possess/contraction
return '\'') return '\'')
@ -807,13 +826,13 @@ endline = try $ do
newline newline
notFollowedBy blankline notFollowedBy blankline
st <- getState st <- getState
if stateStrict st if stateStrict st
then do notFollowedBy emailBlockQuoteStart then do notFollowedBy emailBlockQuoteStart
notFollowedBy (char '#') -- atx header notFollowedBy (char '#') -- atx header
else return () else return ()
-- parse potential list-starts differently if in a list: -- parse potential list-starts differently if in a list:
if stateParserContext st == ListItemState if stateParserContext st == ListItemState
then notFollowedBy' (bulletListStart <|> then notFollowedBy' (bulletListStart <|>
(anyOrderedListStart >> return ())) (anyOrderedListStart >> return ()))
else return () else return ()
return Space return Space
@ -827,7 +846,7 @@ reference = notFollowedBy' (string "[^") >> -- footnote reference
inlinesInBalanced "[" "]" >>= (return . normalizeSpaces) inlinesInBalanced "[" "]" >>= (return . normalizeSpaces)
-- source for a link, with optional title -- source for a link, with optional title
source = try $ do source = try $ do
char '(' char '('
optional (char '<') optional (char '<')
src <- many (noneOf ")> \t\n") src <- many (noneOf ")> \t\n")
@ -837,7 +856,7 @@ source = try $ do
char ')' char ')'
return (removeTrailingSpace src, tit) return (removeTrailingSpace src, tit)
linkTitle = try $ do linkTitle = try $ do
(many1 spaceChar >> option '\n' newline) <|> newline (many1 spaceChar >> option '\n' newline) <|> newline
skipSpaces skipSpaces
delim <- char '\'' <|> char '"' delim <- char '\'' <|> char '"'
@ -852,13 +871,13 @@ link = try $ do
-- a link like [this][ref] or [this][] or [this] -- a link like [this][ref] or [this][] or [this]
referenceLink label = do referenceLink label = do
ref <- option [] (try (optional (char ' ') >> ref <- option [] (try (optional (char ' ') >>
optional (newline >> skipSpaces) >> reference)) optional (newline >> skipSpaces) >> reference))
let ref' = if null ref then label else ref let ref' = if null ref then label else ref
state <- getState state <- getState
case lookupKeySrc (stateKeys state) ref' of case lookupKeySrc (stateKeys state) ref' of
Nothing -> fail "no corresponding key" Nothing -> fail "no corresponding key"
Just target -> return target Just target -> return target
emailAddress = try $ do emailAddress = try $ do
name <- many1 (alphaNum <|> char '+') name <- many1 (alphaNum <|> char '+')
@ -879,7 +898,7 @@ autoLink = try $ do
char '>' char '>'
let src' = if "mailto:" `isPrefixOf` src let src' = if "mailto:" `isPrefixOf` src
then drop 7 src then drop 7 src
else src else src
st <- getState st <- getState
return $ if stateStrict st return $ if stateStrict st
then Link [Str src'] (src, "") then Link [Str src'] (src, "")
@ -910,7 +929,7 @@ rawLaTeXInline' = failIfStrict >> rawLaTeXInline
rawHtmlInline' = do rawHtmlInline' = do
st <- getState st <- getState
result <- choice $ if stateStrict st result <- choice $ if stateStrict st
then [htmlBlockElement, anyHtmlTag, anyHtmlEndTag] then [htmlBlockElement, anyHtmlTag, anyHtmlEndTag]
else [htmlBlockElement, anyHtmlInlineTag] else [htmlBlockElement, anyHtmlInlineTag]
return $ HtmlInline result return $ HtmlInline result

View file

@ -34,6 +34,7 @@ module Text.Pandoc.Shared (
substitute, substitute,
joinWithSep, joinWithSep,
-- * Text processing -- * Text processing
isPunctuation,
backslashEscapes, backslashEscapes,
escapeStringUsing, escapeStringUsing,
stripTrailingNewlines, stripTrailingNewlines,
@ -91,6 +92,7 @@ module Text.Pandoc.Shared (
Element (..), Element (..),
hierarchicalize, hierarchicalize,
isHeaderBlock, isHeaderBlock,
uniqueIdentifiers,
-- * Writer options -- * Writer options
WriterOptions (..), WriterOptions (..),
defaultWriterOptions defaultWriterOptions
@ -102,7 +104,7 @@ import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty )
import qualified Text.PrettyPrint.HughesPJ as PP import qualified Text.PrettyPrint.HughesPJ as PP
import Text.Pandoc.CharacterReferences ( characterReference ) import Text.Pandoc.CharacterReferences ( characterReference )
import Data.Char ( toLower, toUpper, ord, isLower, isUpper ) import Data.Char ( toLower, toUpper, ord, isLower, isUpper )
import Data.List ( find, isPrefixOf ) import Data.List ( find, isPrefixOf, intersperse )
import Control.Monad ( join ) import Control.Monad ( join )
-- --
@ -144,6 +146,15 @@ joinWithSep sep lst = foldr1 (\a b -> a ++ sep ++ b) lst
-- Text processing -- Text processing
-- --
-- | True if character is a punctuation character (unicode).
isPunctuation :: Char -> Bool
isPunctuation c =
let c' = ord c
in if c `elem` "!\"'()*,-./:;<>?[\\]`{|}~" || c' >= 0x2000 && c' <= 0x206F ||
c' >= 0xE000 && c' <= 0xE0FF
then True
else False
-- | Returns an association list of backslash escapes for the -- | Returns an association list of backslash escapes for the
-- designated characters. -- designated characters.
backslashEscapes :: [Char] -- ^ list of special characters to escape backslashEscapes :: [Char] -- ^ list of special characters to escape
@ -566,7 +577,8 @@ data ParserState = ParserState
stateStrict :: Bool, -- ^ Use strict markdown syntax? stateStrict :: Bool, -- ^ Use strict markdown syntax?
stateSmart :: Bool, -- ^ Use smart typography? stateSmart :: Bool, -- ^ Use smart typography?
stateColumns :: Int, -- ^ Number of columns in terminal stateColumns :: Int, -- ^ Number of columns in terminal
stateHeaderTable :: [HeaderType] -- ^ Ordered list of header types used stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used
stateHeaders :: [[Inline]] -- ^ List of header texts used
} }
deriving Show deriving Show
@ -585,7 +597,8 @@ defaultParserState =
stateStrict = False, stateStrict = False,
stateSmart = False, stateSmart = False,
stateColumns = 80, stateColumns = 80,
stateHeaderTable = [] } stateHeaderTable = [],
stateHeaders = [] }
data HeaderType data HeaderType
= SingleHeader Char -- ^ Single line of characters underneath = SingleHeader Char -- ^ Single line of characters underneath
@ -787,6 +800,43 @@ isHeaderBlock :: Block -> Bool
isHeaderBlock (Header _ _) = True isHeaderBlock (Header _ _) = True
isHeaderBlock _ = False isHeaderBlock _ = False
-- | Convert Pandoc inline list to plain text identifier.
inlineListToIdentifier :: [Inline] -> String
inlineListToIdentifier [] = ""
inlineListToIdentifier (x:xs) =
xAsText ++ inlineListToIdentifier xs
where xAsText = case x of
Str s -> filter (\c -> c == '-' || not (isPunctuation c)) $
concat $ intersperse "-" $ words $ map toLower s
Emph lst -> inlineListToIdentifier lst
Strikeout lst -> inlineListToIdentifier lst
Superscript lst -> inlineListToIdentifier lst
Subscript lst -> inlineListToIdentifier lst
Strong lst -> inlineListToIdentifier lst
Quoted _ lst -> inlineListToIdentifier lst
Code s -> s
Space -> "-"
EmDash -> "-"
EnDash -> "-"
Apostrophe -> ""
Ellipses -> ""
LineBreak -> "-"
TeX _ -> ""
HtmlInline _ -> ""
Link lst _ -> inlineListToIdentifier lst
Image lst _ -> inlineListToIdentifier lst
Note _ -> ""
-- | Return unique identifiers for list of inline lists.
uniqueIdentifiers :: [[Inline]] -> [String]
uniqueIdentifiers ls =
let addIdentifier (nonuniqueIds, uniqueIds) l =
let new = inlineListToIdentifier l
matches = length $ filter (== new) nonuniqueIds
new' = new ++ if matches > 0 then ("-" ++ show matches) else ""
in (new:nonuniqueIds, new':uniqueIds)
in reverse $ snd $ foldl addIdentifier ([],[]) ls
-- --
-- Writer options -- Writer options
-- --

View file

@ -193,15 +193,6 @@ obfuscateChar char =
obfuscateString :: String -> String obfuscateString :: String -> String
obfuscateString = concatMap obfuscateChar . decodeCharacterReferences obfuscateString = concatMap obfuscateChar . decodeCharacterReferences
-- | True if character is a punctuation character (unicode).
isPunctuation :: Char -> Bool
isPunctuation c =
let c' = ord c
in if c `elem` "!\"'()*,-./:;<>?[\\]`{|}~" || c' >= 0x2000 && c' <= 0x206F ||
c' >= 0xE000 && c' <= 0xE0FF
then True
else False
-- | Add CSS for document header. -- | Add CSS for document header.
addToCSS :: String -> State WriterState () addToCSS :: String -> State WriterState ()
addToCSS item = do addToCSS item = do
@ -209,43 +200,6 @@ addToCSS item = do
let current = stCSS st let current = stCSS st
put $ st {stCSS = S.insert item current} put $ st {stCSS = S.insert item current}
-- | Convert Pandoc inline list to plain text identifier.
inlineListToIdentifier :: [Inline] -> String
inlineListToIdentifier [] = ""
inlineListToIdentifier (x:xs) =
xAsText ++ inlineListToIdentifier xs
where xAsText = case x of
Str s -> filter (\c -> c == '-' || not (isPunctuation c)) $
concat $ intersperse "-" $ words $ map toLower s
Emph lst -> inlineListToIdentifier lst
Strikeout lst -> inlineListToIdentifier lst
Superscript lst -> inlineListToIdentifier lst
Subscript lst -> inlineListToIdentifier lst
Strong lst -> inlineListToIdentifier lst
Quoted _ lst -> inlineListToIdentifier lst
Code s -> s
Space -> "-"
EmDash -> "-"
EnDash -> "-"
Apostrophe -> ""
Ellipses -> ""
LineBreak -> "-"
TeX _ -> ""
HtmlInline _ -> ""
Link lst _ -> inlineListToIdentifier lst
Image lst _ -> inlineListToIdentifier lst
Note _ -> ""
-- | Return unique identifiers for list of inline lists.
uniqueIdentifiers :: [[Inline]] -> [String]
uniqueIdentifiers ls =
let addIdentifier (nonuniqueIds, uniqueIds) l =
let new = inlineListToIdentifier l
matches = length $ filter (== new) nonuniqueIds
new' = new ++ if matches > 0 then ("-" ++ show matches) else ""
in (new:nonuniqueIds, new':uniqueIds)
in reverse $ snd $ foldl addIdentifier ([],[]) ls
-- | Convert Pandoc block element to HTML. -- | Convert Pandoc block element to HTML.
blockToHtml :: WriterOptions -> Block -> State WriterState Html blockToHtml :: WriterOptions -> Block -> State WriterState Html
blockToHtml opts Null = return $ noHtml blockToHtml opts Null = return $ noHtml