New treatment of dashes in --smart mode.

* `---` is always em-dash, `--` is always en-dash.
* pandoc no longer tries to guess when `-` should be en-dash.
* A new option, `--old-dashes`, is provided for legacy documents.

Rationale: The rules for en-dashes are too complex and
language-dependent for a guesser to work reliably.  This
change gives users greater control.  The alternative of typing
literal Unicode dashes isn't very good, since Unicode em-dashes
and en-dashes are barely distinguishable in a monospace font.
This commit is contained in:
John MacFarlane 2012-01-01 13:48:28 -08:00
parent 3cf60c7306
commit da8425598a
6 changed files with 56 additions and 15 deletions

16
README
View file

@ -206,11 +206,17 @@ Options
`-S`, `--smart` `-S`, `--smart`
: Produce typographically correct output, converting straight quotes : Produce typographically correct output, converting straight quotes
to curly quotes, `---` and `--` to dashes, and `...` to ellipses. to curly quotes, `---` to em-dashes, `--` to en-dashes, and
Nonbreaking spaces are inserted after certain abbreviations, such `...` to ellipses. Nonbreaking spaces are inserted after certain
as "Mr." (Note: This option is significant only when the input format is abbreviations, such as "Mr." (Note: This option is significant only when
`markdown` or `textile`. It is selected automatically when the input the input format is `markdown` or `textile`. It is selected automatically
format is `textile` or the output format is `latex` or `context`.) when the input format is `textile` or the output format is `latex` or
`context`.)
`--old-dashes`
: Selects the pandoc <= 1.8.2.1 behavior for parsing smart dashes: `-` before
a numeral is an en-dash, and `--` is an em-dash. This option is selected
automatically for `textile` input.
`-5`, `--html5` `-5`, `--html5`
: Produce HTML5 instead of HTML4. This option has no effect for writers : Produce HTML5 instead of HTML4. This option has no effect for writers

View file

@ -614,6 +614,9 @@ data ParserState = ParserState
stateDate :: [Inline], -- ^ Date of document stateDate :: [Inline], -- ^ Date of document
stateStrict :: Bool, -- ^ Use strict markdown syntax? stateStrict :: Bool, -- ^ Use strict markdown syntax?
stateSmart :: Bool, -- ^ Use smart typography? stateSmart :: Bool, -- ^ Use smart typography?
stateOldDashes :: Bool, -- ^ Use pandoc <= 1.8.2.1 behavior
-- in parsing dashes: `--` is em-dash;
-- `-` before numeral is en-dash
stateLiterateHaskell :: Bool, -- ^ Treat input as literate haskell stateLiterateHaskell :: Bool, -- ^ Treat input as literate haskell
stateColumns :: Int, -- ^ Number of columns in terminal stateColumns :: Int, -- ^ Number of columns in terminal
stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used
@ -642,6 +645,7 @@ defaultParserState =
stateDate = [], stateDate = [],
stateStrict = False, stateStrict = False,
stateSmart = False, stateSmart = False,
stateOldDashes = False,
stateLiterateHaskell = False, stateLiterateHaskell = False,
stateColumns = 80, stateColumns = 80,
stateHeaderTable = [], stateHeaderTable = [],
@ -788,17 +792,37 @@ ellipses = do
try (charOrRef "\8230\133") <|> try (string "..." >> return '…') try (charOrRef "\8230\133") <|> try (string "..." >> return '…')
return (Str "\8230") return (Str "\8230")
dash :: GenParser Char st Inline dash :: GenParser Char ParserState Inline
dash = enDash <|> emDash dash = do
oldDashes <- stateOldDashes `fmap` getState
if oldDashes
then emDashOld <|> enDashOld
else Str `fmap` (hyphenDash <|> emDash <|> enDash)
enDash :: GenParser Char st Inline -- Two hyphens = en-dash, three = em-dash
hyphenDash :: GenParser Char st String
hyphenDash = do
try $ string "--"
option "\8211" (char '-' >> return "\8212")
emDash :: GenParser Char st String
emDash = do
try (charOrRef "\8212\151")
return "\8212"
enDash :: GenParser Char st String
enDash = do enDash = do
try (charOrRef "\8212\151")
return "\8211"
enDashOld :: GenParser Char st Inline
enDashOld = do
try (charOrRef "\8211\150") <|> try (charOrRef "\8211\150") <|>
try (char '-' >> lookAhead (satisfy isDigit) >> return '\8211') try (char '-' >> lookAhead (satisfy isDigit) >> return '\8211')
return (Str "\8211") return (Str "\8211")
emDash :: GenParser Char st Inline emDashOld :: GenParser Char st Inline
emDash = do emDashOld = do
try (charOrRef "\8212\151") <|> (try $ string "--" >> optional (char '-') >> return '-') try (charOrRef "\8212\151") <|> (try $ string "--" >> optional (char '-') >> return '-')
return (Str "\8212") return (Str "\8212")

View file

@ -68,7 +68,8 @@ import Control.Monad ( guard, liftM )
readTextile :: ParserState -- ^ Parser state, including options for parser readTextile :: ParserState -- ^ Parser state, including options for parser
-> String -- ^ String to parse (assuming @'\n'@ line endings) -> String -- ^ String to parse (assuming @'\n'@ line endings)
-> Pandoc -> Pandoc
readTextile state s = (readWith parseTextile) state (s ++ "\n\n") readTextile state s =
(readWith parseTextile) state{ stateOldDashes = True } (s ++ "\n\n")
-- --

View file

@ -103,6 +103,7 @@ data Opt = Opt
, optSelfContained :: Bool -- ^ Make HTML accessible offline , optSelfContained :: Bool -- ^ Make HTML accessible offline
, optXeTeX :: Bool -- ^ Format latex for xetex , optXeTeX :: Bool -- ^ Format latex for xetex
, optSmart :: Bool -- ^ Use smart typography , optSmart :: Bool -- ^ Use smart typography
, optOldDashes :: Bool -- ^ Parse dashes like pandoc <=1.8.2.1
, optHtml5 :: Bool -- ^ Produce HTML5 in HTML , optHtml5 :: Bool -- ^ Produce HTML5 in HTML
, optHighlight :: Bool -- ^ Highlight source code , optHighlight :: Bool -- ^ Highlight source code
, optHighlightStyle :: Style -- ^ Style to use for highlighted code , optHighlightStyle :: Style -- ^ Style to use for highlighted code
@ -149,6 +150,7 @@ defaultOpts = Opt
, optSelfContained = False , optSelfContained = False
, optXeTeX = False , optXeTeX = False
, optSmart = False , optSmart = False
, optOldDashes = False
, optHtml5 = False , optHtml5 = False
, optHighlight = True , optHighlight = True
, optHighlightStyle = pygments , optHighlightStyle = pygments
@ -245,6 +247,12 @@ options =
(\opt -> return opt { optSmart = True })) (\opt -> return opt { optSmart = True }))
"" -- "Use smart quotes, dashes, and ellipses" "" -- "Use smart quotes, dashes, and ellipses"
, Option "" ["old-dashes"]
(NoArg
(\opt -> return opt { optSmart = True
, optOldDashes = True }))
"" -- "Use pandoc <= 1.8.2.1 behavior in parsing dashes"
, Option "5" ["html5"] , Option "5" ["html5"]
(NoArg (NoArg
(\opt -> do (\opt -> do
@ -735,6 +743,7 @@ main = do
, optIncremental = incremental , optIncremental = incremental
, optSelfContained = selfContained , optSelfContained = selfContained
, optSmart = smart , optSmart = smart
, optOldDashes = oldDashes
, optHtml5 = html5 , optHtml5 = html5
, optHighlight = highlight , optHighlight = highlight
, optHighlightStyle = highlightStyle , optHighlightStyle = highlightStyle
@ -858,6 +867,7 @@ main = do
stateCitations = map CSL.refId refs, stateCitations = map CSL.refId refs,
stateSmart = smart || writerName' `elem` stateSmart = smart || writerName' `elem`
["latex", "context", "latex+lhs", "beamer"], ["latex", "context", "latex+lhs", "beamer"],
stateOldDashes = oldDashes,
stateColumns = columns, stateColumns = columns,
stateStrict = strict, stateStrict = strict,
stateIndentedCodeClasses = codeBlockClasses, stateIndentedCodeClasses = codeBlockClasses,

View file

@ -165,14 +165,14 @@ Pandoc (Meta {docTitle = [Str "Pandoc",Space,Str "Test",Space,Str "Suite",Str ":
,([Str "city"], ,([Str "city"],
[[Para [Emph [Str "Nowhere"],Str ",",Space,Str "MA,",Space,Str "USA"]]]) [[Para [Emph [Str "Nowhere"],Str ",",Space,Str "MA,",Space,Str "USA"]]])
,([Str "phone"], ,([Str "phone"],
[[Para [Str "123",Str "\8211",Str "4567"]]])]] [[Para [Str "123",Str "-",Str "4567"]]])]]
,DefinitionList ,DefinitionList
[([Str "address"], [([Str "address"],
[[Para [Str "61",Space,Str "Main",Space,Str "St",Str "."]]]) [[Para [Str "61",Space,Str "Main",Space,Str "St",Str "."]]])
,([Str "city"], ,([Str "city"],
[[Para [Emph [Str "Nowhere"],Str ",",Space,Str "MA,",Space,Str "USA"]]]) [[Para [Emph [Str "Nowhere"],Str ",",Space,Str "MA,",Space,Str "USA"]]])
,([Str "phone"], ,([Str "phone"],
[[Para [Str "123",Str "\8211",Str "4567"]]])] [[Para [Str "123",Str "-",Str "4567"]]])]
,Header 1 [Str "HTML",Space,Str "Blocks"] ,Header 1 [Str "HTML",Space,Str "Blocks"]
,Para [Str "Simple",Space,Str "block",Space,Str "on",Space,Str "one",Space,Str "line",Str ":"] ,Para [Str "Simple",Space,Str "block",Space,Str "on",Space,Str "one",Space,Str "line",Str ":"]
,RawBlock "html" "<div>foo</div>\n" ,RawBlock "html" "<div>foo</div>\n"

View file

@ -492,9 +492,9 @@ So is 'pine.'
Here is some quoted '`code`' and a "[quoted link][1]". Here is some quoted '`code`' and a "[quoted link][1]".
Some dashes: one---two --- three--four -- five. Some dashes: one---two --- three---four --- five.
Dashes between numbers: 5-7, 255-66, 1987-1999. Dashes between numbers: 5--7, 255--66, 1987--1999.
Ellipses...and...and.... Ellipses...and...and....