From da8425598a8ab4a98388e8ee346a2ae7ec540aa0 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 1 Jan 2012 13:48:28 -0800 Subject: [PATCH] New treatment of dashes in --smart mode. * `---` is always em-dash, `--` is always en-dash. * pandoc no longer tries to guess when `-` should be en-dash. * A new option, `--old-dashes`, is provided for legacy documents. Rationale: The rules for en-dash are too complex and language-dependent for a guesser to work reliably. This change gives users greater control. The alternative of using unicode isn't very good, since unicode em- and en- dashes are barely distinguishable in a monospace font. --- README | 16 +++++++++----- src/Text/Pandoc/Parsing.hs | 34 +++++++++++++++++++++++++----- src/Text/Pandoc/Readers/Textile.hs | 3 ++- src/pandoc.hs | 10 +++++++++ tests/rst-reader.native | 4 ++-- tests/testsuite.txt | 4 ++-- 6 files changed, 56 insertions(+), 15 deletions(-) diff --git a/README b/README index 9380c895e..5b4de3942 100644 --- a/README +++ b/README @@ -206,11 +206,17 @@ Options `-S`, `--smart` : Produce typographically correct output, converting straight quotes - to curly quotes, `---` and `--` to dashes, and `...` to ellipses. - Nonbreaking spaces are inserted after certain abbreviations, such - as "Mr." (Note: This option is significant only when the input format is - `markdown` or `textile`. It is selected automatically when the input - format is `textile` or the output format is `latex` or `context`.) + to curly quotes, `---` to em-dashes, `--` to en-dashes, and + `...` to ellipses. Nonbreaking spaces are inserted after certain + abbreviations, such as "Mr." (Note: This option is significant only when + the input format is `markdown` or `textile`. It is selected automatically + when the input format is `textile` or the output format is `latex` or + `context`.) + +`--old-dashes` +: Selects the pandoc <= 1.8.2.1 behavior for parsing smart dashes: `-` before + a numeral is an en-dash, and `--` is an em-dash. 
This option is selected + automatically for `textile` input. `-5`, `--html5` : Produce HTML5 instead of HTML4. This option has no effect for writers diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index c2c512033..71da3a730 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -614,6 +614,9 @@ data ParserState = ParserState stateDate :: [Inline], -- ^ Date of document stateStrict :: Bool, -- ^ Use strict markdown syntax? stateSmart :: Bool, -- ^ Use smart typography? + stateOldDashes :: Bool, -- ^ Use pandoc <= 1.8.2.1 behavior + -- in parsing dashes; -- is em-dash; + -- hyphen before numeral is en-dash stateLiterateHaskell :: Bool, -- ^ Treat input as literate haskell stateColumns :: Int, -- ^ Number of columns in terminal stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used @@ -642,6 +645,7 @@ defaultParserState = stateDate = [], stateStrict = False, stateSmart = False, + stateOldDashes = False, stateLiterateHaskell = False, stateColumns = 80, stateHeaderTable = [], @@ -788,17 +792,37 @@ ellipses = do try (charOrRef "\8230\133") <|> try (string "..." 
>> return '…') return (Str "\8230") -dash :: GenParser Char st Inline -dash = enDash <|> emDash +dash :: GenParser Char ParserState Inline +dash = do + oldDashes <- stateOldDashes `fmap` getState + if oldDashes + then emDashOld <|> enDashOld + else Str `fmap` (hyphenDash <|> emDash <|> enDash) -enDash :: GenParser Char st Inline +-- Two hyphens = en-dash, three = em-dash +hyphenDash :: GenParser Char st String +hyphenDash = do + try $ string "--" + option "\8211" (char '-' >> return "\8212") + +emDash :: GenParser Char st String +emDash = do + try (charOrRef "\8212\151") + return "\8212" + +enDash :: GenParser Char st String enDash = do + try (charOrRef "\8211\150") + return "\8211" + +enDashOld :: GenParser Char st Inline +enDashOld = do try (charOrRef "\8211\150") <|> try (char '-' >> lookAhead (satisfy isDigit) >> return '–') return (Str "\8211") -emDash :: GenParser Char st Inline -emDash = do +emDashOld :: GenParser Char st Inline +emDashOld = do try (charOrRef "\8212\151") <|> (try $ string "--" >> optional (char '-') >> return '-') return (Str "\8212") diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index 4693bd06d..3b5954368 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -68,7 +68,8 @@ import Control.Monad ( guard, liftM ) readTextile :: ParserState -- ^ Parser state, including options for parser -> String -- ^ String to parse (assuming @'\n'@ line endings) -> Pandoc -readTextile state s = (readWith parseTextile) state (s ++ "\n\n") +readTextile state s = + (readWith parseTextile) state{ stateOldDashes = True } (s ++ "\n\n") -- diff --git a/src/pandoc.hs b/src/pandoc.hs index fc28c4c3f..3660fc167 100644 --- a/src/pandoc.hs +++ b/src/pandoc.hs @@ -103,6 +103,7 @@ data Opt = Opt , optSelfContained :: Bool -- ^ Make HTML accessible offline , optXeTeX :: Bool -- ^ Format latex for xetex , optSmart :: Bool -- ^ Use smart typography + , optOldDashes :: Bool -- ^ Parse dashes like 
pandoc <=1.8.2.1 , optHtml5 :: Bool -- ^ Produce HTML5 in HTML , optHighlight :: Bool -- ^ Highlight source code , optHighlightStyle :: Style -- ^ Style to use for highlighted code @@ -149,6 +150,7 @@ defaultOpts = Opt , optSelfContained = False , optXeTeX = False , optSmart = False + , optOldDashes = False , optHtml5 = False , optHighlight = True , optHighlightStyle = pygments @@ -245,6 +247,12 @@ options = (\opt -> return opt { optSmart = True })) "" -- "Use smart quotes, dashes, and ellipses" + , Option "" ["old-dashes"] + (NoArg + (\opt -> return opt { optSmart = True + , optOldDashes = True })) + "" -- "Use pandoc <= 1.8.2.1 behavior in parsing dashes (implies --smart)" + , Option "5" ["html5"] (NoArg (\opt -> do @@ -735,6 +743,7 @@ main = do , optIncremental = incremental , optSelfContained = selfContained , optSmart = smart + , optOldDashes = oldDashes , optHtml5 = html5 , optHighlight = highlight , optHighlightStyle = highlightStyle @@ -858,6 +867,7 @@ main = do stateCitations = map CSL.refId refs, stateSmart = smart || writerName' `elem` ["latex", "context", "latex+lhs", "beamer"], + stateOldDashes = oldDashes, stateColumns = columns, stateStrict = strict, stateIndentedCodeClasses = codeBlockClasses, diff --git a/tests/rst-reader.native b/tests/rst-reader.native index 8d273a1d7..e0eb4d438 100644 --- a/tests/rst-reader.native +++ b/tests/rst-reader.native @@ -165,14 +165,14 @@ Pandoc (Meta {docTitle = [Str "Pandoc",Space,Str "Test",Space,Str "Suite",Str ": ,([Str "city"], [[Para [Emph [Str "Nowhere"],Str ",",Space,Str "MA,",Space,Str "USA"]]]) ,([Str "phone"], - [[Para [Str "123",Str "\8211",Str "4567"]]])]] + [[Para [Str "123",Str "-",Str "4567"]]])]] ,DefinitionList [([Str "address"], [[Para [Str "61",Space,Str "Main",Space,Str "St",Str "."]]]) ,([Str "city"], [[Para [Emph [Str "Nowhere"],Str ",",Space,Str "MA,",Space,Str "USA"]]]) ,([Str "phone"], - [[Para [Str "123",Str "\8211",Str "4567"]]])] + [[Para [Str "123",Str "-",Str "4567"]]])] ,Header 1 [Str "HTML",Space,Str "Blocks"] 
,Para [Str "Simple",Space,Str "block",Space,Str "on",Space,Str "one",Space,Str "line",Str ":"] ,RawBlock "html" "
foo
\n" diff --git a/tests/testsuite.txt b/tests/testsuite.txt index ccee0764a..3bb5d8cb5 100644 --- a/tests/testsuite.txt +++ b/tests/testsuite.txt @@ -492,9 +492,9 @@ So is 'pine.' Here is some quoted '`code`' and a "[quoted link][1]". -Some dashes: one---two --- three--four -- five. +Some dashes: one---two --- three---four --- five. -Dashes between numbers: 5-7, 255-66, 1987-1999. +Dashes between numbers: 5--7, 255--66, 1987--1999. Ellipses...and...and....