CommonMark reader: Handle ascii_identifiers extension (#4733)

Non-ascii characters were not stripped from identifiers even if the
`ascii_identifiers` extension was enabled (which it is by default for
gfm).

Closes #4742
This commit is contained in:
Anders Waldenborg 2018-06-29 10:41:26 +02:00 committed by John MacFarlane
parent bb5a2464d5
commit 904924d172
2 changed files with 42 additions and 12 deletions

View file

@ -39,7 +39,9 @@ import Control.Monad.State
import Data.Char (isAlphaNum, isLetter, isSpace, toLower)
import Data.List (groupBy)
import qualified Data.Map as Map
import Data.Maybe (mapMaybe)
import Data.Text (Text, unpack)
import Text.Pandoc.Asciify (toAsciiChar)
import Text.Pandoc.Class (PandocMonad)
import Text.Pandoc.Definition
import Text.Pandoc.Emoji (emojis)
@ -51,7 +53,7 @@ import Text.Pandoc.Walk (walkM)
readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readCommonMark opts s = return $
(if isEnabled Ext_gfm_auto_identifiers opts
then addHeaderIdentifiers
then addHeaderIdentifiers opts
else id) $
nodeToPandoc opts $ commonmarkToNode opts' exts s
where opts' = [ optSmart | isEnabled Ext_smart opts ]
@ -70,13 +72,13 @@ convertEmojis (':':xs) =
convertEmojis (x:xs) = x : convertEmojis xs
convertEmojis [] = []
addHeaderIdentifiers :: Pandoc -> Pandoc
addHeaderIdentifiers doc = evalState (walkM addHeaderId doc) mempty
-- | Give every header in the document an identifier derived from its
-- text.  The document is walked in a 'State' monad whose map starts out
-- empty; the reader options are handed on to 'addHeaderId'.
addHeaderIdentifiers :: ReaderOptions -> Pandoc -> Pandoc
addHeaderIdentifiers opts = flip evalState mempty . walkM (addHeaderId opts)
addHeaderId :: Block -> State (Map.Map String Int) Block
addHeaderId (Header lev (_,classes,kvs) ils) = do
addHeaderId :: ReaderOptions -> Block -> State (Map.Map String Int) Block
addHeaderId opts (Header lev (_,classes,kvs) ils) = do
idmap <- get
let ident = toIdent ils
let ident = toIdent opts ils
ident' <- case Map.lookup ident idmap of
Nothing -> do
put (Map.insert ident 1 idmap)
@ -85,13 +87,16 @@ addHeaderId (Header lev (_,classes,kvs) ils) = do
put (Map.adjust (+ 1) ident idmap)
return (ident ++ "-" ++ show i)
return $ Header lev (ident',classes,kvs) ils
addHeaderId x = return x
addHeaderId _ x = return x
toIdent :: [Inline] -> String
toIdent = map (\c -> if isSpace c then '-' else c)
. filter (\c -> isLetter c || isAlphaNum c || isSpace c ||
c == '_' || c == '-')
. map toLower . stringify
-- | Build a header identifier from the header's inline content.
--
-- The stringified text is lower-cased, optionally transliterated, reduced
-- to the characters allowed in an identifier (letters, alphanumerics,
-- whitespace, @_@ and @-@), and finally every whitespace character is
-- replaced by a hyphen.  When 'Ext_ascii_identifiers' is enabled, each
-- character is passed through 'toAsciiChar' and dropped if that returns
-- 'Nothing'.
toIdent :: ReaderOptions -> [Inline] -> String
toIdent opts = map (\c -> if isSpace c then '-' else c)
             . filter isIdentChar
             . asciify
             . map toLower . stringify
  where
    -- Transliteration is an extra step, not a substitute for the
    -- allowed-character filter: the filter has to run in both modes so
    -- that whatever 'toAsciiChar' lets through is still checked.
    -- NOTE(review): presumably 'toAsciiChar' passes ASCII punctuation
    -- such as @!@ or @,@ unchanged -- confirm in Text.Pandoc.Asciify.
    asciify
      | isEnabled Ext_ascii_identifiers opts = mapMaybe toAsciiChar
      | otherwise                            = id
    isIdentChar c = isLetter c || isAlphaNum c || isSpace c ||
                    c == '_' || c == '-'
nodeToPandoc :: ReaderOptions -> Node -> Pandoc
nodeToPandoc opts (Node _ DOCUMENT nodes) =

25
test/command/4742.md Normal file
View file

@ -0,0 +1,25 @@
Check that the commonmark reader handles the `ascii_identifiers`
extension properly.
```
% pandoc -f commonmark+gfm_auto_identifiers+ascii_identifiers -t native
# non ascii ⚠️ räksmörgås
^D
[Header 1 ("non-ascii--raksmorgas",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]]
```
```
% pandoc -f commonmark+gfm_auto_identifiers-ascii_identifiers -t native
# non ascii ⚠️ räksmörgås
^D
[Header 1 ("non-ascii-\65039-r\228ksm\246rg\229s",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]]
```
`gfm` should have `ascii_identifiers` enabled by default.
```
% pandoc -f gfm -t native
# non ascii ⚠️ räksmörgås
^D
[Header 1 ("non-ascii--raksmorgas",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]]
```