CommonMark reader: Handle ascii_identifiers extension (#4733)
Non-ascii characters were not stripped from identifiers even if the `ascii_identifiers` extension was enabled (which it is by default for gfm). Closes #4742.
This commit is contained in:
parent
bb5a2464d5
commit
904924d172
2 changed files with 42 additions and 12 deletions
|
@ -39,7 +39,9 @@ import Control.Monad.State
|
|||
import Data.Char (isAlphaNum, isLetter, isSpace, toLower)
|
||||
import Data.List (groupBy)
|
||||
import qualified Data.Map as Map
|
||||
import Data.Maybe (mapMaybe)
|
||||
import Data.Text (Text, unpack)
|
||||
import Text.Pandoc.Asciify (toAsciiChar)
|
||||
import Text.Pandoc.Class (PandocMonad)
|
||||
import Text.Pandoc.Definition
|
||||
import Text.Pandoc.Emoji (emojis)
|
||||
|
@ -51,7 +53,7 @@ import Text.Pandoc.Walk (walkM)
|
|||
readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
|
||||
readCommonMark opts s = return $
|
||||
(if isEnabled Ext_gfm_auto_identifiers opts
|
||||
then addHeaderIdentifiers
|
||||
then addHeaderIdentifiers opts
|
||||
else id) $
|
||||
nodeToPandoc opts $ commonmarkToNode opts' exts s
|
||||
where opts' = [ optSmart | isEnabled Ext_smart opts ]
|
||||
|
@ -70,13 +72,13 @@ convertEmojis (':':xs) =
|
|||
convertEmojis (x:xs) = x : convertEmojis xs
|
||||
convertEmojis [] = []
|
||||
|
||||
addHeaderIdentifiers :: Pandoc -> Pandoc
|
||||
addHeaderIdentifiers doc = evalState (walkM addHeaderId doc) mempty
|
||||
addHeaderIdentifiers :: ReaderOptions -> Pandoc -> Pandoc
|
||||
addHeaderIdentifiers opts doc = evalState (walkM (addHeaderId opts) doc) mempty
|
||||
|
||||
addHeaderId :: Block -> State (Map.Map String Int) Block
|
||||
addHeaderId (Header lev (_,classes,kvs) ils) = do
|
||||
addHeaderId :: ReaderOptions -> Block -> State (Map.Map String Int) Block
|
||||
addHeaderId opts (Header lev (_,classes,kvs) ils) = do
|
||||
idmap <- get
|
||||
let ident = toIdent ils
|
||||
let ident = toIdent opts ils
|
||||
ident' <- case Map.lookup ident idmap of
|
||||
Nothing -> do
|
||||
put (Map.insert ident 1 idmap)
|
||||
|
@ -85,13 +87,16 @@ addHeaderId (Header lev (_,classes,kvs) ils) = do
|
|||
put (Map.adjust (+ 1) ident idmap)
|
||||
return (ident ++ "-" ++ show i)
|
||||
return $ Header lev (ident',classes,kvs) ils
|
||||
addHeaderId x = return x
|
||||
addHeaderId _ x = return x
|
||||
|
||||
toIdent :: [Inline] -> String
|
||||
toIdent = map (\c -> if isSpace c then '-' else c)
|
||||
. filter (\c -> isLetter c || isAlphaNum c || isSpace c ||
|
||||
c == '_' || c == '-')
|
||||
. map toLower . stringify
|
||||
toIdent :: ReaderOptions -> [Inline] -> String
|
||||
toIdent opts = map (\c -> if isSpace c then '-' else c)
|
||||
. filterer
|
||||
. map toLower . stringify
|
||||
where filterer = if isEnabled Ext_ascii_identifiers opts
|
||||
then mapMaybe toAsciiChar
|
||||
else filter (\c -> isLetter c || isAlphaNum c || isSpace c ||
|
||||
c == '_' || c == '-')
|
||||
|
||||
nodeToPandoc :: ReaderOptions -> Node -> Pandoc
|
||||
nodeToPandoc opts (Node _ DOCUMENT nodes) =
|
||||
|
|
25
test/command/4742.md
Normal file
25
test/command/4742.md
Normal file
|
@ -0,0 +1,25 @@
|
|||
Check that the commonmark reader handles the `ascii_identifiers`
|
||||
extension properly.
|
||||
|
||||
```
|
||||
% pandoc -f commonmark+gfm_auto_identifiers+ascii_identifiers -t native
|
||||
# non ascii ⚠️ räksmörgås
|
||||
^D
|
||||
[Header 1 ("non-ascii--raksmorgas",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]]
|
||||
```
|
||||
|
||||
```
|
||||
% pandoc -f commonmark+gfm_auto_identifiers-ascii_identifiers -t native
|
||||
# non ascii ⚠️ räksmörgås
|
||||
^D
|
||||
[Header 1 ("non-ascii-\65039-r\228ksm\246rg\229s",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]]
|
||||
```
|
||||
|
||||
`gfm` should have `ascii_identifiers` enabled by default.
|
||||
|
||||
```
|
||||
% pandoc -f gfm -t native
|
||||
# non ascii ⚠️ räksmörgås
|
||||
^D
|
||||
[Header 1 ("non-ascii--raksmorgas",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]]
|
||||
```
|
Loading…
Add table
Reference in a new issue