More intelligent handling of text encodings.

Previously, UTF-8 was enforced for both input and output.

The new system:

* For input, UTF-8 is tried first; if an error is raised, the
  locale encoding is tried.
* For output, the locale encoding is always used.
This commit is contained in:
John MacFarlane 2012-09-23 22:12:21 -07:00
parent 31107741f0
commit 7272735b3d
5 changed files with 30 additions and 20 deletions

View file

@ -146,7 +146,6 @@ where
import Text.Pandoc.Definition
import Text.Pandoc.Options
import Text.Pandoc.Builder (Blocks)
import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn)
import Text.Parsec
import Text.Parsec.Pos (newPos)
import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit, isPunctuation )
@ -708,7 +707,7 @@ readWith parser state input =
-- Runs @parser@ over @str@ through 'readWith', starting from
-- 'defaultParserState', and prints the 'show'n result to standard output.
-- The two equations below are the before/after pair of this hunk: the
-- UTF8.putStrLn line is the old version and the plain putStrLn line is its
-- replacement.
testStringWith :: (Show a) => Parser [Char] ParserState a
-> String
-> IO ()
testStringWith parser str = UTF8.putStrLn $ show $
testStringWith parser str = putStrLn $ show $
readWith parser defaultParserState str
-- | Parsing options.

View file

@ -90,7 +90,7 @@ import Paths_pandoc (getDataFileName)
import Text.Pandoc.Pretty (charWidth)
import System.Locale (defaultTimeLocale)
import Data.Time
import System.IO (stderr)
import System.IO (stderr, hPutStrLn)
import Text.HTML.TagSoup (renderTagsOptions, RenderOptions(..), Tag(..),
renderOptions)
@ -503,14 +503,14 @@ readDataFile userDir fname = findDataFile userDir fname >>= UTF8.readFile
-- Writes "<program name>: <msg>" to stderr, then terminates the program
-- with @ExitFailure exitCode@.
-- The two stderr lines below are the before/after pair of this hunk
-- (UTF8.hPutStrLn is the old call, System.IO's hPutStrLn the new one).
err :: Int -> String -> IO a
err exitCode msg = do
name <- getProgName
UTF8.hPutStrLn stderr $ name ++ ": " ++ msg
hPutStrLn stderr $ name ++ ": " ++ msg
exitWith $ ExitFailure exitCode
-- NOTE(review): exitWith does not return normally, so this line is
-- presumably never reached and only pins the polymorphic result type.
return undefined
-- Writes "<program name>: <msg>" to stderr and returns; unlike 'err' it
-- does not exit. The two stderr lines below are the before/after pair of
-- this hunk (UTF8.hPutStrLn is the old call, hPutStrLn the new one).
warn :: String -> IO ()
warn msg = do
name <- getProgName
UTF8.hPutStrLn stderr $ name ++ ": " ++ msg
hPutStrLn stderr $ name ++ ": " ++ msg
--
-- Safe read

View file

@ -25,7 +25,11 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Stability : alpha
Portability : portable
UTF-8 aware string IO functions that will work with GHC 6.10, 6.12, or 7.
UTF-8 aware string IO functions that will work with GHC 6.12 or 7.
The reading functions first attempt to read UTF-8; if an encoding
error is encountered, the locale encoding is used instead. This
should work well in practice because text in other encodings
is usually not valid UTF-8.
-}
module Text.Pandoc.UTF8 ( readFile
, writeFile
@ -45,10 +49,11 @@ where
#else
import Codec.Binary.UTF8.String (encodeString, decodeString)
#endif
import Control.Exception (catch, throwIO)
import GHC.IO.Exception (IOException(..), IOErrorType(..))
import System.IO hiding (readFile, writeFile, getContents,
putStr, putStrLn, hPutStr, hPutStrLn, hGetContents)
import Prelude hiding (readFile, writeFile, getContents, putStr, putStrLn )
import Prelude hiding (readFile, writeFile, getContents, putStr, putStrLn, catch )
import qualified System.IO as IO
readFile :: FilePath -> IO String
@ -75,7 +80,14 @@ hPutStrLn :: Handle -> String -> IO ()
hPutStrLn h s = hSetEncoding h utf8 >> IO.hPutStrLn h s
-- Reads the contents of handle @h@ as text. Of the two definitions below,
-- the one-line version (UTF-8 only) is the old side of this hunk and the
-- @do@ version is its replacement.
hGetContents :: Handle -> IO String
hGetContents h = hSetEncoding h utf8_bom >> IO.hGetContents h
hGetContents h = do
-- First attempt: decode as UTF-8 (the utf8_bom variant).
hSetEncoding h utf8_bom
-- If the read raises an IOException whose type is InvalidArgument, switch
-- the handle to the locale encoding and read again; any other IOException
-- is re-thrown unchanged.
-- NOTE(review): IO.hGetContents is lazy, so a decoding error may only
-- surface when the returned String is consumed, i.e. after this 'catch'
-- has already returned -- confirm the fallback really triggers on
-- non-UTF-8 input.
catch (IO.hGetContents h) $ \e ->
case ioe_type e of
InvalidArgument -> do
hSetEncoding h localeEncoding
IO.hGetContents h
_ -> throwIO e
encodePath :: FilePath -> FilePath
decodeArg :: String -> String

View file

@ -33,8 +33,7 @@ import System.FilePath ( (</>) )
import qualified Data.ByteString.Lazy as B
import qualified Data.Map as M
import Data.ByteString.Lazy.UTF8 ( fromString, toString )
import Text.Pandoc.UTF8 as UTF8
import System.IO ( stderr )
import System.IO ( stderr, hPutStrLn )
import Codec.Archive.Zip
import Data.Time.Clock.POSIX
import Paths_pandoc ( getDataFileName )
@ -663,7 +662,7 @@ inlineToOpenXML opts (Image alt (src, tit)) = do
, mknode "wp:docPr" [("descr",tit),("id","1"),("name","Picture")] ()
, graphic ] ]
else do
liftIO $ UTF8.hPutStrLn stderr $
liftIO $ hPutStrLn stderr $
"Could not find image `" ++ src ++ "', skipping..."
inlinesToOpenXML opts alt

View file

@ -46,7 +46,7 @@ import System.Console.GetOpt
import Data.Char ( toLower )
import Data.List ( intercalate, isPrefixOf )
import System.Directory ( getAppUserDataDirectory, doesFileExist, findExecutable )
import System.IO ( stdout )
import System.IO ( stdout, hPutStr, hPutStrLn )
import System.IO.Error ( isDoesNotExistError )
import qualified Control.Exception as E
import Control.Exception.Extensible ( throwIO )
@ -312,7 +312,7 @@ options =
(\arg _ -> do
templ <- getDefaultTemplate Nothing arg
case templ of
Right t -> UTF8.hPutStr stdout t
Right t -> hPutStr stdout t
Left e -> error $ show e
exitWith ExitSuccess)
"FORMAT")
@ -663,7 +663,7 @@ options =
(NoArg
(\_ -> do
prg <- getProgName
UTF8.hPutStrLn stdout (prg ++ " " ++ pandocVersion ++ compileInfo ++
hPutStrLn stdout (prg ++ " " ++ pandocVersion ++ compileInfo ++
copyrightMessage)
exitWith ExitSuccess ))
"" -- "Print version"
@ -672,7 +672,7 @@ options =
(NoArg
(\_ -> do
prg <- getProgName
UTF8.hPutStr stdout (usageMessage prg options)
hPutStr stdout (usageMessage prg options)
exitWith ExitSuccess ))
"" -- "Show help"
@ -827,8 +827,8 @@ main = do
} = opts
when dumpArgs $
do UTF8.hPutStrLn stdout outputFile
mapM_ (\arg -> UTF8.hPutStrLn stdout arg) args
do hPutStrLn stdout outputFile
mapM_ (\arg -> hPutStrLn stdout arg) args
exitWith ExitSuccess
let sources = if ignoreArgs then [] else args
@ -1026,8 +1026,8 @@ main = do
writeBinary = B.writeFile (UTF8.encodePath outputFile)
let writerFn :: FilePath -> String -> IO ()
writerFn "-" = UTF8.putStr
writerFn f = UTF8.writeFile f
writerFn "-" = putStr
writerFn f = writeFile f
case getWriter writerName' of
Left e -> err 9 e