Simplified and improved UTF8 handling:

+ Removed utf8-string cabal configuration flag.
+ Instead, we just include System.IO.UTF8 and Codec.Binary.UTF8.String
  from utf8-string package in the source tree, avoiding a dependency
  on utf8-string and avoiding crufty custom UTF8 code. (The old
  Text.Pandoc.UTF8 had problems with the getContents function.)
+ Removed lots of CPP directives that are no longer needed.
+ In Setup.hs, use '-i..' in running RunTests.hs, so the local UTF8
  code will be found.


git-svn-id: https://pandoc.googlecode.com/svn/trunk@1411 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
fiddlosopher 2008-08-14 01:32:54 +00:00
parent 8388427d23
commit d00da31622
12 changed files with 245 additions and 153 deletions

View file

@ -0,0 +1,97 @@
--
-- |
-- Module : Codec.Binary.UTF8.String
-- Copyright : (c) Eric Mertens 2007
-- License : BSD3-style (see LICENSE)
--
-- Maintainer: emertens@galois.com
-- Stability : experimental
-- Portability : portable
--
-- Support for encoding UTF8 Strings to and from @[Word8]@
--
module Codec.Binary.UTF8.String (
encode
, decode
, encodeString
, decodeString
) where
import Data.Word (Word8)
import Data.Bits ((.|.),(.&.),shiftL,shiftR)
import Data.Char (chr,ord)
default(Int)
-- | Encode a string using 'encode' and store the result in a 'String'.
encodeString :: String -> String
encodeString xs = map (toEnum . fromEnum) (encode xs)
-- | Decode a string using 'decode' using a 'String' as input.
-- | This is not safe but it is necessary if UTF-8 encoded text
-- | has been loaded into a 'String' prior to being decoded.
decodeString :: String -> String
decodeString xs = decode (map (toEnum . fromEnum) xs)
replacement_character :: Char
replacement_character = '\xfffd'
-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
encode :: String -> [Word8]
encode = concatMap (map fromIntegral . go . ord)
where
go oc
| oc <= 0x7f = [oc]
| oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6)
, 0x80 + oc .&. 0x3f
]
| oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12)
, 0x80 + ((oc `shiftR` 6) .&. 0x3f)
, 0x80 + oc .&. 0x3f
]
| otherwise = [ 0xf0 + (oc `shiftR` 18)
, 0x80 + ((oc `shiftR` 12) .&. 0x3f)
, 0x80 + ((oc `shiftR` 6) .&. 0x3f)
, 0x80 + oc .&. 0x3f
]
--
-- | Decode a UTF8 string packed into a list of Word8 values, directly to String
--
decode :: [Word8] -> String
decode [ ] = ""
decode (c:cs)
| c < 0x80 = chr (fromEnum c) : decode cs
| c < 0xc0 = replacement_character : decode cs
| c < 0xe0 = multi1
| c < 0xf0 = multi_byte 2 0xf 0x800
| c < 0xf8 = multi_byte 3 0x7 0x10000
| c < 0xfc = multi_byte 4 0x3 0x200000
| c < 0xfe = multi_byte 5 0x1 0x4000000
| otherwise = replacement_character : decode cs
where
multi1 = case cs of
c1 : ds | c1 .&. 0xc0 == 0x80 ->
let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f)
in if d >= 0x000080 then toEnum d : decode ds
else replacement_character : decode ds
_ -> replacement_character : decode cs
multi_byte :: Int -> Word8 -> Int -> [Char]
multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
where
aux 0 rs acc
| overlong <= acc && acc <= 0x10ffff &&
(acc < 0xd800 || 0xdfff < acc) &&
(acc < 0xfffe || 0xffff < acc) = chr acc : decode rs
| otherwise = replacement_character : decode rs
aux n (r:rs) acc
| r .&. 0xc0 == 0x80 = aux (n-1) rs
$ shiftL acc 6 .|. fromEnum (r .&. 0x3f)
aux _ rs _ = replacement_character : decode rs

View file

@ -24,10 +24,6 @@ GHC and the required libraries using `apt-get`:
sudo apt-get install ghc6 libghc6-xhtml-dev libghc6-mtl-dev libghc6-network-dev
Pandoc will use the [utf8-string] library if it is installed; otherwise, it
will use its own internal module for UTF-8 I/O. The utf8-string library is
not a required dependency, but it may improve performance slightly.
[GHC]: http://www.haskell.org/ghc/
[GHC Download]: http://www.haskell.org/ghc/download.html
[Cabal]: http://www.haskell.org/cabal/
@ -36,7 +32,6 @@ not a required dependency, but it may improve performance slightly.
[Fink]: http://finkproject.org
[Ubuntu]: http://www.ubuntu.com
[debian]: http://www.debian.org/
[utf8-string]: http://hackage.haskell.org/cgi-bin/hackage-scripts/package/utf8-string
Getting the source
------------------

View file

@ -43,11 +43,7 @@ import Data.Maybe ( fromMaybe )
import Data.Char ( toLower )
import Prelude hiding ( putStrLn, writeFile, readFile, getContents )
import System.IO ( stdout, stderr )
#ifdef _UTF8STRING
import System.IO.UTF8
#else
import Text.Pandoc.UTF8
#endif
#ifdef _CITEPROC
import Text.CSL
import Text.Pandoc.Biblio
@ -61,11 +57,6 @@ copyrightMessage = "\nCopyright (C) 2006-7 John MacFarlane\n" ++
compileInfo :: String
compileInfo =
#ifdef _UTF8STRING
" +utf8-string" ++
#else
" -utf8-string" ++
#endif
#ifdef _CITEPROC
" +citeproc" ++
#else

View file

@ -17,7 +17,7 @@ main = defaultMainWithHooks $
-- | Run test suite.
runTestSuite _ _ _ _ = do
inDirectory "tests" $ runCommand "runhaskell RunTests.hs" >>= waitForProcess
inDirectory "tests" $ runCommand "runhaskell -i.. RunTests.hs" >>= waitForProcess
return ()
-- | If reference.odt needs rebuilding, build it.

118
System/IO/UTF8.hs Normal file
View file

@ -0,0 +1,118 @@
-----------------------------------------------------------------------------
-- |
-- Module : System.IO.UTF8
-- Copyright : (c) Eric Mertens 2007
-- License : BSD3-style (see LICENSE)
--
-- Maintainer: emertens@galois.com
-- Stability : experimental
-- Portability : portable
--
-- String IO preserving UTF8 encoding.
--
module System.IO.UTF8 (
print
, putStr
, putStrLn
, getLine
, readLn
, readFile
, writeFile
, appendFile
, getContents
, hGetLine
, hGetContents
, hPutStr
, hPutStrLn
) where
import Control.Monad (liftM)
import Data.Char (ord, chr)
import Data.Word (Word8)
import Prelude (String, ($), (=<<), (>>=), (.), map, toEnum, fromEnum, Read,
Show(..))
import System.IO (Handle, IO, FilePath)
import qualified System.IO as IO
import Codec.Binary.UTF8.String (encode, decode)
-- | Encode a string in UTF8 form.
encodeString :: String -> String
encodeString xs = bytesToString (encode xs)
-- | Decode a string from UTF8
decodeString :: String -> String
decodeString xs = decode (stringToBytes xs)
-- | Convert a list of bytes to a String
bytesToString :: [Word8] -> String
bytesToString xs = map (chr . fromEnum) xs
-- | String to list of bytes.
stringToBytes :: String -> [Word8]
stringToBytes xs = map (toEnum . ord) xs
-- | The 'print' function outputs a value of any printable type to the
-- standard output device. This function differs from the
-- System.IO.print in that it preserves any UTF8 encoding of the shown value.
--
print :: Show a => a -> IO ()
print x = putStrLn (show x)
-- | Write a UTF8 string to the standard output device
putStr :: String -> IO ()
putStr x = IO.putStr (encodeString x)
-- | The same as 'putStr', but adds a newline character.
putStrLn :: String -> IO ()
putStrLn x = IO.putStrLn (encodeString x)
-- | Read a UTF8 line from the standard input device
getLine :: IO String
getLine = liftM decodeString IO.getLine
-- | The 'readLn' function combines 'getLine' and 'readIO', preserving UTF8
readLn :: Read a => IO a
readLn = IO.readIO =<< getLine
-- | The 'readFile' function reads a file and
-- returns the contents of the file as a UTF8 string.
-- The file is read lazily, on demand, as with 'getContents'.
readFile :: FilePath -> IO String
readFile n = liftM decodeString (IO.openBinaryFile n IO.ReadMode >>=
IO.hGetContents)
-- | The computation 'writeFile' @file str@ function writes the UTF8 string @str@,
-- to the file @file@.
writeFile :: FilePath -> String -> IO ()
writeFile n c = IO.withBinaryFile n IO.WriteMode $ \ h ->
IO.hPutStr h $ encodeString c
-- | The computation 'appendFile' @file str@ function appends the UTF8 string @str@,
-- to the file @file@.
appendFile :: FilePath -> String -> IO ()
appendFile n c = IO.withBinaryFile n IO.AppendMode $ \h ->
IO.hPutStr h $ encodeString c
-- | Read a UTF8 line from a Handle
hGetLine :: Handle -> IO String
hGetLine h = liftM decodeString $ IO.hGetLine h
-- | Lazily read a UTF8 string from a Handle
hGetContents :: Handle -> IO String
hGetContents h = liftM decodeString (IO.hGetContents h)
-- | Write a UTF8 string to a Handle.
hPutStr :: Handle -> String -> IO ()
hPutStr h s = IO.hPutStr h (encodeString s)
-- | Write a UTF8 string to a Handle, appending a newline.
hPutStrLn :: Handle -> String -> IO ()
hPutStrLn h s = IO.hPutStrLn h (encodeString s)
-- | Lazily read stdin as a UTF8 string.
getContents :: IO String
getContents = liftM decodeString IO.getContents

View file

@ -1,4 +1,4 @@
{-# LANGUAGE TemplateHaskell, CPP #-}
{-# LANGUAGE TemplateHaskell #-}
{-
Copyright (C) 2008 John MacFarlane <jgm@berkeley.edu>
@ -44,11 +44,7 @@ import qualified Data.ByteString as B ( writeFile, pack )
import Data.ByteString.Internal ( c2w )
import Prelude hiding ( writeFile, readFile )
import System.IO ( stderr )
#ifdef _UTF8STRING
import System.IO.UTF8
#else
import Text.Pandoc.UTF8
#endif
-- | Produce an ODT file from OpenDocument XML.
saveOpenDocumentAsODT :: FilePath -- ^ Pathname of ODT file to be produced.

View file

@ -1,4 +1,3 @@
{-# LANGUAGE CPP #-}
{-
Copyright (C) 2008 John MacFarlane <jgm@berkeley.edu>
@ -38,11 +37,7 @@ import System.Environment ( getEnvironment )
import Text.Pandoc.Shared ( withTempDir )
import Prelude hiding ( writeFile, readFile, putStrLn )
import System.IO ( stderr, openFile, IOMode (..), hClose )
#ifdef _UTF8STRING
import System.IO.UTF8
#else
import Text.Pandoc.UTF8
#endif
-- | Produce an PDF file from LaTeX.
saveLaTeXAsPDF :: FilePath -- ^ Pathname of PDF file to be produced.

View file

@ -118,11 +118,7 @@ import System.FilePath ( (</>), (<.>) )
import System.IO.Error ( catch, ioError, isAlreadyExistsError )
import System.Directory
import Prelude hiding ( putStrLn, writeFile, readFile, getContents )
#ifdef _UTF8STRING
import System.IO.UTF8
#else
import Text.Pandoc.UTF8
#endif
--
-- List processing

View file

@ -1,4 +1,3 @@
{-# LANGUAGE CPP #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-
Copyright (C) 2008 John MacFarlane <jgm@berkeley.edu>
@ -39,11 +38,7 @@ import Language.Haskell.TH.Syntax (Lift (..))
import qualified Data.ByteString as B
import Data.ByteString.Internal ( w2c )
import Prelude hiding ( readFile )
#ifdef _UTF8STRING
import System.IO.UTF8
#else
import Text.Pandoc.UTF8
#endif
-- | Insert contents of text file into a template.
contentsOf :: FilePath -> ExpQ

View file

@ -1,76 +0,0 @@
-- | Functions for IO using UTF-8 encoding.
--
-- The basic encoding and decoding functions are taken from
-- <http://www.cse.ogi.edu/~hallgren/Talks/LHiH/base/lib/UTF8.hs>.
-- (c) 2003, OGI School of Science & Engineering, Oregon Health and
-- Science University.
--
-- From the Char module supplied with HBC.
-- Modified by Martin Norbaeck to pass illegal UTF-8 sequences unchanged.
-- Modified by John MacFarlane to use [Word8] and export IO functions.
module Text.Pandoc.UTF8 (
putStrLn
, putStr
, hPutStrLn
, hPutStr
, getContents
, readFile
, writeFile
) where
import Data.Word
import System.IO ( Handle )
import qualified Data.ByteString.Lazy as BS
import Prelude hiding ( putStrLn, putStr, getContents, readFile, writeFile )
putStrLn :: String -> IO ()
putStrLn = BS.putStrLn . BS.pack . toUTF8
putStr :: String -> IO ()
putStr = BS.putStr . BS.pack . toUTF8
hPutStrLn :: Handle -> String -> IO ()
hPutStrLn h = BS.hPut h . BS.pack . toUTF8 . (++ "\n")
hPutStr :: Handle -> String -> IO ()
hPutStr h = BS.hPut h . BS.pack . toUTF8
readFile :: FilePath -> IO String
readFile p = BS.readFile p >>= return . fromUTF8 . BS.unpack
writeFile :: FilePath -> String -> IO ()
writeFile p = BS.writeFile p . BS.pack . toUTF8
getContents :: IO String
getContents = BS.getContents >>= return . fromUTF8 . BS.unpack
-- | Take a list of bytes in UTF-8 encoding and decode it into a Unicode string.
fromUTF8 :: [Word8] -> String
fromUTF8 [] = ""
fromUTF8 (0xef : 0xbb : 0xbf :cs) = fromUTF8 cs -- skip BOM (byte order marker)
fromUTF8 (c:c':cs) | 0xc0 <= c && c <= 0xdf &&
0x80 <= c' && c' <= 0xbf =
toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : fromUTF8 cs
fromUTF8 (c:c':c'':cs) | 0xe0 <= c && c <= 0xef &&
0x80 <= c' && c' <= 0xbf &&
0x80 <= c'' && c'' <= 0xbf =
toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : fromUTF8 cs
fromUTF8 (c:cs) = toEnum (fromEnum c) : fromUTF8 cs
-- | Take a Unicode string and encode it as a list of bytes in UTF-8 encoding.
toUTF8 :: String -> [Word8]
toUTF8 "" = []
toUTF8 (c:cs) =
if c > '\x0000' && c < '\x0080' then
toEnum (fromEnum c) : toUTF8 cs
else if c < toEnum 0x0800 then
let i = fromEnum c
in toEnum (0xc0 + i `div` 0x40) :
toEnum (0x80 + i `mod` 0x40) :
toUTF8 cs
else
let i = fromEnum c
in toEnum (0xe0 + i `div` 0x1000) :
toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) :
toEnum (0x80 + i `mod` 0x40) :
toUTF8 cs

52
debian/copyright vendored
View file

@ -58,42 +58,36 @@ by Eric A. Meyer
Released under an explicit Public Domain License
----------------------------------------------------------------------
UTF8.hs
Copyright (c) 2003, OGI School of Science & Engineering, Oregon Health &
Science University, All rights reserved.
Modified by Martin Norbäck, to pass illegal utf-8 sequences through unchanged.
Modified 2006-8 John MacFarlane.
System/IO/UTF8.hs and Codec/Binary/UTF8/String.hs
from the utf8-string package on HackageDB
Copyright (c) 2007, Galois Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Galois Inc. nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
- Neither the name of OGI or OHSU nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
THIS SOFTWARE IS PROVIDED BY Galois Inc. ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL Galois Inc. BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
----------------------------------------------------------------------
Text/XML/Light/*
from the xml package on HackageDB
(c) 2007 Galois Inc.
All rights reserved.

View file

@ -134,9 +134,6 @@ Flag executable
Flag library
Description: Build the pandoc library.
Default: True
Flag utf8-string
Description: Use utf8-string library for UTF-8 I/O.
Default: True
Flag citeproc
Description: Compile in support for citeproc-hs bibliographic formatting.
Default: False
@ -149,11 +146,6 @@ Library
if flag(highlighting)
Build-depends: highlighting-kate
cpp-options: -D_HIGHLIGHTING
if flag(utf8-string)
Build-depends: utf8-string
cpp-options: -D_UTF8STRING
else
Other-Modules: Text.Pandoc.UTF8
if flag(citeproc)
Build-depends: citeproc-hs
Exposed-Modules: Text.Pandoc.Biblio
@ -190,12 +182,16 @@ Library
Text.Pandoc.Writers.S5
Other-Modules: Text.Pandoc.XML,
Text.Pandoc.TH,
-- from xml package
Text.XML.Light,
Text.XML.Light.Types,
Text.XML.Light.Output,
Text.XML.Light.Input,
Text.XML.Light.Proc,
Text.XML.Light.Cursor
Text.XML.Light.Cursor,
-- from utf8-string package
System.IO.UTF8,
Codec.Binary.UTF8.String
Extensions: CPP, TemplateHaskell, FlexibleInstances
Ghc-Options: -O2 -Wall -threaded
Ghc-Prof-Options: -auto-all
@ -214,11 +210,6 @@ Executable pandoc
if flag(highlighting)
cpp-options: -D_HIGHLIGHTING
if flag(utf8-string)
Build-depends: utf8-string
cpp-options: -D_UTF8STRING
else
Other-Modules: Text.Pandoc.UTF8
if flag(citeproc)
Build-depends: citeproc-hs
cpp-options: -D_CITEPROC