From 0b899ce7efa80df700b17b7a4b9f6de1fb88ce8c Mon Sep 17 00:00:00 2001 From: Ophir Lifshitz Date: Sun, 4 Oct 2015 06:07:23 -0400 Subject: [PATCH 1/2] Docx Reader: Parse soft, no-break hyphen elements --- src/Text/Pandoc/Readers/Docx.hs | 4 ++++ src/Text/Pandoc/Readers/Docx/Parse.hs | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 67a97ae85..8b8d1ede1 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -206,11 +206,15 @@ runElemToInlines :: RunElem -> Inlines runElemToInlines (TextRun s) = text s runElemToInlines (LnBrk) = linebreak runElemToInlines (Tab) = space +runElemToInlines (SoftHyphen) = text "\xad" +runElemToInlines (NoBreakHyphen) = text "\x2011" runElemToString :: RunElem -> String runElemToString (TextRun s) = s runElemToString (LnBrk) = ['\n'] runElemToString (Tab) = ['\t'] +runElemToString (SoftHyphen) = ['\xad'] +runElemToString (NoBreakHyphen) = ['\x2011'] runToString :: Run -> String runToString (Run _ runElems) = concatMap runElemToString runElems diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index cce80fb48..53af19dfd 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -208,7 +208,7 @@ data Run = Run RunStyle [RunElem] | InlineDrawing FilePath B.ByteString deriving Show -data RunElem = TextRun String | LnBrk | Tab +data RunElem = TextRun String | LnBrk | Tab | SoftHyphen | NoBreakHyphen deriving Show data VertAlign = BaseLn | SupScrpt | SubScrpt @@ -877,6 +877,8 @@ elemToRunElem ns element map (\x -> fromMaybe x . getUnicode f . lowerFromPrivate $ x) str | isElem ns "w" "br" element = return LnBrk | isElem ns "w" "tab" element = return Tab + | isElem ns "w" "softHyphen" element = return SoftHyphen + | isElem ns "w" "noBreakHyphen" element = return NoBreakHyphen | isElem ns "w" "sym" element = return (getSymChar ns element) | otherwise = throwError WrongElem where From dfd06467eadace7b37cb3ebc53c943755d0436eb Mon Sep 17 00:00:00 2001 From: Ophir Lifshitz Date: Sun, 4 Oct 2015 06:08:17 -0400 Subject: [PATCH 2/2] Docx Reader: Create special punctuation test --- tests/Tests/Readers/Docx.hs | 4 ++++ tests/docx/special_punctuation.docx | Bin 0 -> 8408 bytes tests/docx/special_punctuation.native | 2 ++ 3 files changed, 6 insertions(+) create mode 100644 tests/docx/special_punctuation.docx create mode 100644 tests/docx/special_punctuation.native diff --git a/tests/Tests/Readers/Docx.hs b/tests/Tests/Readers/Docx.hs index 47292bc99..7e3f1979e 100644 --- a/tests/Tests/Readers/Docx.hs +++ b/tests/Tests/Readers/Docx.hs @@ -130,6 +130,10 @@ tests = [ testGroup "inlines" "literal tabs" "docx/tabs.docx" "docx/tabs.native" + , testCompare + "special punctuation" + "docx/special_punctuation.docx" + "docx/special_punctuation.native" , testCompare "normalizing inlines" "docx/normalize.docx" diff --git a/tests/docx/special_punctuation.docx b/tests/docx/special_punctuation.docx new file mode 100644 index 0000000000000000000000000000000000000000..8e0bb55c94b70a4f6902f53b68cfa4fbaf2547b6 GIT binary patch literal 8408 zcmZ{JbzGEP*Y(gHLk=C%-QC?K2vP$G(%mtH(k&(3NJ*!3BZ4&2p>&sklzgN2`@G)r zJ@3r#y5^r*`?t?J`<%1RUK+}9@OS`N05V`xRYpD*gJ8}G1^}Qz1ORXW0DytCql24; zgBw`e+sVS!h|SC1t|4hyt(OB^_R3FogH>s%RsxxK_thW{lcrdgJ6%NxxVE^e(=uTk zIghE-sx=~MWTbsicU|S3a6uwQa7pIpc>$6d`1fu_wZoZ^QScNJAFBi1d3#u@MpcZU z0o9HCXn|o6kkgf%MTdznTEYiI`6%QLM|sV&U_$imD7BtS%@>pY4I-wRFR5(@JFff6 z>nz?jw>IUOD8~&6u%#mzOyWWyiNWll7}kfiB5^AsRX$PYBEL3Ky|$-#lF9KSGF|+p z3UQ7`4A%q%6oCpgeeZKF!;8zAX!G1$?5$|Z5g<-DW;CJ&b>MHkG#>T@*doN zWZU#XryDf!msktiKkQ=VUe47~hkekqYQF0h-Ze%|&q3IJLf=$Jl-r?5o5UF8gX`QBN)#7mjBrKms`u zm&p$9sBjxP+Mk5$mlh$#C0=h1%YSrfz!E<1(ejHV8_6I*m(R$Idw~?UcyWOE?7sgj z$g_}Fi!LXsRJ6v8qjbxl&x{^ZJ3UG8OAeJHWraEsXCfs3jq^m~EQ2`FI0eV&0e4Cg zexy3cupg!{#_J}(w+F6kCGv82O4?jQ9~Zi9al(%3mb6(wB9*QeIYMo^v9Zf6%7=s( zh8CLS0JJ2}S!LugHx#IO zpd}d$2LJ#8qySGx7jt%V#~1GQ(7OCpkUH8Z0JxS5>}Lhg$K4AJ00Vyk{Xev-0f4MT zQFUmshF<-Q{OG%v1VdG>NUKdnM_?j_Ub)UgP%|bZjBb-Qi zC~dWi-@!{wVA(<#Dg*YKbx@=tRxV9U+g4fSFV@Skh`kfYJMNGTlV?!zFmqF>p3U4t80WCFr zYLR)J@ZD-I_)`N8GA3~j+W3J)nxPi@Hs0$at(9g5Hj}|3T7*vXTH@;;!#))_y4bHi zU3rl@*6VVpRry1Z6PQE_F{WF0?0G<+f2wj?eu$^7VaV!}OxP1-p|kaDOb)ehR3^qL zw>o8rF85ucAEy9<+jEe2w2#N~)lJSYlzsviM8Qj4kda$;Ncy|t)rZn_2+ZF#xQgi0 zcnf8c10Db+AMg+T{}kfCC*gNdHcYibw+1}KwXEXmQMF7w2IQIDSMe<=Cjd1U7vQPv zCF6Kgm2A0yEIJE$#p=)d`C9%(N8TEBH-8>|i8q^F8vO6b+}YRKecB1A9&I$ zNl5%SKTGDl6Oa5LRzhra0=L)zyHv>`(We`fDc5Dn&#O zXTh2SRkL|123W82{^gSV0JR$HZuC2)FSw(zVLPe4)2OJn1tw$&;@yI5li(04d#h7E zinqGXD3%DFH`kjef0Guo-NWMuB?|(5$o?knrK6*pgQJ^;>#x%N&6;YoBql5sy6jc< zE4O1jBhZV`v@9eJpFl)r>@XPAP-0JF#MLwKsq)P!a>1W!6fkk9$V~PpO`x>2{olMlE~x z3v2Gc6uwQ+zrgxv#qzVSyx6>WMjZJ`@z ziS0?fxm8LwzIH z1tDz6an;F>_I~VyaTUd|c&^V&{?ZGUrf3>9v|z=Jg3h0n=EHMxeDHo#^@+u~qPDvS zdsei=-shKWs97W<2&VPNjbQJ}*QH~yQrnO#A14M=zh+>v(XX-8{3 zmzZrsaAy|~r%ErH(S&>82~FP6HY;BSkOIsVfN#oVKsF|9?MzCIA{~Jc5t4D~(W>m& z6ph4bDMkR>ENeOm4n}q#0}Ehspr-~zSX$vL<*_Ubbe>G7q()EFCtQYWGG;Dv)A&iz zNx)n!hieS1VwF&`QG!qevEiS0huoOT`cb)Vq2UBJ=zt9FV4FzrE|=9jaY-U=cx2oh zG*rOGapxkg?--*OJb?j%6Vh~v z6QmlWk-k3!%j<(nyDgbfJB5qs86i)6x;7x_>jU8hg7T--5M#|y-U9uFQ`DQEg)QjI zEMa)Eb0b=jX&ZnW#~w<{p`xbvCC1Qx0O^9LD&OY@hX&Lk-sdyXjbU$q6t&2Ri5^>s zBfu=`kDHbY=63S*AI4+V(j)Dp%)<|zX~(7W^M9xvQJmwf|47`+)8X=`#4B7WQ&n%` zxX5xaO}HEE7o|0b}{l#w_i7sIL;d(8f!TZs)s=rKWb z;#p!t_~rojgck3mZ$0sKh7=_3jSyMG;n#O1Qx4T-@5AS}qsR7FwcV6x!z#>?A%4v$ z7@Yea-woZ#i}wl__NSTa&b>Yv;$s@e(LRR&mW^r;J5_3zHLD=b&l%@YgFI~D7p(Pi zwLNwh|ON0TZ?98*!sjlhESYD3e} zUH-KelAAfE}MjhFaKnOBH`` zNVF2wQ#DL*3DR4~YN@d4Y{lik@%DB6NZnu`(}3ZR%)YaSDBxvxk*GinDe*Kp`H^z!#0*#G>)L_Dvak;9YnS3ubkyC-<0HlHi{;ApI-}AijaAkEYMCwn*KpkF`1` zxgFLd+Gb#-2oLvxC8b#GPqtna^i7hJrBIq>c)u~ zG_m)`4E2+v@zM&OqVU#^soxPauFcmoVbo*K7|azG7%aOI3*HaZbMjsGBIm$vhN8sL zhDj&KoDqAw6o#>AJCA? zv)f9eUj9MHt*;4Qm7M#efGNzdNuRaC=peAUdO}(Pk@2%8R3-|L05DF#p>(WgK2Njg zdO9QHl2WG$e&Sm^b%(S4@JMDcykat2gzodMP^tgF=kQ-zA#0-BrjHUEam_b}+G{L* zS(>qSzMTc~>Jp}JM=m^eMCF^9KPw5EXnk;|%)rCz01+^SsSQEY2G!*^uN|2e1wq|| zzU*S+D~tTj?QNNLE&$RXR$f}5;-+!*h@zt6%X-Nvn1mRMEx_w7m;Tm~KxrwCl#)Qx zgyKryWXW5GXLNR^U4o7W7$c;18<8h^z@@0%ES|SpJU22ruZ>4QGfEBJB=E3<6+yY# zpM0kK&cJ3UPa$vPmJx4#t$FjBct|w1#lK{<%~K`ejs1AaUQ9M{T-7NWmjC80``<@j z;IjMn0+fmfs4M|Of1xr{!^P3bmHmaIi^VTRVjr)n+$Vt^di*nz>x%CXmAdF~MdSlz z$(sAezN6|aEUt;lhm{%|=H@1v=?p^Ozw&~n{ zE$f{#Gx1r&j_B87I?GmP9WW-7HA9O(QBW&p!W9dv3d%}ZIz!f6SB!&iZRvoK&yOL% z0QYcJ%3T!jG&8M}s7G~Cj?2Wh(O8us@;wVTYH6;f2~e=yd@?Efo!@ZRwU$|6%#|0+ zZsi{U#~yY&xLICwp;#}kC2d)a&y0&fW@k|uuc*9seEQ{1c>Cm$ii9y&lQV~sFb%B< z?0*ws>g4o`hb*n%H}SSB=KQ4yW$!Wr@5*e~hNn`iub{vBOt);HQW9%Wt#y z#{&J$p5fi~%V+g%qF74H&b>OPF6w4wVQ<0y`_1`lS`MG9Ixg~oI*xU*TbgR^nB89q z$sWudetCz1(w)(}8j5+!biLT-}R$(AZmDA%S)bp6u}36rf(i=8UyoYRHVjK+Byg5Q}|X>F&$ZHeOR{=kG-1 z5RNU2tl&1nZ!(J-qQGaD;p;+t7R4i3FJ`PF;d<;-JL)tLqo&< z7~(h4K)g2wd%0!NY68@h{t_;I!B4-1{*YT5&H^G-v79sRtT(c;dP$rT^twA&+({_{ z6-HFuXP+f#u{U;#xzMKM!1=jP=U2W3G-1?6D17ZvR@?{OAr zyNxdMnr4oxbWj`=^$e)d*(<6V2$50oaY!jxx~E1`7@0V5l`4r&$s&&N8Z>(BDkiyL zYD&smkyi@r7+{!&HP%4k&tK7vCpL@Gc#Tv~0{F3~jW=D%!P&|)Fh`u3;guJfu$!=q zoa{OGCuj4xPA(6-L-#rAIceME$Kr8xbB-TiG zuU=1OJy%XVW3NjT87pS*7Oj+A1^4;l978=teq?E@%u5PH207Tm;~f|&x)q~n=F{bF z@`6k?_~s}{EHhUVIWLxx_qGBFDCtD@Nu^aXR@<06nr7P7F_`5VA5w+|?|0?#Y2;X?WV?jGZ1u3*ojn z?=apS2gZVDDxb_(d^NC|trZq~*qRXjQqpA35^ejUMsyaIFvBH>i=vZ6)SE$Gyk{RtA(39eUxY$^*LOXj~%H%x7X#BpNp1%YN(Y3QRCC!&?LpV zLhwzxa(xh^Bjak2ucT8IL}RspCx#wfl2RYq6!jWj-3+|@!2HFMU#?6`s}#=e*_%_0WwULlU^bMq(q##4&N__LX2 zXEmiV7UvZ+1&eWgwiOeqXu^GZ746ZMr^lc2!k(R-4M$ggt;DvXRdp7YT&3lU40+nd zizJ00kwb=fc<0$OnXQdoW!LWAi}V{8Vuy+`d|H%sGmPanLMG>3sVV#7Z!6JQ z?=EZe$tq&cs}@YzUxHNWKH~CnZ*m047sJ-vkBh^;;Ii=5&|euX8yPe_g8zISIK<_+ zji6^VaLx7nZh;Og7EePpGTc$1D-o(Y76P;>oVT0>1*n1b9mff;9uAcCJI>q|&So%_ z9FEan)vL7c>)s z=ipbLRQ=hXCLrdTG(I!g3U+a1ocoMFhhX6^%rdw4#WF#HXPJm<*kgh}D)7w@-X~3w zNLwF?Y`r%>RwUH8%BYd%;Cv76Ad~?;-8)eJ zQhW|>+NNf9zje-GvXT004!qFgD`o}NHdnmR#IJMTuewq( z8^bwIKfU2$bX?8O77;AY8eR2%X4~LTZen2OIN=IGqyIo>yw( zR@^DpYnAC#nyWN~PJ1;*4Wi&$@in)lF7YwyTTmhwQ~AuFDZKn__Tpnie7exud_0Q` z$ZXAa=_cNysj-hyGd;bE)ks=OV7T$tT?;htzA@6byHZUR`G%puDDkCjHJ(t9(??6T zhHs>d4iy4OpGpoR_67V6toyCiau$T`iiy~WiWhwHi!N9Mfe;QOk%1?Gy>2Jdt&47v z{CiIhc~K6my4dE$ zUJi(H5}&HW`db!E<~)Phhl;c_q<<9aZ&~1K;pPTyq5bajW$CCoe&fJyztWWvWT*8D zL)K^2Sdea&Qf>7dObZMllW(L_4!BpNRI9bgvr?Zs{!9`_pFOzoGj45^Jn1D@R!f$_ zm^)<=*UMd>y^c;+gAU&SPwpW&i~-6LGQ2{e_@|!5lM+)H$oU4ZG-(@DRHBrCA8AbS zK2bDjmz1`0(xELJYPm6UE4)!I#OYgmR_qa`vyHhjLm6K;()JTGd|Cot{lF-y1DbiMYQZ(Hx7sv!6rN_7TtX|! zU^n{}Rgu#fC7f&x+EdTy)n+3XO6u_5*L;*IT)ZvR)of|Op&*IU(ZKWds9aLyXK#^g zWrHcpq~5D#z=h!}ntHQ14K>4rocg}fJr$_}hYg&)<#CK0Ka5(EhPa`l%sZdU<&S?H zv5{Yvo3Ei3{BmBRcSnPN+ck-T#VrM_f3 z(@WhobpHy=^9_b|LwY?y@HuS^vss7%}DbqyLL{D z=%TGO6p~6$&F10sbUd;h!>+<*J8HhwTzkohUQ4Yps=|t^YR)HyAtCMEXhXGA&2syr z)0?uUSv?WxM`rnrgR1pMDTY@c-PMv>%7f77mml+cWq4YcJ^k0>Fp3q%?BT$cxy|fxQMY5jjywaZ z?obzY!Kl>Iq}VxbG~mRJNl8iyo`08n%Ex}(hx&#kRO!f-6(JZ#p5&dY8v{&XR z_f^7=o1HP6Y(0T{x-oO6d!JD)_vT&4^^quy%Q}TGNHmmTpzHeo_D7*Z`q$^bxTXJJ zfJc_zA9O7AEclnf_X|i4_lvXtW`lu#v*c;=2z<-V4G5&G;;txI;^S?AR9>X7B`v1UVpgzF=_Co%r1@IXC z_=5TeO#yAC{2uw^>+0hWk9EnP5VnN>JN|#El*jnThv6SQAJKov|JUk&41c_1{DEsg f?Z@9Ec(i|LC?i0x=dy6X literal 0 HcmV?d00001 diff --git a/tests/docx/special_punctuation.native b/tests/docx/special_punctuation.native new file mode 100644 index 000000000..304289f44 --- /dev/null +++ b/tests/docx/special_punctuation.native @@ -0,0 +1,2 @@ +[Para [Str "Soft",Space,Str "hyphen:",Space,Str "[\173]"] +,Para [Str "Non-breaking",Space,Str "hyphen:",Space,Str "[\8209]"]]