diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d22d920..e93376be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ *Megaparsec follows [SemVer](https://semver.org/).* +## Upcoming + +* Implemented correct handling of wide Unicode characters in error messages. + To that end, a new module `Text.Megaparsec.Unicode` was introduced. [Issue + 370](https://github.com/mrkkrp/megaparsec/issues/370). + ## Megaparsec 9.6.1 * Exposed `Text.Megaparsec.State`, so that the new functions (`initialState` diff --git a/Text/Megaparsec/Error.hs b/Text/Megaparsec/Error.hs index f2262099..63e23089 100644 --- a/Text/Megaparsec/Error.hs +++ b/Text/Megaparsec/Error.hs @@ -64,6 +64,7 @@ import GHC.Generics import Text.Megaparsec.Pos import Text.Megaparsec.State import Text.Megaparsec.Stream +import qualified Text.Megaparsec.Unicode as Unicode ---------------------------------------------------------------------------- -- Parse error type @@ -397,7 +398,7 @@ errorBundlePretty ParseErrorBundle {..} = lineNumber = (show . unPos . sourceLine) epos padding = replicate (length lineNumber + 1) ' ' rpshift = unPos (sourceColumn epos) - 1 - slineLen = length sline + slineLen = Unicode.stringLength sline in padding <> "|\n" <> lineNumber diff --git a/Text/Megaparsec/Stream.hs b/Text/Megaparsec/Stream.hs index 57a5bd8c..86f6ac62 100644 --- a/Text/Megaparsec/Stream.hs +++ b/Text/Megaparsec/Stream.hs @@ -52,6 +52,7 @@ import qualified Data.Text.Lazy as TL import Data.Word (Word8) import Text.Megaparsec.Pos import Text.Megaparsec.State +import qualified Text.Megaparsec.Unicode as Unicode -- | Type class for inputs that can be consumed by the library. -- @@ -426,6 +427,7 @@ class (Stream s) => VisualStream s where instance VisualStream String where showTokens Proxy = stringPretty + tokensLength Proxy = Unicode.stringLength instance VisualStream B.ByteString where showTokens Proxy = stringPretty . fmap (chr . fromIntegral) @@ -435,9 +437,11 @@ instance VisualStream BL.ByteString where instance VisualStream T.Text where showTokens Proxy = stringPretty + tokensLength Proxy = Unicode.stringLength instance VisualStream TL.Text where showTokens Proxy = stringPretty + tokensLength Proxy = Unicode.stringLength -- | Type class for inputs that can also be used for error reporting. -- @@ -510,37 +514,37 @@ class (Stream s) => TraversableStream s where instance TraversableStream String where -- NOTE Do not eta-reduce these (breaks inlining) reachOffset o pst = - reachOffset' splitAt foldl' id id ('\n', '\t') o pst + reachOffset' splitAt foldl' id id ('\n', '\t') charInc o pst reachOffsetNoLine o pst = - reachOffsetNoLine' splitAt foldl' ('\n', '\t') o pst + reachOffsetNoLine' splitAt foldl' ('\n', '\t') charInc o pst instance TraversableStream B.ByteString where -- NOTE Do not eta-reduce these (breaks inlining) reachOffset o pst = - reachOffset' B.splitAt B.foldl' B8.unpack (chr . fromIntegral) (10, 9) o pst + reachOffset' B.splitAt B.foldl' B8.unpack (chr . fromIntegral) (10, 9) byteInc o pst reachOffsetNoLine o pst = - reachOffsetNoLine' B.splitAt B.foldl' (10, 9) o pst + reachOffsetNoLine' B.splitAt B.foldl' (10, 9) byteInc o pst instance TraversableStream BL.ByteString where -- NOTE Do not eta-reduce these (breaks inlining) reachOffset o pst = - reachOffset' splitAtBL BL.foldl' BL8.unpack (chr . fromIntegral) (10, 9) o pst + reachOffset' splitAtBL BL.foldl' BL8.unpack (chr . fromIntegral) (10, 9) byteInc o pst reachOffsetNoLine o pst = - reachOffsetNoLine' splitAtBL BL.foldl' (10, 9) o pst + reachOffsetNoLine' splitAtBL BL.foldl' (10, 9) byteInc o pst instance TraversableStream T.Text where -- NOTE Do not eta-reduce (breaks inlining of reachOffset'). reachOffset o pst = - reachOffset' T.splitAt T.foldl' T.unpack id ('\n', '\t') o pst + reachOffset' T.splitAt T.foldl' T.unpack id ('\n', '\t') charInc o pst reachOffsetNoLine o pst = - reachOffsetNoLine' T.splitAt T.foldl' ('\n', '\t') o pst + reachOffsetNoLine' T.splitAt T.foldl' ('\n', '\t') charInc o pst instance TraversableStream TL.Text where -- NOTE Do not eta-reduce (breaks inlining of reachOffset'). reachOffset o pst = - reachOffset' splitAtTL TL.foldl' TL.unpack id ('\n', '\t') o pst + reachOffset' splitAtTL TL.foldl' TL.unpack id ('\n', '\t') charInc o pst reachOffsetNoLine o pst = - reachOffsetNoLine' splitAtTL TL.foldl' ('\n', '\t') o pst + reachOffsetNoLine' splitAtTL TL.foldl' ('\n', '\t') charInc o pst ---------------------------------------------------------------------------- -- Helpers @@ -564,6 +568,8 @@ reachOffset' :: (Token s -> Char) -> -- | Newline token and tab token (Token s, Token s) -> + -- | Increment in column position for a token + (Token s -> Pos) -> -- | Offset to reach Int -> -- | Initial 'PosState' to use @@ -576,6 +582,7 @@ reachOffset' fromToks fromTok (newlineTok, tabTok) + columnIncrement o PosState {..} = ( Just $ case expandTab pstateTabWidth @@ -624,7 +631,7 @@ reachOffset' (g . (fromTok ch :)) | otherwise -> St - (SourcePos n l (c <> pos1)) + (SourcePos n l (c <> columnIncrement ch)) (g . (fromTok ch :)) {-# INLINE reachOffset' #-} @@ -639,6 +646,8 @@ reachOffsetNoLine' :: -- | Newline token and tab token (Token s, Token s) -> -- | Offset to reach + -- | Increment in column position for a token + (Token s -> Pos) -> Int -> -- | Initial 'PosState' to use PosState s -> @@ -648,6 +657,7 @@ reachOffsetNoLine' splitAt' foldl'' (newlineTok, tabTok) + columnIncrement o PosState {..} = ( PosState @@ -670,7 +680,7 @@ reachOffsetNoLine' | ch == tabTok -> SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w)) | otherwise -> - SourcePos n l (c <> pos1) + SourcePos n l (c <> columnIncrement ch) {-# INLINE reachOffsetNoLine' #-} -- | Like 'BL.splitAt' but accepts the index as an 'Int'. @@ -753,3 +763,13 @@ expandTab w' = go 0 0 go !i 0 (x : xs) = x : go (i + 1) 0 xs go !i n xs = ' ' : go (i + 1) (n - 1) xs w = unPos w' + +-- | Return increment in column position that corresponds to the given +-- 'Char'. +charInc :: Char -> Pos +charInc ch = if Unicode.isWideChar ch then pos1 <> pos1 else pos1 + +-- | Return increment in column position that corresponds to the given +-- 'Word8'. +byteInc :: Word8 -> Pos +byteInc _ = pos1 diff --git a/Text/Megaparsec/Unicode.hs b/Text/Megaparsec/Unicode.hs new file mode 100644 index 00000000..6215bd3f --- /dev/null +++ b/Text/Megaparsec/Unicode.hs @@ -0,0 +1,180 @@ +{-# LANGUAGE Safe #-} + +-- | +-- Module : Text.Megaparsec.Unicode +-- Copyright : © 2024–present Megaparsec contributors +-- License : FreeBSD +-- +-- Maintainer : Mark Karpov +-- Stability : experimental +-- Portability : portable +-- +-- Utility functions for working with Unicode. +-- +-- @since 9.7.0 +module Text.Megaparsec.Unicode + ( stringLength, + charLength, + isWideChar, + ) +where + +import Data.Array (Array, bounds, listArray, (!)) +import Data.Char (ord) + +-- | Calculate length of a string taking into account the fact that certain +-- 'Char's may span more than 1 column. +-- +-- @since 9.7.0 +stringLength :: (Traversable t) => t Char -> Int +stringLength = sum . fmap charLength + +-- | Return length of an individual 'Char'. +-- +-- @since 9.7.0 +charLength :: Char -> Int +charLength ch = if isWideChar ch then 2 else 1 + +-- | Determine whether the given 'Char' is “wide”, that is, whether it spans +-- 2 columns instead of one. +-- +-- @since 9.7.0 +isWideChar :: Char -> Bool +isWideChar c = go (bounds wideCharRanges) + where + go (lo, hi) + | hi < lo = False + | a <= n && n <= b = True + | n < a = go (lo, pred mid) + | otherwise = go (succ mid, hi) + where + mid = (lo + hi) `div` 2 + (a, b) = wideCharRanges ! mid + n = ord c + +-- | Wide character ranges. +wideCharRanges :: Array Int (Int, Int) +wideCharRanges = + listArray + (0, 118) + [ (0x001100, 0x00115f), + (0x00231a, 0x00231b), + (0x002329, 0x00232a), + (0x0023e9, 0x0023ec), + (0x0023f0, 0x0023f0), + (0x0023f3, 0x0023f3), + (0x0025fd, 0x0025fe), + (0x002614, 0x002615), + (0x002648, 0x002653), + (0x00267f, 0x00267f), + (0x002693, 0x002693), + (0x0026a1, 0x0026a1), + (0x0026aa, 0x0026ab), + (0x0026bd, 0x0026be), + (0x0026c4, 0x0026c5), + (0x0026ce, 0x0026ce), + (0x0026d4, 0x0026d4), + (0x0026ea, 0x0026ea), + (0x0026f2, 0x0026f3), + (0x0026f5, 0x0026f5), + (0x0026fa, 0x0026fa), + (0x0026fd, 0x0026fd), + (0x002705, 0x002705), + (0x00270a, 0x00270b), + (0x002728, 0x002728), + (0x00274c, 0x00274c), + (0x00274e, 0x00274e), + (0x002753, 0x002755), + (0x002757, 0x002757), + (0x002795, 0x002797), + (0x0027b0, 0x0027b0), + (0x0027bf, 0x0027bf), + (0x002b1b, 0x002b1c), + (0x002b50, 0x002b50), + (0x002b55, 0x002b55), + (0x002e80, 0x002e99), + (0x002e9b, 0x002ef3), + (0x002f00, 0x002fd5), + (0x002ff0, 0x002ffb), + (0x003000, 0x00303e), + (0x003041, 0x003096), + (0x003099, 0x0030ff), + (0x003105, 0x00312f), + (0x003131, 0x00318e), + (0x003190, 0x0031ba), + (0x0031c0, 0x0031e3), + (0x0031f0, 0x00321e), + (0x003220, 0x003247), + (0x003250, 0x004db5), + (0x004e00, 0x009fef), + (0x00a000, 0x00a48c), + (0x00a490, 0x00a4c6), + (0x00a960, 0x00a97c), + (0x00ac00, 0x00d7a3), + (0x00f900, 0x00fa6d), + (0x00fa70, 0x00fad9), + (0x00fe10, 0x00fe19), + (0x00fe30, 0x00fe52), + (0x00fe54, 0x00fe66), + (0x00fe68, 0x00fe6b), + (0x00ff01, 0x00ff60), + (0x00ffe0, 0x00ffe6), + (0x016fe0, 0x016fe3), + (0x017000, 0x0187f7), + (0x018800, 0x018af2), + (0x01b000, 0x01b11e), + (0x01b150, 0x01b152), + (0x01b164, 0x01b167), + (0x01b170, 0x01b2fb), + (0x01f004, 0x01f004), + (0x01f0cf, 0x01f0cf), + (0x01f18e, 0x01f18e), + (0x01f191, 0x01f19a), + (0x01f200, 0x01f202), + (0x01f210, 0x01f23b), + (0x01f240, 0x01f248), + (0x01f250, 0x01f251), + (0x01f260, 0x01f265), + (0x01f300, 0x01f320), + (0x01f32d, 0x01f335), + (0x01f337, 0x01f37c), + (0x01f37e, 0x01f393), + (0x01f3a0, 0x01f3ca), + (0x01f3cf, 0x01f3d3), + (0x01f3e0, 0x01f3f0), + (0x01f3f4, 0x01f3f4), + (0x01f3f8, 0x01f43e), + (0x01f440, 0x01f440), + (0x01f442, 0x01f4fc), + (0x01f4ff, 0x01f53d), + (0x01f54b, 0x01f54e), + (0x01f550, 0x01f567), + (0x01f57a, 0x01f57a), + (0x01f595, 0x01f596), + (0x01f5a4, 0x01f5a4), + (0x01f5fb, 0x01f64f), + (0x01f680, 0x01f6c5), + (0x01f6cc, 0x01f6cc), + (0x01f6d0, 0x01f6d2), + (0x01f6d5, 0x01f6d5), + (0x01f6eb, 0x01f6ec), + (0x01f6f4, 0x01f6fa), + (0x01f7e0, 0x01f7eb), + (0x01f90d, 0x01f971), + (0x01f973, 0x01f976), + (0x01f97a, 0x01f9a2), + (0x01f9a5, 0x01f9aa), + (0x01f9ae, 0x01f9ca), + (0x01f9cd, 0x01f9ff), + (0x01fa70, 0x01fa73), + (0x01fa78, 0x01fa7a), + (0x01fa80, 0x01fa82), + (0x01fa90, 0x01fa95), + (0x020000, 0x02a6d6), + (0x02a700, 0x02b734), + (0x02b740, 0x02b81d), + (0x02b820, 0x02cea1), + (0x02ceb0, 0x02ebe0), + (0x02f800, 0x02fa1d) + ] +{-# NOINLINE wideCharRanges #-} diff --git a/megaparsec-tests/megaparsec-tests.cabal b/megaparsec-tests/megaparsec-tests.cabal index c1d21705..ee206506 100644 --- a/megaparsec-tests/megaparsec-tests.cabal +++ b/megaparsec-tests/megaparsec-tests.cabal @@ -58,6 +58,7 @@ test-suite tests Text.Megaparsec.ErrorSpec Text.Megaparsec.PosSpec Text.Megaparsec.StreamSpec + Text.Megaparsec.UnicodeSpec Text.MegaparsecSpec default-language: Haskell2010 diff --git a/megaparsec-tests/tests/Text/Megaparsec/ErrorSpec.hs b/megaparsec-tests/tests/Text/Megaparsec/ErrorSpec.hs index aba7ff6a..acea2308 100644 --- a/megaparsec-tests/tests/Text/Megaparsec/ErrorSpec.hs +++ b/megaparsec-tests/tests/Text/Megaparsec/ErrorSpec.hs @@ -199,6 +199,41 @@ spec = do ++ replicate errColumn ' ' ++ "^\nunexpected 'b'\nexpecting 'x'\n" ) + context "in the presence of wide characters" $ do + it "calculates column positions correctly" $ do + let s = "구구 이면" :: String + pe = err 2 (ulabel "space" <> etok '이') :: PE + bundle = + ParseErrorBundle + { bundleErrors = pe :| [], + bundlePosState = + PosState + { pstateInput = s, + pstateOffset = 0, + pstateSourcePos = initialPos "", + pstateTabWidth = defaultTabWidth, + pstateLinePrefix = "" + } + } + errorBundlePretty bundle + `shouldBe` "1:5:\n |\n1 | 구구 이면\n | ^\nunexpected space\nexpecting '이'\n" + it "uses continuous highlighting" $ do + let s = "구구 이면" :: String + pe = err 3 (utok '이' <> etok '구') :: PE + bundle = + ParseErrorBundle + { bundleErrors = pe :| [], + bundlePosState = + PosState + { pstateInput = s, + pstateOffset = 0, + pstateSourcePos = initialPos "", + pstateTabWidth = defaultTabWidth, + pstateLinePrefix = "" + } + } + errorBundlePretty bundle + `shouldBe` "1:6:\n |\n1 | 구구 이면\n | ^^\nunexpected '이'\nexpecting '구'\n" it "displays multi-error bundle correctly" $ do let s = "something\ngood\n" :: String pe0 = err 2 (utok 'm' <> etok 'x') :: PE diff --git a/megaparsec-tests/tests/Text/Megaparsec/UnicodeSpec.hs b/megaparsec-tests/tests/Text/Megaparsec/UnicodeSpec.hs new file mode 100644 index 00000000..50ae650c --- /dev/null +++ b/megaparsec-tests/tests/Text/Megaparsec/UnicodeSpec.hs @@ -0,0 +1,20 @@ +module Text.Megaparsec.UnicodeSpec (spec) where + +import Test.Hspec +import qualified Text.Megaparsec.Unicode as Unicode + +spec :: Spec +spec = do + describe "stringLength" $ + it "computes correct length in the presense of wide chars" $ + Unicode.stringLength "123 구구 이면" `shouldBe` 13 + describe "charLength" $ do + it "returns 1 for non-wide chars" $ + Unicode.charLength 'a' `shouldBe` 1 + it "returns 2 for wide chars" $ + Unicode.charLength '구' `shouldBe` 2 + describe "isWideChar" $ do + it "returns False for non-wide chars" $ + Unicode.isWideChar 'a' `shouldBe` False + it "returns True for wide chars" $ + Unicode.isWideChar '구' `shouldBe` True diff --git a/megaparsec.cabal b/megaparsec.cabal index 95aa1264..b4e89c17 100644 --- a/megaparsec.cabal +++ b/megaparsec.cabal @@ -48,6 +48,7 @@ library Text.Megaparsec.Pos Text.Megaparsec.State Text.Megaparsec.Stream + Text.Megaparsec.Unicode other-modules: Text.Megaparsec.Class @@ -56,6 +57,7 @@ library default-language: Haskell2010 build-depends: + array >=0.5.3 && <0.6, base >=4.15 && <5, bytestring >=0.2 && <0.13, case-insensitive >=1.2 && <1.3,