cgrep does not work well with Unicode input

Question

cgrep does not work well with Unicode input

lyokha opened this issue 8 years ago · 3 comments

Hi Nicola!

I found that cgrep is unable to process UTF-8 encoded input correctly. Here is an image that shows the issue.

I ran it recursively on a directory with md files full of Russian UTF-8 encoded files. Last two commands show correct output from program ack. You can see that not only cgrep misses several matches (less matches for ARGS and no match for russian word потому), but also it prints garbage on output.

I made a patch that fixes the issue for BoyerMoore, Regex and CppTokenizer searches (the changes are all similar and can be easily expanded to other types of searches). Here it is.

diff --git a/cgrep.cabal b/cgrep.cabal
index b9ed1e6..c7174ea 100644
--- a/cgrep.cabal
+++ b/cgrep.cabal
@@ -48,6 +48,7 @@ Executable cgrep
                        mtl >= 2.0,
                        unix-compat >= 0.4,
                        async >= 2.0,
+                       utf8-string,
                        transformers,
                        process
   Ghc-options:         -O2 -Wall -threaded -rtsopts -with-rtsopts=-N
diff --git a/src/CGrep/Output.hs b/src/CGrep/Output.hs
index 3176baf..a62b8a9 100644
--- a/src/CGrep/Output.hs
+++ b/src/CGrep/Output.hs
@@ -26,7 +26,9 @@ module CGrep.Output (Output(..),
                      showFile,
                      showBold) where

+import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
 import System.Console.ANSI

 #ifdef ENABLE_HINT
@@ -250,8 +252,8 @@ showTokens Options { show_match = st } out

 showLine :: Config -> Options -> Output -> String
 showLine conf Options { color = c, no_color = c' } out
-    | c && not c'= hilightLine conf (sortBy (flip compare `on` (length . snd )) (outTokens out)) (C.unpack (outLine out))
-    | otherwise  = C.unpack (outLine out)
+    | c && not c'= hilightLine conf (sortBy (flip compare `on` (length . snd )) (outTokens out)) (UC.decode (B.unpack (outLine out)))
+    | otherwise  = UC.decode (B.unpack (outLine out))


 showFileName :: Config -> Options -> String -> String
diff --git a/src/CGrep/Strategy/BoyerMoore.hs b/src/CGrep/Strategy/BoyerMoore.hs
index e135ddb..98da613 100644
--- a/src/CGrep/Strategy/BoyerMoore.hs
+++ b/src/CGrep/Strategy/BoyerMoore.hs
@@ -20,7 +20,9 @@

 module CGrep.Strategy.BoyerMoore (search) where

+import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8  as C
+import qualified Codec.Binary.UTF8.String as UC

 import Control.Monad.Trans.Reader
 import Control.Monad.IO.Class
@@ -50,10 +52,12 @@ search f patterns = do

     -- transform text

-    let [text''', _ , _ , _] = scanr ($) text [ expandMultiline opt
-                                              , contextFilter (getFileLang opt filename) (mkContextFilter opt)
-                                              , ignoreCase opt
-                                              ]
+    let utext = C.pack $ UC.decode $ B.unpack text
+
+    let [text''', _ , _ , _] = scanr ($) utext [ expandMultiline opt
+                                               , contextFilter (getFileLang opt filename) (mkContextFilter opt)
+                                               , ignoreCase opt
+                                               ]

     -- make shallow search

diff --git a/src/CGrep/Strategy/Cpp/Tokenizer.hs b/src/CGrep/Strategy/Cpp/Tokenizer.hs
index d7b2b1e..017e0b9 100644
--- a/src/CGrep/Strategy/Cpp/Tokenizer.hs
+++ b/src/CGrep/Strategy/Cpp/Tokenizer.hs
@@ -18,7 +18,10 @@

 module CGrep.Strategy.Cpp.Tokenizer (search) where

+import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
+
 import qualified CGrep.Parser.Cpp.Token as Cpp

 import Control.Monad.Trans.Reader
@@ -48,12 +51,14 @@ search f ps = do

     -- transform text

+    let utext = C.pack $ UC.decode $ B.unpack text
+
     let filt = (mkContextFilter opt) { getFilterComment = False }

-    let [text''', _ , text', _] = scanr ($) text [ expandMultiline opt
-                                                 , contextFilter (getFileLang opt filename) filt
-                                                 , ignoreCase opt
-                                                 ]
+    let [text''', _ , text', _] = scanr ($) utext [ expandMultiline opt
+                                                  , contextFilter (getFileLang opt filename) filt
+                                                  , ignoreCase opt
+                                                  ]


     putStrLevel1 $ "strategy  : running C/C++ token search on " ++ filename ++ "..."
diff --git a/src/CGrep/Strategy/Regex.hs b/src/CGrep/Strategy/Regex.hs
index deda70d..2e17d30 100644
--- a/src/CGrep/Strategy/Regex.hs
+++ b/src/CGrep/Strategy/Regex.hs
@@ -20,7 +20,9 @@

 module CGrep.Strategy.Regex (search) where

+import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC

 import Control.Monad.Trans.Reader
 import Control.Monad.IO.Class
@@ -52,10 +54,12 @@ search f patterns = do

     -- transform text

-    let [text''', _ , _ , _] = scanr ($) text [ expandMultiline opt
-                                              , contextFilter (getFileLang opt filename) (mkContextFilter opt)
-                                              , ignoreCase opt
-                                              ]
+    let utext = C.pack $ UC.decode $ B.unpack text
+
+    let [text''', _ , _ , _] = scanr ($) utext [ expandMultiline opt
+                                               , contextFilter (getFileLang opt filename) (mkContextFilter opt)
+                                               , ignoreCase opt
+                                               ]

     -- search for matching tokens

After this patch cgrep output looks correct:

If you do not mind against changes, I can apply them myself as you kindly gave me permission for this :) I am not sure about what exact dependency on utf-string must be in cgrep.cabal though.

Answer 1 · 2016-07-08T15:30:07.000Z

Unfortunately it makes cgrep slower than ack, probably it makes sense to involve some new option to turn UTF-8 on.

Answer 2 · 2016-07-08T16:28:47.000Z

I localized UTF-8 decoding from strategies to getTargetContents in CGrep/Output.hs. Now the patch is shorter.

diff --git a/cgrep.cabal b/cgrep.cabal
index b9ed1e6..c7174ea 100644
--- a/cgrep.cabal
+++ b/cgrep.cabal
@@ -48,6 +48,7 @@ Executable cgrep
                        mtl >= 2.0,
                        unix-compat >= 0.4,
                        async >= 2.0,
+                       utf8-string,
                        transformers,
                        process
   Ghc-options:         -O2 -Wall -threaded -rtsopts -with-rtsopts=-N
diff --git a/src/CGrep/Common.hs b/src/CGrep/Common.hs
index 00b6fb9..861404f 100644
--- a/src/CGrep/Common.hs
+++ b/src/CGrep/Common.hs
@@ -25,7 +25,9 @@ module CGrep.Common (Text8,
                      ignoreCase,
                      trim, trim8) where

+import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
 import qualified Data.ByteString.Search as SC

 import Data.Char
@@ -52,8 +54,8 @@ getTargetName name = name


 getTargetContents :: FilePath -> IO Text8
-getTargetContents [] = C.getContents
-getTargetContents xs = C.readFile xs
+getTargetContents [] = B.getContents >>= return . C.pack . UC.decode . B.unpack
+getTargetContents xs = B.readFile xs >>= return . C.pack . UC.decode . B.unpack 


 shallowSearch :: [Text8] -> Text8 -> [[Int]]
diff --git a/src/CGrep/Output.hs b/src/CGrep/Output.hs
index 3176baf..a62b8a9 100644
--- a/src/CGrep/Output.hs
+++ b/src/CGrep/Output.hs
@@ -26,7 +26,9 @@ module CGrep.Output (Output(..),
                      showFile,
                      showBold) where

+import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
 import System.Console.ANSI

 #ifdef ENABLE_HINT
@@ -250,8 +252,8 @@ showTokens Options { show_match = st } out

 showLine :: Config -> Options -> Output -> String
 showLine conf Options { color = c, no_color = c' } out
-    | c && not c'= hilightLine conf (sortBy (flip compare `on` (length . snd )) (outTokens out)) (C.unpack (outLine out))
-    | otherwise  = C.unpack (outLine out)
+    | c && not c'= hilightLine conf (sortBy (flip compare `on` (length . snd )) (outTokens out)) (UC.decode (B.unpack (outLine out)))
+    | otherwise  = UC.decode (B.unpack (outLine out))


 showFileName :: Config -> Options -> String -> String

The most time-consuming part is now getTargetContents: I cannot find a fast way to decode UTF-8 from ByteString, if there is no such a way then it makes sense to turn UTF-8 support using an extra command-line option.

Answer 3 · 2016-07-08T20:34:56.000Z

Here is the final patch with all strategies implemented and a new option -u (--utf8). If the new option is not set cgrep will do all as before.

diff --git a/cgrep.cabal b/cgrep.cabal
index b9ed1e6..c7174ea 100644
--- a/cgrep.cabal
+++ b/cgrep.cabal
@@ -48,6 +48,7 @@ Executable cgrep
                        mtl >= 2.0,
                        unix-compat >= 0.4,
                        async >= 2.0,
+                       utf8-string,
                        transformers,
                        process
   Ghc-options:         -O2 -Wall -threaded -rtsopts -with-rtsopts=-N
diff --git a/src/CGrep/Output.hs b/src/CGrep/Output.hs
index 3176baf..dd8bdff 100644
--- a/src/CGrep/Output.hs
+++ b/src/CGrep/Output.hs
@@ -26,7 +26,9 @@ module CGrep.Output (Output(..),
                      showFile,
                      showBold) where

+import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
 import System.Console.ANSI

 #ifdef ENABLE_HINT
@@ -249,9 +251,10 @@ showTokens Options { show_match = st } out


 showLine :: Config -> Options -> Output -> String
-showLine conf Options { color = c, no_color = c' } out
-    | c && not c'= hilightLine conf (sortBy (flip compare `on` (length . snd )) (outTokens out)) (C.unpack (outLine out))
-    | otherwise  = C.unpack (outLine out)
+showLine conf Options { color = c, no_color = c', utf8 = u } out
+    | c && not c'= hilightLine conf (sortBy (flip compare `on` (length . snd )) (outTokens out)) (unpack (outLine out))
+    | otherwise  = unpack (outLine out)
+    where unpack = if u then UC.decode . B.unpack else C.unpack


 showFileName :: Config -> Options -> String -> String
diff --git a/src/CGrep/Strategy/BoyerMoore.hs b/src/CGrep/Strategy/BoyerMoore.hs
index e135ddb..87ba172 100644
--- a/src/CGrep/Strategy/BoyerMoore.hs
+++ b/src/CGrep/Strategy/BoyerMoore.hs
@@ -20,7 +20,9 @@

 module CGrep.Strategy.BoyerMoore (search) where

+import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8  as C
+import qualified Codec.Binary.UTF8.String as UC

 import Control.Monad.Trans.Reader
 import Control.Monad.IO.Class
@@ -50,10 +52,12 @@ search f patterns = do

     -- transform text

-    let [text''', _ , _ , _] = scanr ($) text [ expandMultiline opt
-                                              , contextFilter (getFileLang opt filename) (mkContextFilter opt)
-                                              , ignoreCase opt
-                                              ]
+    let utext = if utf8 opt then C.pack $ UC.decode $ B.unpack text else text
+
+    let [text''', _ , _ , _] = scanr ($) utext [ expandMultiline opt
+                                               , contextFilter (getFileLang opt filename) (mkContextFilter opt)
+                                               , ignoreCase opt
+                                               ]

     -- make shallow search

diff --git a/src/CGrep/Strategy/Cpp/Semantic.hs b/src/CGrep/Strategy/Cpp/Semantic.hs
index b0e9fd8..a3c8722 100644
--- a/src/CGrep/Strategy/Cpp/Semantic.hs
+++ b/src/CGrep/Strategy/Cpp/Semantic.hs
@@ -18,7 +18,10 @@

 module CGrep.Strategy.Cpp.Semantic (search) where

+import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
+
 import qualified CGrep.Parser.Cpp.Token  as Cpp

 import Control.Monad.Trans.Reader
@@ -35,6 +38,7 @@ import CGrep.Output
 import CGrep.Parser.WildCard

 import Reader
+import Options
 import Debug
 import Util

@@ -49,10 +53,12 @@ search f patterns = do

     -- transform text

+    let utext = if utf8 opt then C.pack $ UC.decode $ B.unpack text else text
+
     let filt = (mkContextFilter opt) { getFilterComment = False }


-    let [text''', text'' , text', _] = scanr ($) text  [ expandMultiline opt
+    let [text''', text'' , text', _] = scanr ($) utext [ expandMultiline opt
                                                        , contextFilter (getFileLang opt filename) filt
                                                        , ignoreCase opt
                                                        ]
diff --git a/src/CGrep/Strategy/Cpp/Tokenizer.hs b/src/CGrep/Strategy/Cpp/Tokenizer.hs
index d7b2b1e..1ebda1a 100644
--- a/src/CGrep/Strategy/Cpp/Tokenizer.hs
+++ b/src/CGrep/Strategy/Cpp/Tokenizer.hs
@@ -18,7 +18,10 @@

 module CGrep.Strategy.Cpp.Tokenizer (search) where

+import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
+
 import qualified CGrep.Parser.Cpp.Token as Cpp

 import Control.Monad.Trans.Reader
@@ -48,12 +51,14 @@ search f ps = do

     -- transform text

+    let utext = if utf8 opt then C.pack $ UC.decode $ B.unpack text else text
+
     let filt = (mkContextFilter opt) { getFilterComment = False }

-    let [text''', _ , text', _] = scanr ($) text [ expandMultiline opt
-                                                 , contextFilter (getFileLang opt filename) filt
-                                                 , ignoreCase opt
-                                                 ]
+    let [text''', _ , text', _] = scanr ($) utext [ expandMultiline opt
+                                                  , contextFilter (getFileLang opt filename) filt
+                                                  , ignoreCase opt
+                                                  ]


     putStrLevel1 $ "strategy  : running C/C++ token search on " ++ filename ++ "..."
diff --git a/src/CGrep/Strategy/Generic/Semantic.hs b/src/CGrep/Strategy/Generic/Semantic.hs
index 4787063..b184a93 100644
--- a/src/CGrep/Strategy/Generic/Semantic.hs
+++ b/src/CGrep/Strategy/Generic/Semantic.hs
@@ -18,7 +18,10 @@

 module CGrep.Strategy.Generic.Semantic (search) where

+import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
+
 import qualified CGrep.Parser.Generic.Token as Generic

 import CGrep.Filter
@@ -37,6 +40,7 @@ import Data.Function
 import Data.Maybe

 import Reader
+import Options
 import Debug
 import Util

@@ -51,10 +55,12 @@ search f ps = do

     -- transform text

-    let [text''', text'', text', _ ] = scanr ($) text [ expandMultiline opt
-                                                      , contextFilter (getFileLang opt filename) filt
-                                                      , ignoreCase opt
-                                                      ]
+    let utext = if utf8 opt then C.pack $ UC.decode $ B.unpack text else text
+
+    let [text''', text'', text', _ ] = scanr ($) utext [ expandMultiline opt
+                                                       , contextFilter (getFileLang opt filename) filt
+                                                       , ignoreCase opt
+                                                       ]

         filt  = (mkContextFilter opt) { getFilterComment = False }

diff --git a/src/CGrep/Strategy/Levenshtein.hs b/src/CGrep/Strategy/Levenshtein.hs
index 6beb514..42ac34f 100644
--- a/src/CGrep/Strategy/Levenshtein.hs
+++ b/src/CGrep/Strategy/Levenshtein.hs
@@ -18,7 +18,9 @@

 module CGrep.Strategy.Levenshtein (search) where

+import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC

 import Control.Monad.Trans.Reader
 import Control.Monad.IO.Class
@@ -31,6 +33,7 @@ import CGrep.Distance
 import CGrep.Token

 import Reader
+import Options
 import Debug


@@ -44,10 +47,12 @@ search f patterns = do

     -- transform text

-    let [text''', _ , _ , _] = scanr ($) text [ expandMultiline opt
-                                              , contextFilter (getFileLang opt filename) (mkContextFilter opt)
-                                              , ignoreCase opt
-                                              ]
+    let utext = if utf8 opt then C.pack $ UC.decode $ B.unpack text else text
+
+    let [text''', _ , _ , _] = scanr ($) utext [ expandMultiline opt
+                                               , contextFilter (getFileLang opt filename) (mkContextFilter opt)
+                                               , ignoreCase opt
+                                               ]

     -- parse source code, get the Cpp.Token list...

diff --git a/src/CGrep/Strategy/Regex.hs b/src/CGrep/Strategy/Regex.hs
index deda70d..7a38d6a 100644
--- a/src/CGrep/Strategy/Regex.hs
+++ b/src/CGrep/Strategy/Regex.hs
@@ -20,7 +20,9 @@

 module CGrep.Strategy.Regex (search) where

+import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC

 import Control.Monad.Trans.Reader
 import Control.Monad.IO.Class
@@ -52,10 +54,12 @@ search f patterns = do

     -- transform text

-    let [text''', _ , _ , _] = scanr ($) text [ expandMultiline opt
-                                              , contextFilter (getFileLang opt filename) (mkContextFilter opt)
-                                              , ignoreCase opt
-                                              ]
+    let utext = if utf8 opt then C.pack $ UC.decode $ B.unpack text else text
+
+    let [text''', _ , _ , _] = scanr ($) utext [ expandMultiline opt
+                                               , contextFilter (getFileLang opt filename) (mkContextFilter opt)
+                                               , ignoreCase opt
+                                               ]

     -- search for matching tokens

diff --git a/src/CmdOptions.hs b/src/CmdOptions.hs
index 1d1f1a6..5b8160e 100644
--- a/src/CmdOptions.hs
+++ b/src/CmdOptions.hs
@@ -76,6 +76,7 @@ options = cmdArgsMode $ Options
           ,     cores  = 0                  &= help "Number of physical processors utilized"
           ,     chunk  = 16                 &= help "Specify the length of chunks"
           ,     asynch = False              &= help "Process chunks asynchronously"
+          ,     utf8 = False                &= groupname "\nMiscellaneous" &= help "Enable UTF8 support"
           ,     debug = 0                   &= groupname "\nMiscellaneous" &= help "Debug level: 1, 2 or 3"
           ,     no_shallow = False          &= help "Disable shallow-search"  &= explicit &= name "no-shallow"
           ,     others = []                 &= args
diff --git a/src/Options.hs b/src/Options.hs
index aeca472..71bb8b3 100644
--- a/src/Options.hs
+++ b/src/Options.hs
@@ -81,6 +81,7 @@ data Options = Options
     ,   chunk               :: Int
     ,   asynch              :: Bool
     -- Misc:
+    ,   utf8                :: Bool
     ,   debug               :: Int
     ,   no_shallow          :: Bool
     ,   others              :: [String]

Localization of changes in getTargetContents did not work fine because original text had to be used in mkOutput too.