cgrep does not work well with Unicode input
lyokha opened this issue · 3 comments
Hi Nicola!
I found that cgrep is unable to process UTF-8 encoded input correctly. Here is an image that shows the issue.
I ran it recursively on a directory with md files full of Russian UTF-8 encoded files. Last two commands show correct output from program ack. You can see that not only cgrep misses several matches (less matches for ARGS and no match for russian word потому), but also it prints garbage on output.
I made a patch that fixes the issue for BoyerMoore, Regex and CppTokenizer searches (the changes are all similar and can be easily expanded to other types of searches). Here it is.
diff --git a/cgrep.cabal b/cgrep.cabal
index b9ed1e6..c7174ea 100644
--- a/cgrep.cabal
+++ b/cgrep.cabal
@@ -48,6 +48,7 @@ Executable cgrep
mtl >= 2.0,
unix-compat >= 0.4,
async >= 2.0,
+ utf8-string,
transformers,
process
Ghc-options: -O2 -Wall -threaded -rtsopts -with-rtsopts=-N
diff --git a/src/CGrep/Output.hs b/src/CGrep/Output.hs
index 3176baf..a62b8a9 100644
--- a/src/CGrep/Output.hs
+++ b/src/CGrep/Output.hs
@@ -26,7 +26,9 @@ module CGrep.Output (Output(..),
showFile,
showBold) where
+import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
import System.Console.ANSI
#ifdef ENABLE_HINT
@@ -250,8 +252,8 @@ showTokens Options { show_match = st } out
showLine :: Config -> Options -> Output -> String
showLine conf Options { color = c, no_color = c' } out
- | c && not c'= hilightLine conf (sortBy (flip compare `on` (length . snd )) (outTokens out)) (C.unpack (outLine out))
- | otherwise = C.unpack (outLine out)
+ | c && not c'= hilightLine conf (sortBy (flip compare `on` (length . snd )) (outTokens out)) (UC.decode (B.unpack (outLine out)))
+ | otherwise = UC.decode (B.unpack (outLine out))
showFileName :: Config -> Options -> String -> String
diff --git a/src/CGrep/Strategy/BoyerMoore.hs b/src/CGrep/Strategy/BoyerMoore.hs
index e135ddb..98da613 100644
--- a/src/CGrep/Strategy/BoyerMoore.hs
+++ b/src/CGrep/Strategy/BoyerMoore.hs
@@ -20,7 +20,9 @@
module CGrep.Strategy.BoyerMoore (search) where
+import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
import Control.Monad.Trans.Reader
import Control.Monad.IO.Class
@@ -50,10 +52,12 @@ search f patterns = do
-- transform text
- let [text''', _ , _ , _] = scanr ($) text [ expandMultiline opt
- , contextFilter (getFileLang opt filename) (mkContextFilter opt)
- , ignoreCase opt
- ]
+ let utext = C.pack $ UC.decode $ B.unpack text
+
+ let [text''', _ , _ , _] = scanr ($) utext [ expandMultiline opt
+ , contextFilter (getFileLang opt filename) (mkContextFilter opt)
+ , ignoreCase opt
+ ]
-- make shallow search
diff --git a/src/CGrep/Strategy/Cpp/Tokenizer.hs b/src/CGrep/Strategy/Cpp/Tokenizer.hs
index d7b2b1e..017e0b9 100644
--- a/src/CGrep/Strategy/Cpp/Tokenizer.hs
+++ b/src/CGrep/Strategy/Cpp/Tokenizer.hs
@@ -18,7 +18,10 @@
module CGrep.Strategy.Cpp.Tokenizer (search) where
+import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
+
import qualified CGrep.Parser.Cpp.Token as Cpp
import Control.Monad.Trans.Reader
@@ -48,12 +51,14 @@ search f ps = do
-- transform text
+ let utext = C.pack $ UC.decode $ B.unpack text
+
let filt = (mkContextFilter opt) { getFilterComment = False }
- let [text''', _ , text', _] = scanr ($) text [ expandMultiline opt
- , contextFilter (getFileLang opt filename) filt
- , ignoreCase opt
- ]
+ let [text''', _ , text', _] = scanr ($) utext [ expandMultiline opt
+ , contextFilter (getFileLang opt filename) filt
+ , ignoreCase opt
+ ]
putStrLevel1 $ "strategy : running C/C++ token search on " ++ filename ++ "..."
diff --git a/src/CGrep/Strategy/Regex.hs b/src/CGrep/Strategy/Regex.hs
index deda70d..2e17d30 100644
--- a/src/CGrep/Strategy/Regex.hs
+++ b/src/CGrep/Strategy/Regex.hs
@@ -20,7 +20,9 @@
module CGrep.Strategy.Regex (search) where
+import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
import Control.Monad.Trans.Reader
import Control.Monad.IO.Class
@@ -52,10 +54,12 @@ search f patterns = do
-- transform text
- let [text''', _ , _ , _] = scanr ($) text [ expandMultiline opt
- , contextFilter (getFileLang opt filename) (mkContextFilter opt)
- , ignoreCase opt
- ]
+ let utext = C.pack $ UC.decode $ B.unpack text
+
+ let [text''', _ , _ , _] = scanr ($) utext [ expandMultiline opt
+ , contextFilter (getFileLang opt filename) (mkContextFilter opt)
+ , ignoreCase opt
+ ]
-- search for matching tokens
After this patch cgrep output looks correct:
If you do not mind against changes, I can apply them myself as you kindly gave me permission for this :) I am not sure about what exact dependency on utf-string must be in cgrep.cabal though.
Unfortunately it makes cgrep slower than ack, probably it makes sense to involve some new option to turn UTF-8 on.
I localized UTF-8 decoding from strategies to getTargetContents in CGrep/Output.hs. Now the patch is shorter.
diff --git a/cgrep.cabal b/cgrep.cabal
index b9ed1e6..c7174ea 100644
--- a/cgrep.cabal
+++ b/cgrep.cabal
@@ -48,6 +48,7 @@ Executable cgrep
mtl >= 2.0,
unix-compat >= 0.4,
async >= 2.0,
+ utf8-string,
transformers,
process
Ghc-options: -O2 -Wall -threaded -rtsopts -with-rtsopts=-N
diff --git a/src/CGrep/Common.hs b/src/CGrep/Common.hs
index 00b6fb9..861404f 100644
--- a/src/CGrep/Common.hs
+++ b/src/CGrep/Common.hs
@@ -25,7 +25,9 @@ module CGrep.Common (Text8,
ignoreCase,
trim, trim8) where
+import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
import qualified Data.ByteString.Search as SC
import Data.Char
@@ -52,8 +54,8 @@ getTargetName name = name
getTargetContents :: FilePath -> IO Text8
-getTargetContents [] = C.getContents
-getTargetContents xs = C.readFile xs
+getTargetContents [] = B.getContents >>= return . C.pack . UC.decode . B.unpack
+getTargetContents xs = B.readFile xs >>= return . C.pack . UC.decode . B.unpack
shallowSearch :: [Text8] -> Text8 -> [[Int]]
diff --git a/src/CGrep/Output.hs b/src/CGrep/Output.hs
index 3176baf..a62b8a9 100644
--- a/src/CGrep/Output.hs
+++ b/src/CGrep/Output.hs
@@ -26,7 +26,9 @@ module CGrep.Output (Output(..),
showFile,
showBold) where
+import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
import System.Console.ANSI
#ifdef ENABLE_HINT
@@ -250,8 +252,8 @@ showTokens Options { show_match = st } out
showLine :: Config -> Options -> Output -> String
showLine conf Options { color = c, no_color = c' } out
- | c && not c'= hilightLine conf (sortBy (flip compare `on` (length . snd )) (outTokens out)) (C.unpack (outLine out))
- | otherwise = C.unpack (outLine out)
+ | c && not c'= hilightLine conf (sortBy (flip compare `on` (length . snd )) (outTokens out)) (UC.decode (B.unpack (outLine out)))
+ | otherwise = UC.decode (B.unpack (outLine out))
showFileName :: Config -> Options -> String -> String
The most time-consuming part is now getTargetContents: I cannot find a fast way to decode UTF-8 from ByteString, if there is no such a way then it makes sense to turn UTF-8 support using an extra command-line option.
Here is the final patch with all strategies implemented and a new option -u
(--utf8
). If the new option is not set cgrep will do all as before.
diff --git a/cgrep.cabal b/cgrep.cabal
index b9ed1e6..c7174ea 100644
--- a/cgrep.cabal
+++ b/cgrep.cabal
@@ -48,6 +48,7 @@ Executable cgrep
mtl >= 2.0,
unix-compat >= 0.4,
async >= 2.0,
+ utf8-string,
transformers,
process
Ghc-options: -O2 -Wall -threaded -rtsopts -with-rtsopts=-N
diff --git a/src/CGrep/Output.hs b/src/CGrep/Output.hs
index 3176baf..dd8bdff 100644
--- a/src/CGrep/Output.hs
+++ b/src/CGrep/Output.hs
@@ -26,7 +26,9 @@ module CGrep.Output (Output(..),
showFile,
showBold) where
+import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
import System.Console.ANSI
#ifdef ENABLE_HINT
@@ -249,9 +251,10 @@ showTokens Options { show_match = st } out
showLine :: Config -> Options -> Output -> String
-showLine conf Options { color = c, no_color = c' } out
- | c && not c'= hilightLine conf (sortBy (flip compare `on` (length . snd )) (outTokens out)) (C.unpack (outLine out))
- | otherwise = C.unpack (outLine out)
+showLine conf Options { color = c, no_color = c', utf8 = u } out
+ | c && not c'= hilightLine conf (sortBy (flip compare `on` (length . snd )) (outTokens out)) (unpack (outLine out))
+ | otherwise = unpack (outLine out)
+ where unpack = if u then UC.decode . B.unpack else C.unpack
showFileName :: Config -> Options -> String -> String
diff --git a/src/CGrep/Strategy/BoyerMoore.hs b/src/CGrep/Strategy/BoyerMoore.hs
index e135ddb..87ba172 100644
--- a/src/CGrep/Strategy/BoyerMoore.hs
+++ b/src/CGrep/Strategy/BoyerMoore.hs
@@ -20,7 +20,9 @@
module CGrep.Strategy.BoyerMoore (search) where
+import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
import Control.Monad.Trans.Reader
import Control.Monad.IO.Class
@@ -50,10 +52,12 @@ search f patterns = do
-- transform text
- let [text''', _ , _ , _] = scanr ($) text [ expandMultiline opt
- , contextFilter (getFileLang opt filename) (mkContextFilter opt)
- , ignoreCase opt
- ]
+ let utext = if utf8 opt then C.pack $ UC.decode $ B.unpack text else text
+
+ let [text''', _ , _ , _] = scanr ($) utext [ expandMultiline opt
+ , contextFilter (getFileLang opt filename) (mkContextFilter opt)
+ , ignoreCase opt
+ ]
-- make shallow search
diff --git a/src/CGrep/Strategy/Cpp/Semantic.hs b/src/CGrep/Strategy/Cpp/Semantic.hs
index b0e9fd8..a3c8722 100644
--- a/src/CGrep/Strategy/Cpp/Semantic.hs
+++ b/src/CGrep/Strategy/Cpp/Semantic.hs
@@ -18,7 +18,10 @@
module CGrep.Strategy.Cpp.Semantic (search) where
+import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
+
import qualified CGrep.Parser.Cpp.Token as Cpp
import Control.Monad.Trans.Reader
@@ -35,6 +38,7 @@ import CGrep.Output
import CGrep.Parser.WildCard
import Reader
+import Options
import Debug
import Util
@@ -49,10 +53,12 @@ search f patterns = do
-- transform text
+ let utext = if utf8 opt then C.pack $ UC.decode $ B.unpack text else text
+
let filt = (mkContextFilter opt) { getFilterComment = False }
- let [text''', text'' , text', _] = scanr ($) text [ expandMultiline opt
+ let [text''', text'' , text', _] = scanr ($) utext [ expandMultiline opt
, contextFilter (getFileLang opt filename) filt
, ignoreCase opt
]
diff --git a/src/CGrep/Strategy/Cpp/Tokenizer.hs b/src/CGrep/Strategy/Cpp/Tokenizer.hs
index d7b2b1e..1ebda1a 100644
--- a/src/CGrep/Strategy/Cpp/Tokenizer.hs
+++ b/src/CGrep/Strategy/Cpp/Tokenizer.hs
@@ -18,7 +18,10 @@
module CGrep.Strategy.Cpp.Tokenizer (search) where
+import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
+
import qualified CGrep.Parser.Cpp.Token as Cpp
import Control.Monad.Trans.Reader
@@ -48,12 +51,14 @@ search f ps = do
-- transform text
+ let utext = if utf8 opt then C.pack $ UC.decode $ B.unpack text else text
+
let filt = (mkContextFilter opt) { getFilterComment = False }
- let [text''', _ , text', _] = scanr ($) text [ expandMultiline opt
- , contextFilter (getFileLang opt filename) filt
- , ignoreCase opt
- ]
+ let [text''', _ , text', _] = scanr ($) utext [ expandMultiline opt
+ , contextFilter (getFileLang opt filename) filt
+ , ignoreCase opt
+ ]
putStrLevel1 $ "strategy : running C/C++ token search on " ++ filename ++ "..."
diff --git a/src/CGrep/Strategy/Generic/Semantic.hs b/src/CGrep/Strategy/Generic/Semantic.hs
index 4787063..b184a93 100644
--- a/src/CGrep/Strategy/Generic/Semantic.hs
+++ b/src/CGrep/Strategy/Generic/Semantic.hs
@@ -18,7 +18,10 @@
module CGrep.Strategy.Generic.Semantic (search) where
+import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
+
import qualified CGrep.Parser.Generic.Token as Generic
import CGrep.Filter
@@ -37,6 +40,7 @@ import Data.Function
import Data.Maybe
import Reader
+import Options
import Debug
import Util
@@ -51,10 +55,12 @@ search f ps = do
-- transform text
- let [text''', text'', text', _ ] = scanr ($) text [ expandMultiline opt
- , contextFilter (getFileLang opt filename) filt
- , ignoreCase opt
- ]
+ let utext = if utf8 opt then C.pack $ UC.decode $ B.unpack text else text
+
+ let [text''', text'', text', _ ] = scanr ($) utext [ expandMultiline opt
+ , contextFilter (getFileLang opt filename) filt
+ , ignoreCase opt
+ ]
filt = (mkContextFilter opt) { getFilterComment = False }
diff --git a/src/CGrep/Strategy/Levenshtein.hs b/src/CGrep/Strategy/Levenshtein.hs
index 6beb514..42ac34f 100644
--- a/src/CGrep/Strategy/Levenshtein.hs
+++ b/src/CGrep/Strategy/Levenshtein.hs
@@ -18,7 +18,9 @@
module CGrep.Strategy.Levenshtein (search) where
+import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
import Control.Monad.Trans.Reader
import Control.Monad.IO.Class
@@ -31,6 +33,7 @@ import CGrep.Distance
import CGrep.Token
import Reader
+import Options
import Debug
@@ -44,10 +47,12 @@ search f patterns = do
-- transform text
- let [text''', _ , _ , _] = scanr ($) text [ expandMultiline opt
- , contextFilter (getFileLang opt filename) (mkContextFilter opt)
- , ignoreCase opt
- ]
+ let utext = if utf8 opt then C.pack $ UC.decode $ B.unpack text else text
+
+ let [text''', _ , _ , _] = scanr ($) utext [ expandMultiline opt
+ , contextFilter (getFileLang opt filename) (mkContextFilter opt)
+ , ignoreCase opt
+ ]
-- parse source code, get the Cpp.Token list...
diff --git a/src/CGrep/Strategy/Regex.hs b/src/CGrep/Strategy/Regex.hs
index deda70d..7a38d6a 100644
--- a/src/CGrep/Strategy/Regex.hs
+++ b/src/CGrep/Strategy/Regex.hs
@@ -20,7 +20,9 @@
module CGrep.Strategy.Regex (search) where
+import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
+import qualified Codec.Binary.UTF8.String as UC
import Control.Monad.Trans.Reader
import Control.Monad.IO.Class
@@ -52,10 +54,12 @@ search f patterns = do
-- transform text
- let [text''', _ , _ , _] = scanr ($) text [ expandMultiline opt
- , contextFilter (getFileLang opt filename) (mkContextFilter opt)
- , ignoreCase opt
- ]
+ let utext = if utf8 opt then C.pack $ UC.decode $ B.unpack text else text
+
+ let [text''', _ , _ , _] = scanr ($) utext [ expandMultiline opt
+ , contextFilter (getFileLang opt filename) (mkContextFilter opt)
+ , ignoreCase opt
+ ]
-- search for matching tokens
diff --git a/src/CmdOptions.hs b/src/CmdOptions.hs
index 1d1f1a6..5b8160e 100644
--- a/src/CmdOptions.hs
+++ b/src/CmdOptions.hs
@@ -76,6 +76,7 @@ options = cmdArgsMode $ Options
, cores = 0 &= help "Number of physical processors utilized"
, chunk = 16 &= help "Specify the length of chunks"
, asynch = False &= help "Process chunks asynchronously"
+ , utf8 = False &= groupname "\nMiscellaneous" &= help "Enable UTF8 support"
, debug = 0 &= groupname "\nMiscellaneous" &= help "Debug level: 1, 2 or 3"
, no_shallow = False &= help "Disable shallow-search" &= explicit &= name "no-shallow"
, others = [] &= args
diff --git a/src/Options.hs b/src/Options.hs
index aeca472..71bb8b3 100644
--- a/src/Options.hs
+++ b/src/Options.hs
@@ -81,6 +81,7 @@ data Options = Options
, chunk :: Int
, asynch :: Bool
-- Misc:
+ , utf8 :: Bool
, debug :: Int
, no_shallow :: Bool
, others :: [String]
Localization of changes in getTargetContents did not work fine because original text had to be used in mkOutput too.