hightman/scws

根本分不了词啊,哪里错了?

Opened this issue · 2 comments

root@wcjs-test:/usr/local/scws/bin# cat a.txt
奔驰 12.0 2.2 n
蓝天 11.2 2.2 n
每日一问 30.1 5.0 nz

root@wcjs-test:/usr/local/scws/bin# ./scws-gen-dict -c utf8 -i a.txt
Output file exists: Success
root@wcjs-test:/usr/local/scws/bin# ./scws -i '奔驰在每日一问里面好像有点厉害了' -c utf8 -d dict.xdb -A -U
奔驰/n 在/un 每日一问/n 里/un 面/un 好/un 像/un 有/un 点/un 厉/un 害/un 了/un
+--[scws(scws-cli/1.2.3)]----------+
| TextLen: 48 |
| Prepare: 0.0002 (sec) |
| Segment: 0.0003 (sec) |
+--------------------------------+

<?php
        $sh = scws_open();
        scws_set_charset($sh, 'utf8');
        scws_set_dict($sh, '/usr/local/scws/bin/dict.xdb');
        //scws_set_rule($sh, '/path/to/rules.ini');
        $text = "奔驰在每日一问里面好像有点厉害了";
        scws_send_text($sh, $text);
        $top = scws_get_result($sh);
        scws_close($sh);
        print_r($top);
?>

Array
(
    [0] => Array
        (
            [word] => 奔
            [off] => 0
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [1] => Array
        (
            [word] => 驰
            [off] => 3
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [2] => Array
        (
            [word] => 在
            [off] => 6
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [3] => Array
        (
            [word] => 每
            [off] => 9
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [4] => Array
        (
            [word] => 日
            [off] => 12
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [5] => Array
        (
            [word] => 一
            [off] => 15
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [6] => Array
        (
            [word] => 问
            [off] => 18
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [7] => Array
        (
            [word] => 里
            [off] => 21
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [8] => Array
        (
            [word] => 面
            [off] => 24
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [9] => Array
        (
            [word] => 好
            [off] => 27
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [10] => Array
        (
            [word] => 像
            [off] => 30
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [11] => Array
        (
            [word] => 有
            [off] => 33
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [12] => Array
        (
            [word] => 点
            [off] => 36
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [13] => Array
        (
            [word] => 厉
            [off] => 39
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [14] => Array
        (
            [word] => 害
            [off] => 42
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [15] => Array
        (
            [word] => 了
            [off] => 45
            [len] => 3
            [idf] => 0
            [attr] => un
        )

)



$ cat dict_aa.txt
奔驰 12.0 2.2 n
蓝天 11.2 2.2 n
每日一问 30.1 5.0 nz

$ scws-gen-dict -i dict_aa.txt -o dict_aa.xdb
Reading the input file: dict_aa.txt ...OK, total nodes=10
Optimizing... OK
Dump the tree data to: dict_aa.xdb ... OK, all been done!

$ scws -c utf8 -d dict_jieba1.xdb:dict_aa.xdb -N -i "奔驰在每日一问里面好像有点厉害了"
奔驰 在 每日一问 里面 好像 有点 厉害 了

dict_jieba1.xdb 是我用 https://github.com/fxsjy/jieba/tree/master/extra_dict (dict.txt.big和dict.txt.small合併) 編譯的。