WorksApplications/elasticsearch-sudachi

Romanization of "ふ" becomes "ho" with the sudachi_readingform filter.

kasa-taku opened this issue · 1 comment

I think there is a mistake here: the romanization of "ふ" should be "hu", not "ho".

PUT test/
{
    "settings": {
        "index": {
            "analysis": {
                "filter": {
                    "sudachi_romaji_readingform": {
                        "type": "sudachi_readingform",
                        "use_romaji": true
                    }
                },
                "tokenizer": {
                    "sudachi_tokenizer": {
                        "type": "sudachi_tokenizer",
                        "resources_path": "/etc/elasticsearch/sudachi"
                    }
                },
                "analyzer": {
                    "romaji_analyzer": {
                        "tokenizer": "sudachi_tokenizer",
                        "filter": [
                            "sudachi_romaji_readingform"
                        ]
                    }
                }
            }
        }
    }
}

POST test/_analyze
{
  "text":"ふ",
  "analyzer": "romaji_analyzer"
}

This returns:

{
  "tokens" : [
    {
      "token" : "ho",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "word",
      "position" : 0
    }
  ]
}

Target code (the `default` branch of the 'フ' case appends "ho"):
elasticsearch-sudachi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/util/Romanizer.java
lines 480–518

            case 'フ':
                switch(ch2) {
                case 'ァ':
                    builder.append("fwa");
                    i++;
                    break;
                case 'ィ':
                    builder.append("fwi");
                    i++;
                    break;
                case 'ゥ':
                    builder.append("fwu");
                    i++;
                    break;
                case 'ェ':
                    builder.append("fwe");
                    i++;
                    break;
                case 'ォ':
                    builder.append("fwo");
                    i++;
                    break;
                case 'ャ':
                    builder.append("fya");
                    i++;
                    break;
                case 'ュ':
                    builder.append("fyu");
                    i++;
                    break;
                case 'ョ':
                    builder.append("fyo");
                    i++;
                    break;
                default:
                    builder.append("ho");
                    break;
                }
                break;

Thank you for your bug report!
I've fixed it.