How do different tokenizers perform on downstream tasks in scriptio continua languages?: A case study in Japanese
This is the official implementation of the paper "How do different tokenizers perform on downstream tasks in scriptio continua languages?: A case study in Japanese". To reproduce our results, please follow the instructions below.
- Python >= 3.9
- PyTorch 1.8.1
- Transformers 4.24.0.dev0
pip install torch==1.8.1+cu101 torchvision==0.9.1+cu101 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install -e .
cd ..
pip install -r requirements.txt
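As a quick sanity check (assuming the installation above completed without errors), you can confirm the package versions from Python:

```python
# Sanity check of the environment; versions should match the requirements above.
import torch
import transformers

print(torch.__version__)          # expected: 1.8.1+cu101
print(transformers.__version__)   # expected: 4.24.0.dev0
print(torch.cuda.is_available())  # True if the CUDA build can see a GPU
```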
Here, we install the required packages under ${HOME}/usr, but you can choose your preferred location by modifying --prefix.
- Model

git clone https://github.com/taku910/mecab.git
cd mecab/mecab
./configure --prefix=${HOME}/usr --with-charset=UTF8
make
make install
cd ../..
- Dictionary

wget "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM" -O mecab-ipadic-2.7.0-20070801.tar.gz
tar xvzf mecab-ipadic-2.7.0-20070801.tar.gz
cd mecab-ipadic-2.7.0-20070801
./configure --with-mecab-config=$HOME/usr/bin/mecab-config --with-charset=UTF8 --prefix=$HOME/usr
make
make install
cd ..
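To verify the MeCab build and dictionary from Python, a minimal check is shown below (a sketch: it assumes the MeCab Python bindings used later in this README are installed via requirements.txt, and that the dictionary was installed under the $HOME/usr prefix chosen above).

```python
# Minimal MeCab check (sketch; the dictionary path assumes --prefix=$HOME/usr as above).
import os
from MeCab import Tagger

ipadic_path = os.path.expanduser("~/usr/lib/mecab/dic/ipadic")  # assumed install location
tagger = Tagger(f"-Owakati -d {ipadic_path}")
print(tagger.parse("日本語の文を分かち書きします。"))
```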
wget "https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz"
tar xvJf jumanpp-2.0.0-rc3.tar.xz
cd jumanpp-2.0.0-rc3
mkdir build && cd build
curl -LO https://github.com/catchorg/Catch2/releases/download/v2.13.8/catch.hpp
mv catch.hpp ../libs/
cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$HOME/usr
make
make install
echo 'export PATH=$PATH:$HOME/usr' >> ~/.bashrc
echo 'export PATH=$PATH:$HOME/usr/bin' >> ~/.bashrc
cd ..
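To verify that Juman++ is usable from Python via pyknp, the following sketch assumes pyknp is installed from requirements.txt and that the updated PATH is active (e.g. after re-sourcing ~/.bashrc):

```python
# Minimal Juman++ check via pyknp (sketch; requires jumanpp on PATH).
from pyknp import Juman

juman = Juman("jumanpp")
result = juman.analysis("形態素解析の動作確認をします。")
print([mrph.midasi for mrph in result.mrph_list()])
```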
pip install sudachipy
pip install sudachidict_core
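You can confirm that SudachiPy and its core dictionary are installed correctly with the following check, which mirrors how the SudachiPreTokenizer below creates its tokenizer:

```python
# Minimal SudachiPy check (mirrors SudachiPreTokenizer in the script below).
from sudachipy import dictionary

sudachi = dictionary.Dictionary().create()
print([token.surface() for token in sudachi.tokenize("形態素解析の動作確認をします。")])
```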
See https://github.com/daac-tools/vaporetto for more details.
cd data/dict
wget https://github.com/daac-tools/vaporetto/releases/download/v0.5.0/bccwj-suw+unidic+tag.tar.xz
tar xf ./bccwj-suw+unidic+tag.tar.xz
cd ../..
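To check that the downloaded model loads, the sketch below mirrors the VaporettoPreTokenizer defined later in this README; the extracted model path is an assumption, and depending on your vaporetto version you may need to decompress the .zst file first:

```python
# Minimal Vaporetto check (sketch; the model path below is an assumption).
import vaporetto

with open("data/dict/bccwj-suw+unidic+tag/bccwj-suw+unidic+tag.model.zst", "rb") as fp:
    model = fp.read()
tokenizer = vaporetto.Vaporetto(model, predict_tags=False)
print([token.surface() for token in tokenizer.tokenize("形態素解析の動作確認をします。")])
```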
Please see preprocessing_for_tokenizers.
Please see tokenizer.
Please see preprocessing_for_pretraining.
Please see pretraining.
First, please clone the JGLUE repository and download the JGLUE dataset under ./data, following https://github.com/yahoojapan/JGLUE.
Please see marc-ja.
Please see jsts.
Please see jnli.
Please see jsquad.
Please see jcommonsenseqa.
Please see ner.
Please see dependency_parsing.
The pretrained weights are available on the Hugging Face Hub.
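The weights can be loaded with the standard Transformers API; the sketch below uses a placeholder repository ID, which should be replaced with the actual one published on the Hub:

```python
# Loading the pretrained weights (sketch; "<org>/<model-name>" is a placeholder
# for the actual repository ID on the Hugging Face Hub).
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("<org>/<model-name>")
```

Note that the corresponding tokenizer cannot be loaded this way; see the instructions below.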
The trained dictionary files are available from this repository.
| | BPE | Unigram | WordPiece |
|---|---|---|---|
| MeCab | mecab_bpe.json | mecab_unigram.json | mecab_wordpiece.json |
| Juman++ | jumanpp_bpe.json | jumanpp_unigram.json | jumanpp_wordpiece.json |
| Sudachi | sudachi_bpe.json | sudachi_unigram.json | sudachi_wordpiece.json |
| Vaporetto | vaporetto_bpe.json | vaporetto_unigram.json | vaporetto_wordpiece.json |
| Nothing | nothing_bpe.json | nothing_unigram.json | nothing_wordpiece.json |
Because we use customised tokenizers, AutoTokenizer.from_pretrained() cannot load these dictionary files. To load a dictionary file and construct a tokenizer, please use the following script and call build_tokenizer().
from typing import Optional
from tokenizers import Tokenizer
from tokenizers import NormalizedString, PreTokenizedString
from tokenizers.processors import BertProcessing
from tokenizers.pre_tokenizers import PreTokenizer
from transformers import PreTrainedTokenizerFast
from pyknp import Juman
from MeCab import Tagger
from sudachipy import tokenizer
from sudachipy import dictionary
import vaporetto
import mojimoji
import traceback
import textspan
class JumanPreTokenizer:
def __init__(self):
self.juman = Juman("jumanpp", multithreading=True)
def tokenize(self, sequence: str) -> list[str]:
text = mojimoji.han_to_zen(sequence).rstrip()
try:
result = self.juman.analysis(text)
except:
traceback.print_exc()
text = ""
result = self.juman.analysis(text)
return [mrph.midasi for mrph in result.mrph_list()]
def custom_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
text = str(normalized_string)
tokens = self.tokenize(text)
tokens_spans = textspan.get_original_spans(tokens, text)
        return [normalized_string[st:ed] for char_spans in tokens_spans for st, ed in char_spans]
def pre_tokenize(self, pretok: PreTokenizedString):
pretok.split(self.custom_split)
class MecabPreTokenizer:
def __init__(self, mecab_dict_path: Optional[str] = None):
mecab_option = (f"-Owakati -d {mecab_dict_path}" if mecab_dict_path is not None else "-Owakati")
self.mecab = Tagger(mecab_option)
def tokenize(self, sequence: str) -> list[str]:
return self.mecab.parse(sequence).strip().split(" ")
def custom_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
text = str(normalized_string)
tokens = self.tokenize(text)
tokens_spans = textspan.get_original_spans(tokens, text)
        return [normalized_string[st:ed] for char_spans in tokens_spans for st, ed in char_spans]
def pre_tokenize(self, pretok: PreTokenizedString):
pretok.split(self.custom_split)
class SudachiPreTokenizer:
def __init__(self, mecab_dict_path: Optional[str] = None):
self.sudachi = dictionary.Dictionary().create()
def tokenize(self, sequence: str) -> list[str]:
return [token.surface() for token in self.sudachi.tokenize(sequence)]
def custom_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
text = str(normalized_string)
tokens = self.tokenize(text)
tokens_spans = textspan.get_original_spans(tokens, text)
        return [normalized_string[st:ed] for char_spans in tokens_spans for st, ed in char_spans]
def pre_tokenize(self, pretok: PreTokenizedString):
pretok.split(self.custom_split)
class VaporettoPreTokenizer:
def __init__(self, unidic_path: str):
with open(unidic_path, 'rb') as fp:
model = fp.read()
self.tokenizer = vaporetto.Vaporetto(model, predict_tags=False)
def tokenize(self, sequence: str) -> list[str]:
tokens = self.tokenizer.tokenize(sequence)
return [token.surface() for token in tokens]
def custom_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
text = str(normalized_string)
tokens = self.tokenize(text)
tokens_spans = textspan.get_original_spans(tokens, text)
        return [normalized_string[st:ed] for char_spans in tokens_spans for st, ed in char_spans]
def pre_tokenize(self, pretok: PreTokenizedString):
pretok.split(self.custom_split)
def build_tokenizer(
dict_path: str,
pretokenizer_type: str = None,
vaporetto_model_path: str = None
) -> PreTrainedTokenizerFast:
# load a tokenizer
tokenizer = Tokenizer.from_file(dict_path)
# load a pre-tokenizer
if pretokenizer_type == 'mecab':
pre_tokenizer = MecabPreTokenizer()
elif pretokenizer_type == 'jumanpp':
pre_tokenizer = JumanPreTokenizer()
elif pretokenizer_type == 'vaporetto':
pre_tokenizer = VaporettoPreTokenizer(vaporetto_model_path)
elif pretokenizer_type == 'sudachi':
pre_tokenizer = SudachiPreTokenizer()
elif pretokenizer_type == 'nothing':
pre_tokenizer = None
else:
raise NotImplementedError()
tokenizer.post_processor = BertProcessing(
cls=("[CLS]", tokenizer.token_to_id('[CLS]')),
sep=("[SEP]", tokenizer.token_to_id('[SEP]'))
)
# convert to PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
tokenizer_object=tokenizer,
unk_token='[UNK]',
cls_token='[CLS]',
sep_token='[SEP]',
pad_token='[PAD]',
mask_token='[MASK]'
)
# set a pre-tokenizer
if pre_tokenizer is not None:
tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(pre_tokenizer)
return tokenizer
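For example, a tokenizer can be built and used as follows (a sketch; the dictionary file and pre-tokenizer type are illustrative and should be chosen as a matching pair from the table above):

```python
# Example usage of build_tokenizer (sketch; the dictionary path is illustrative).
tokenizer = build_tokenizer(
    dict_path="mecab_bpe.json",  # one of the dictionary files listed above
    pretokenizer_type="mecab",   # must match the analyzer used to train the dictionary
)
encoded = tokenizer("日本語のテキストをトークナイズします。")
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))
```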
If you use our code or findings, please cite our paper:

@inproceedings{fujii-etal-2023-different,
    title = "How do different tokenizers perform on downstream tasks in scriptio continua languages?: A case study in Japanese",
    author = "Takuro Fujii and Koki Shibata and Atsuki Yamaguchi and Terufumi Morishita and Yasuhiro Sogawa",
    booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics: Student Research Workshop",
    month = jul,
    year = "2023",
    address = "Toronto, Canada",
    publisher = "Association for Computational Linguistics",
}
This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License unless otherwise specified.