GNU Lesser General Public License version 2.1 or, at your option, any later version.
pip install -U sacremoses
Tokenizer and Detokenizer
>>> from sacremoses import MosesTokenizer, MosesDetokenizer
>>> mt = MosesTokenizer()
>>> text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
>>> expected_tokenized = u'This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
>>> tokenized_text = mt.tokenize(text, return_str=True)
>>> tokenized_text == expected_tokenized
True
>>> mt, md = MosesTokenizer(), MosesDetokenizer()
>>> sent = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
>>> expected_tokens = [u'This', u'ain', u'&apos;t', u'funny', u'.', u'It', u'&apos;s', u'actually', u'hillarious', u',', u'yet', u'double', u'Ls', u'.', u'&#124;', u'&#91;', u'&#93;', u'&lt;', u'&gt;', u'&#91;', u'&#93;', u'&amp;', u'You', u'&apos;re', u'gonna', u'shake', u'it', u'off', u'?', u'Don', u'&apos;t', u'?']
>>> expected_detokens = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [] & You're gonna shake it off? Don't?"
>>> mt.tokenize(sent) == expected_tokens
True
>>> md.detokenize(mt.tokenize(sent)) == expected_detokens
True
Truecaser
>>> from sacremoses import MosesTruecaser, MosesTokenizer
# Train a new truecaser from a 'big.txt' file.
>>> mtr = MosesTruecaser()
>>> mtok = MosesTokenizer()
# Save the truecase model to 'big.truecasemodel' using `save_to`
>>> tokenized_docs = [mtok.tokenize(line) for line in open('big.txt')]
>>> mtr.train(tokenized_docs, save_to='big.truecasemodel')
# Save the truecase model to 'big.truecasemodel' after training
# (just in case you forgot to use `save_to`)
>>> mtr = MosesTruecaser()
>>> mtr.train('big.txt')
>>> mtr.save_model('big.truecasemodel')
# Truecase a string after training a model.
>>> mtr = MosesTruecaser()
>>> mtr.train('big.txt')
>>> mtr.truecase("THE ADVENTURES OF SHERLOCK HOLMES")
['the', 'adventures', 'of', 'Sherlock', 'Holmes']
# Load a model and truecase a string using the trained model.
>>> mtr = MosesTruecaser('big.truecasemodel')
>>> mtr.truecase("THE ADVENTURES OF SHERLOCK HOLMES")
['the', 'adventures', 'of', 'Sherlock', 'Holmes']
>>> print(mtr.truecase("THE ADVENTURES OF SHERLOCK HOLMES", return_str=True))
the adventures of Sherlock Holmes
$ pip install -U "sacremoses>=0.0.7"
$ sacremoses --help
Usage: sacremoses [OPTIONS] COMMAND [ARGS]...
Options:
--version Show the version and exit.
-h, --help Show this message and exit.
Commands:
detokenize
detruecase
tokenize
train-truecase
truecase
Tokenizer
$ sacremoses tokenize --help
Usage: sacremoses tokenize [OPTIONS]
Options:
-j, --processes INTEGER No. of processes.
-a, --aggressive-dash-splits Triggers dash split rules.
-x, --xml-escape Escape special characters for XML.
-h, --help Show this message and exit.
$ sacremoses tokenize -j 4 < big.txt > big.txt.tok
100%|██████████████████████████████████| 128457/128457 [00:15<00:00, 8059.72it/s]
Detokenizer
$ sacremoses detokenize --help
Usage: sacremoses detokenize [OPTIONS]
Options:
-j, --processes INTEGER No. of processes.
-x, --xml-unescape Unescape special characters for XML.
-h, --help Show this message and exit.
$ sacremoses detokenize -j 4 < big.txt.tok > big.txt.tok.detok
128457it [00:23, 5355.88it/s]
Train Truecaser
$ sacremoses train-truecase --help
Usage: sacremoses train-truecase [OPTIONS]
Options:
-m, --modelfile TEXT Filename to save the modelfile. [required]
-j, --processes INTEGER No. of processes.
-a, --is-asr A flag to indicate that model is for ASR.
-p, --possibly-use-first-token Use the first token as part of truecasing.
-h, --help Show this message and exit.
$ sacremoses train-truecase -m big.model -j 4 < big.txt.tok
Truecase
$ sacremoses truecase --help
Usage: sacremoses truecase [OPTIONS]
Options:
-m, --modelfile TEXT The trucaser modelfile to use. [required]
-j, --processes INTEGER No. of processes.
-a, --is-asr A flag to indicate that model is for ASR.
-h, --help Show this message and exit.
$ sacremoses truecase -m big.model -j 4 < big.txt.tok > big.txt.tok.true
128457it [00:11, 11411.07it/s]
Detruecase
$ sacremoses detruecase --help
Usage: sacremoses detruecase [OPTIONS]
Options:
-j, --processes INTEGER No. of processes.
-a, --is-headline Whether the file are headlines.
-h, --help Show this message and exit.
$ sacremoses detruecase -j 4 < big.txt.tok.true > big.txt.tok.true.detrue
100%|█████████████████████████████████| 128457/128457 [00:04<00:00, 26945.16it/s]