cd /opt/tts-service-demo
# For the Kazakh language
python3 parse_kz.py --help
python3 parse_kz.py --folderpath /opt/demo_files  # note: do not add a trailing "/" to the path (known limitation; see the sketch below)
# For Russian/English
python3 parse_ru_en.py --help
python3 parse_ru_en.py --folderpath /opt/demo_files_en --lang en
python3 parse_ru_en.py --folderpath /opt/demo_files_ru --lang ru
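If you call these scripts from your own code, you can strip the trailing "/" yourself before passing the path. A minimal sketch, assuming only the script names and flags shown above; the wrapper itself is illustrative:

import subprocess

def run_parser(script, folderpath, lang=None):
    # strip a trailing "/" to work around the limitation noted above
    cmd = ["python3", script, "--folderpath", folderpath.rstrip("/")]
    if lang is not None:
        cmd += ["--lang", lang]
    subprocess.run(cmd, check=True)

run_parser("parse_kz.py", "/opt/demo_files/")             # trailing slash is removed
run_parser("parse_ru_en.py", "/opt/demo_files_en", "en")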
A short guide to installing other STT/TTS services
Install ESPnet as a prerequisite
# install cmake
sudo apt-get install cmake
# install sox
sudo apt-get install sox
# install sndfile
sudo apt-get install libsndfile1-dev
# install ffmpeg
sudo apt-get install ffmpeg
# install flac
sudo apt-get install flac
# clone espnet repo
cd <any-place>
git clone https://github.com/espnet/espnet
# setup venv
cd <espnet-root>/tools
./setup_venv.sh $(command -v python3)
# install espnet
cd <espnet-root>/tools
make
# optional: install with CUDA support
cd <espnet-root>/tools
make TH_VERSION=1.10.1 CUDA_VERSION=11.3
# If the nvcc command is not available, packages are installed in CPU mode by default. To control this explicitly, pass the CPU_ONLY option.
cd <espnet-root>/tools
make CPU_ONLY=0
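To check which build ended up in the venv, you can inspect PyTorch from inside it; this quick check is not part of the official ESPnet setup, just a convenience:

import torch

print("torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())  # False means the CPU-only build was installed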
# go to asr dir
cd ISSAI_SAIDA_Kazakh_ASR/asr1
ln -s ../../../tools/kaldi/egs/wsj/s5/steps steps
ln -s ../../../tools/kaldi/egs/wsj/s5/utils utils
# create exp
./setup_experiment.sh <exp-name>
# download weights
wget https://issai.nu.edu.kz/wp-content/uploads/2020/10/model.tar.gz
# untar
tar -zxvf model.tar.gz -C ksc/<exp-name>/
# To decode a single audio, specify paths to the following files inside recog_wav.sh script:
lang_model=rnnlm.model.best        # path to the language model
cmvn=data/train/cmvn.ark           # path to cmvn.ark
recog_model=model.last10.avg.best  # path to the e2e model (transformer)
# Then, run the following script:
./recog_wav.sh <path-to-audio-file>
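To decode a whole folder instead of a single file, you can loop over recog_wav.sh from Python. A minimal sketch, assuming the script prints its hypothesis to stdout (the output format may differ in your setup):

import glob
import subprocess

# run recog_wav.sh for every wav file in a directory and print its output
for wav in sorted(glob.glob("/path/to/wavs/*.wav")):
    result = subprocess.run(["./recog_wav.sh", wav], capture_output=True, text=True)
    print(wav)
    print(result.stdout)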
Installation process
# Install prerequisites
sudo apt-get update
sudo apt-get upgrade
sudo apt-get install git
sudo apt-get install bc
sudo apt-get install g++
sudo apt-get install zlib1g-dev make automake autoconf bzip2 libtool subversion
sudo apt-get install libatlas3-base
# Install Kaldi:
git clone https://github.com/kaldi-asr/kaldi.git kaldi --origin upstream
cd kaldi/tools
extras/check_dependencies.sh
make
# Compile Kaldi:
cd kaldi/src
./configure --shared
make depend
make
# To install the kaldi_lm language model:
cd kaldi/tools
extras/install_kaldi_lm.sh
# Install VOSK
git clone https://github.com/alphacep/vosk-api
cd vosk-api/src
KALDI_ROOT=<KALDI_ROOT> make
cd ../python
python3 setup.py install
# Install models and run recognition
cd vosk-api/python/example
wget https://alphacephei.com/vosk/models/vosk-model-kz-0.15.zip
unzip vosk-model-kz-0.15.zip
mv vosk-model-kz-0.15 model
python3 ./test_simple.py test.wav # your wav file
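test_simple.py comes with the repository; if you want to call VOSK directly, the core API is Model/KaldiRecognizer. A minimal sketch for a 16 kHz mono 16-bit PCM wav (the file and model paths are illustrative):

import json
import wave

from vosk import KaldiRecognizer, Model

wf = wave.open("test.wav", "rb")            # expects 16 kHz, mono, 16-bit PCM
model = Model("model")                      # the unpacked vosk-model-kz-0.15 directory
rec = KaldiRecognizer(model, wf.getframerate())

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(json.loads(rec.Result())["text"])

print(json.loads(rec.FinalResult())["text"])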
Super simple usage
Install a single library:
pip install huggingsound
Start recognition
from huggingsound import SpeechRecognitionModel
model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-russian")
audio_paths = ["/path/to/file.mp3", "/path/to/another_file.wav"]
transcriptions = model.transcribe(audio_paths)
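transcribe() returns one result per input file; as far as I know each entry is a dict whose "transcription" field holds the recognized text (check the huggingsound docs for your version):

for path, result in zip(audio_paths, transcriptions):
    print(path, "->", result["transcription"])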
Install a few libraries:
pip install librosa
pip install datasets
pip install transformers
Start your recognition script
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
LANG_ID = "ru"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-russian"
SAMPLES = 5
test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["sentence"].upper()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

for i, predicted_sentence in enumerate(predicted_sentences):
    print("-" * 100)
    print("Reference:", test_dataset[i]["sentence"])
    print("Prediction:", predicted_sentence)