A small bug during audio pre-processing
joyolee opened this issue · 1 comments
joyolee commented
Hi,
I just found an error in the code that pre-processes the audio data
(lines 77 to 102 at commit 122ef0c).
Lines 84 to 102 have extra indentation (additional tabs). I corrected the function to the following:
def preprocess_mtedx_audio(mtedx_path, src_lang, muavic_path):
    """Run audio pre-processing for every mTEDx split of one language.

    For each of the "train", "valid" and "test" splits, the segment list at
    ``<mtedx_path>/<src_lang>-<src_lang>/data/<split>/txt/segments`` is read
    and each segment is passed to ``segment_normalize_audio_file`` through
    ``process_map``, with ``<muavic_path>/<src_lang>/audio/<split>`` bound as
    the output directory. The output directory is created if it is missing.

    Each line of the segment file must consist of exactly four fields
    separated by single spaces: segment id, file id, start and end. Start
    and end are converted to ``float``; their unit is not visible here.

    Args:
        mtedx_path: Root of the mTEDx data. Must support the ``/`` path
            operator (presumably a ``pathlib.Path`` -- confirm with caller).
        src_lang: Language code used to build the input and output paths.
        muavic_path: Root under which the processed audio is written. Must
            support ``/`` and ``mkdir(parents=..., exist_ok=...)``.

    Raises:
        ValueError: If a segment line does not split into exactly four
            fields, or if start/end cannot be parsed as floats.
    """
    # get files id per split
    for split in ["train", "valid", "test"]:
        # create directory for segmented & normalized audio
        out_path = muavic_path / src_lang / "audio" / split
        out_path.mkdir(parents=True, exist_ok=True)

        # NOTE(review): this guard only controls the progress message; the
        # segmentation below runs for every split regardless of it. If the
        # intent was to skip splits whose output already exists, the guard
        # should `continue` instead -- confirm what is_empty() returns and
        # what the author intended before changing it.
        if not is_empty(out_path):
            if split == "train":
                print(f"\nSegmenting {src_lang} audio files")

        # collect needed info from segment file
        segments_info = []
        split_dir_path = mtedx_path / f"{src_lang}-{src_lang}" / "data" / split
        wav_dir_path = split_dir_path / "wav"
        segment_file = split_dir_path / "txt" / "segments"
        for line in read_txt_file(segment_file):
            seg_id, fid, start, end = line.strip().split(' ')
            # Tuple layout: (source .flac path, file id, segment id, start, end)
            segments_info.append(
                (wav_dir_path / (fid + ".flac"), fid, seg_id, float(start), float(end))
            )

        # preprocess audio files
        process_map(
            partial(segment_normalize_audio_file, out_path),
            segments_info,
            max_workers=os.cpu_count(),
            desc=f"Preprocessing {src_lang}/{split} Audios",
            chunksize=1,
        )
With this change, the code works ;)