facebookresearch/muavic

A small bug during audio pre-processing

joyolee opened this issue · 1 comments

Hi,
I just found an error when pre-processing the audio data in

muavic/mtedx_utils.py

Lines 77 to 102 in 122ef0c

for split in ["train", "valid", "test"]:
# create directory for segmented & normalized audio
out_path = muavic_path / src_lang / "audio" / split
out_path.mkdir(parents=True, exist_ok=True)
if not is_empty(out_path):
if split == "train":
print(f"\nSegmenting {src_lang} audio files")
# collect needed info from segment file
segments_info = []
split_dir_path = mtedx_path / f"{src_lang}-{src_lang}" / "data" / split
wav_dir_path = split_dir_path / "wav"
segment_file = split_dir_path / "txt" / "segments"
for line in read_txt_file(segment_file):
seg_id, fid, start, end = line.strip().split(' ')
segments_info.append(
(wav_dir_path/(fid+".flac"), fid, seg_id, float(start), float(end))
)
# preprocess audio files
process_map(
partial(segment_normalize_audio_file, out_path),
segments_info,
max_workers=os.cpu_count(),
desc=f"Preprocessing {src_lang}/{split} Audios",
chunksize=1,
)

Lines 84 to 102 are indented with extra tabs. I corrected the indentation as follows:

def preprocess_mtedx_audio(mtedx_path, src_lang, muavic_path):
    """Segment and normalize the ``src_lang`` audio for every data split.

    For each of the ``train``, ``valid`` and ``test`` splits, the output
    directory ``muavic_path / src_lang / "audio" / split`` is created, the
    split's ``txt/segments`` file is parsed, and every listed segment is
    handed to ``segment_normalize_audio_file`` through ``process_map``.

    Args:
        mtedx_path: Root of the input data; joined with ``/``, so presumably
            a ``pathlib.Path`` (confirm against the caller).
        src_lang: Language code used to build the
            ``{src_lang}-{src_lang}/data/<split>`` input directory.
        muavic_path: Root under which the processed audio is written; must
            support ``/`` and ``mkdir`` (presumably a ``pathlib.Path``).

    Raises:
        ValueError: If a non-blank segments line does not contain exactly
            four whitespace-separated fields, or if its start/end fields
            cannot be converted to ``float``.
    """
    for split in ["train", "valid", "test"]:
        # create directory for segmented & normalized audio
        out_path = muavic_path / src_lang / "audio" / split
        out_path.mkdir(parents=True, exist_ok=True)
        # NOTE(review): the message is printed only when the output directory
        # already has content, and segmentation below runs regardless --
        # confirm whether already-populated splits should be skipped instead.
        if not is_empty(out_path) and split == "train":
            print(f"\nSegmenting {src_lang} audio files")

        # collect needed info from segment file
        split_dir_path = mtedx_path / f"{src_lang}-{src_lang}" / "data" / split
        wav_dir_path = split_dir_path / "wav"
        segment_file = split_dir_path / "txt" / "segments"

        segments_info = []
        for line in read_txt_file(segment_file):
            # split() with no separator tolerates tabs and repeated spaces,
            # which split(' ') would turn into extra empty fields.
            fields = line.split()
            if not fields:
                # a blank line carries no segment; skip it rather than crash
                continue
            seg_id, fid, start, end = fields
            segments_info.append(
                (wav_dir_path / (fid + ".flac"), fid, seg_id, float(start), float(end))
            )

        # preprocess audio files
        process_map(
            partial(segment_normalize_audio_file, out_path),
            segments_info,
            max_workers=os.cpu_count(),
            desc=f"Preprocessing {src_lang}/{split} Audios",
            chunksize=1,
        )

With this change, the code works ;)

Hi again @joyolee,

This bug has been resolved in our newer code version. All you need to do is to update the source code using:

git pull

Feel free to update this issue if the problem persists. Thanks!