A small bug during audio pre-processing

Hi,
I just found an error when pre-processing the audio data in

Lines 77 to 102 in 122ef0c

    
           for split in ["train", "valid", "test"]: 
        
               # create directory for segmented & normalized audio 
        
               out_path = muavic_path / src_lang / "audio" / split 
        
               out_path.mkdir(parents=True, exist_ok=True) 
        
               if not is_empty(out_path): 
        
                   if split == "train": 
        
                       print(f"\nSegmenting {src_lang} audio files") 
        
                   # collect needed info from segment file 
        
                   segments_info = [] 
        
                   split_dir_path = mtedx_path / f"{src_lang}-{src_lang}" / "data" / split 
        
                   wav_dir_path = split_dir_path / "wav" 
        
                   segment_file = split_dir_path / "txt" / "segments" 
        
                   for line in read_txt_file(segment_file): 
        
                       seg_id, fid, start, end = line.strip().split(' ') 
        
                       segments_info.append( 
        
                           (wav_dir_path/(fid+".flac"), fid, seg_id, float(start), float(end)) 
        
                       ) 
        
                   # preprocess audio files 
        
                   process_map( 
        
                       partial(segment_normalize_audio_file, out_path), 
        
                       segments_info, 
        
                       max_workers=os.cpu_count(), 
        
                       desc=f"Preprocessing {src_lang}/{split} Audios", 
        
                       chunksize=1, 
        
                   )

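Lines 84 to 102 have one extra level of indentation (additional tabs), which nests the segment collection and the audio preprocessing inside the `if not is_empty(out_path):` block. I corrected it to the following: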

def preprocess_mtedx_audio(mtedx_path, src_lang, muavic_path):
    """Segment and normalize the ``src_lang`` audio for each data split.

    For every split in ("train", "valid", "test") this creates
    ``muavic_path / src_lang / "audio" / split`` if needed, reads the
    split's ``txt/segments`` listing found under
    ``mtedx_path / f"{src_lang}-{src_lang}" / "data" / split`` and runs
    ``segment_normalize_audio_file(out_path, segment)`` over every listed
    segment through ``process_map``.

    Each ``segment`` is the tuple ``(flac_path, fid, seg_id, start, end)``
    where ``start`` and ``end`` have been converted to ``float``.
    """
    for split in ("train", "valid", "test"):
        # Output directory for the segmented & normalized audio of this split.
        out_path = muavic_path / src_lang / "audio" / split
        out_path.mkdir(parents=True, exist_ok=True)

        # Banner is printed only on the "train" split, and only when
        # is_empty(out_path) is falsy.
        if not is_empty(out_path) and split == "train":
            print(f"\nSegmenting {src_lang} audio files")

        # Locate this split's source audio and its segment listing.
        split_dir_path = mtedx_path / f"{src_lang}-{src_lang}" / "data" / split
        wav_dir_path = split_dir_path / "wav"
        segment_file = split_dir_path / "txt" / "segments"

        # Every listing line carries exactly four single-space separated
        # fields: seg_id, fid, start, end (unpacking fails otherwise).
        fields_per_line = (
            raw_line.strip().split(' ')
            for raw_line in read_txt_file(segment_file)
        )
        segments_info = [
            (wav_dir_path / (fid + ".flac"), fid, seg_id, float(start), float(end))
            for seg_id, fid, start, end in fields_per_line
        ]

        # Fan the per-segment work out over one worker per reported CPU.
        worker = partial(segment_normalize_audio_file, out_path)
        process_map(
            worker,
            segments_info,
            max_workers=os.cpu_count(),
            desc=f"Preprocessing {src_lang}/{split} Audios",
            chunksize=1,
        )

With this change, the code works ;)

Hi again @joyolee,

This bug has been fixed in the latest version of the code. All you need to do is update your local copy of the source code using:

git pull

Feel free to update this issue if the problem persists. Thanks!

	for split in ["train", "valid", "test"]:
	# create directory for segmented & normalized audio
	out_path = muavic_path / src_lang / "audio" / split
	out_path.mkdir(parents=True, exist_ok=True)
	if not is_empty(out_path):
	if split == "train":
	print(f"\nSegmenting {src_lang} audio files")
	# collect needed info from segment file
	segments_info = []
	split_dir_path = mtedx_path / f"{src_lang}-{src_lang}" / "data" / split
	wav_dir_path = split_dir_path / "wav"
	segment_file = split_dir_path / "txt" / "segments"

	for line in read_txt_file(segment_file):
	seg_id, fid, start, end = line.strip().split(' ')
	segments_info.append(
	(wav_dir_path/(fid+".flac"), fid, seg_id, float(start), float(end))
	)
	# preprocess audio files
	process_map(
	partial(segment_normalize_audio_file, out_path),
	segments_info,
	max_workers=os.cpu_count(),
	desc=f"Preprocessing {src_lang}/{split} Audios",
	chunksize=1,
	)