AudioVisualLip
Audio-Visual Voice Biometrics is an audio-visual speaker recognition task that leverages both the auditory and the visual speech in a video. Portrait- and linguistic-based speaker characteristics are extracted via temporal dynamics modeling. It covers both the conventional speaker recognition task and the lip biometrics task.
Introduction
This is the official implementation of the ICASSP 2023 paper "Cross-Modal Audio-Visual Co-Learning for Text-Independent Speaker Verification".
Datasets
![](https://private-user-images.githubusercontent.com/45690014/241732958-d1f88a36-e874-49cd-a02c-25b98c423362.png?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3MDU1MTQzODUsIm5iZiI6MTcwNTUxNDA4NSwicGF0aCI6Ii80NTY5MDAxNC8yNDE3MzI5NTgtZDFmODhhMzYtZTg3NC00OWNkLWEwMmMtMjViOThjNDIzMzYyLnBuZz9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNDAxMTclMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjQwMTE3VDE3NTQ0NVomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPTlmNDlhNmYxNDJlZGIxN2QwODViOTQyNGFkZjM4Njc5NzAxMjQ4MzUxYTc3MTJhNmNlMzc5YWZmNjdkMjc3ODcmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JmFjdG9yX2lkPTAma2V5X2lkPTAmcmVwb19pZD0wIn0.BqvaVeYZVGTFS2Ty6Y0L7CrETO7LoDDC2WAOcT7Z6Os)
![](https://private-user-images.githubusercontent.com/45690014/241732992-1f2d5917-8cd8-4e57-b0c7-872f020f2bf5.png?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3MDU1MTQzODUsIm5iZiI6MTcwNTUxNDA4NSwicGF0aCI6Ii80NTY5MDAxNC8yNDE3MzI5OTItMWYyZDU5MTctOGNkOC00ZTU3LWIwYzctODcyZjAyMGYyYmY1LnBuZz9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNDAxMTclMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjQwMTE3VDE3NTQ0NVomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPTQyNzU2Nzc1OTFjNzFhZDk2YWJjNDNiYWZhOTMwZWEwZWNiZGViODg2Zjc4NTVmMjAzNmM5NmNmMGZlNzBlMDgmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JmFjdG9yX2lkPTAma2V5X2lkPTAmcmVwb19pZD0wIn0.aAKFswVlNb728lfiQkhZeunqp4S0CVISvlfzILRpKVk)
Results
![](https://private-user-images.githubusercontent.com/45690014/241733104-d70da2de-c2f8-417f-999d-2d9778ba719a.png?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3MDU1MTQzODUsIm5iZiI6MTcwNTUxNDA4NSwicGF0aCI6Ii80NTY5MDAxNC8yNDE3MzMxMDQtZDcwZGEyZGUtYzJmOC00MTdmLTk5OWQtMmQ5Nzc4YmE3MTlhLnBuZz9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNDAxMTclMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjQwMTE3VDE3NTQ0NVomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPWNhMDk0MjViNDdkZTE5MDg1Njc3ZTkxNzE5ZmJlZDBiOGFmOGMyNTk1NDdkMjEzOWE0NTk0NWM3OThmNmYzNGEmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JmFjdG9yX2lkPTAma2V5X2lkPTAmcmVwb19pZD0wIn0.58ElM95Q8QLBsJ5dynXCD74AjnmgAwwV2UBm3ip0m3k)
![](https://private-user-images.githubusercontent.com/45690014/241733117-f548da67-f55a-4af0-9ec4-8a85b7ceff73.png?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3MDU1MTQzODUsIm5iZiI6MTcwNTUxNDA4NSwicGF0aCI6Ii80NTY5MDAxNC8yNDE3MzMxMTctZjU0OGRhNjctZjU1YS00YWYwLTllYzQtOGE4NWI3Y2VmZjczLnBuZz9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNDAxMTclMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjQwMTE3VDE3NTQ0NVomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPWIzZTcxMjhjYTA4OWFmZjFhZGE2ZGFlM2UzZjA3NmE5ZDBkMDMwZDQ1NGE1OWEwYWIzOTM3OWRjNjBjYjgyODEmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JmFjdG9yX2lkPTAma2V5X2lkPTAmcmVwb19pZD0wIn0.EcOMBRdcF6g0gFl-1rmgUuzfdV4w4u40-fRvoBPeXdc)
Reference
AVLip:
@inproceedings{liu2023cross,
  author       = {Liu, Meng and Lee, Kong Aik and Wang, Longbiao and Zhang, Hanyi and Zeng, Chang and Dang, Jianwu},
  title        = {Cross-Modal Audio-Visual Co-Learning for Text-Independent Speaker Verification},
  booktitle    = {{ICASSP} 2023 -- 2023 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  pages        = {1--5},
  year         = {2023},
  organization = {IEEE},
}
DeepLip:
@inproceedings{liu2021deeplip,
  author       = {Liu, Meng and Wang, Longbiao and Lee, Kong Aik and Zhang, Hanyi and Zeng, Chang and Dang, Jianwu},
  title        = {{DeepLip}: A Benchmark for Deep Learning-Based Audio-Visual Lip Biometrics},
  booktitle    = {2021 {IEEE} Automatic Speech Recognition and Understanding Workshop ({ASRU})},
  pages        = {122--129},
  year         = {2021},
  organization = {IEEE},
}