Request to add new paper
Opened this issue · 1 comment
ZZZHANG-jx commented
Hello! Thanks for compiling so many great methods into this very helpful resource. Our new paper (accepted at AAAI 2025) presents a multimodal model for document image understanding. Would you mind adding it to your resource? Thanks!
Title: DocKylin: A Large Multimodal Model for Visual Document Understanding with Efficient Visual Slimming
Paper: https://arxiv.org/abs/2406.19101
Code: https://github.com/ZZZHANG-jx/DocKylin
We will cite your work in our camera-ready version.
xjtupanda commented
Congrats on the acceptance! Your work has been added to the repo.
Please consider citing:
@article{yin2024survey,
title={A Survey on Multimodal Large Language Models},
author={Yin, Shukang and Fu, Chaoyou and Zhao, Sirui and Li, Ke and Sun, Xing and Xu, Tong and Chen, Enhong},
journal={National Science Review},
pages={nwae403},
year={2024},
publisher={Oxford University Press}
}
@article{yin2024t2vid,
title={T2Vid: Translating Long Text into Multi-Image is the Catalyst for Video-LLMs},
author={Yin, Shukang and Fu, Chaoyou and Zhao, Sirui and Shen, Yunhang and Ge, Chunjiang and Yang, Yan and Long, Zuwei and Dai, Yuhan and Xu, Tong and Sun, Xing and others},
journal={arXiv preprint arXiv:2411.19951},
year={2024}
}
@article{fu2024mme,
title={MME-Survey: A Comprehensive Survey on Evaluation of Multimodal LLMs},
author={Fu, Chaoyou and Zhang, Yi-Fan and Yin, Shukang and Li, Bo and Fang, Xinyu and Zhao, Sirui and Duan, Haodong and Sun, Xing and Liu, Ziwei and Wang, Liang and others},
journal={arXiv preprint arXiv:2411.15296},
year={2024}
}
@article{fu2024vita,
title={VITA: Towards Open-Source Interactive Omni Multimodal LLM},
author={Fu, Chaoyou and Lin, Haojia and Long, Zuwei and Shen, Yunhang and Zhao, Meng and Zhang, Yifan and Wang, Xiong and Yin, Di and Ma, Long and Zheng, Xiawu and others},
journal={arXiv preprint arXiv:2408.05211},
year={2024}
}
@article{fu2023mme,
title={MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models},
author={Fu, Chaoyou and Chen, Peixian and Shen, Yunhang and Qin, Yulei and Zhang, Mengdan and Lin, Xu and Yang, Jinrui and Zheng, Xiawu and Li, Ke and Sun, Xing and others},
journal={arXiv preprint arXiv:2306.13394},
year={2023}
}
@article{fu2024video,
title={Video-MME: The First-Ever Comprehensive Evaluation Benchmark of Multi-modal LLMs in Video Analysis},
author={Fu, Chaoyou and Dai, Yuhan and Luo, Yongdong and Li, Lei and Ren, Shuhuai and Zhang, Renrui and Wang, Zihan and Zhou, Chenyu and Shen, Yunhang and Zhang, Mengdan and others},
journal={arXiv preprint arXiv:2405.21075},
year={2024}
}
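If it helps, here is a minimal sketch of how the entries above could be cited in a LaTeX project, assuming they are saved in a file named refs.bib (that filename is just an example):

% minimal.tex -- cites a few of the entries above; assumes the BibTeX is stored in refs.bib
\documentclass{article}
\begin{document}
Multimodal large language models are surveyed in~\cite{yin2024survey} and
evaluated on benchmarks such as MME~\cite{fu2023mme} and Video-MME~\cite{fu2024video}.
\bibliographystyle{plain}
\bibliography{refs}
\end{document}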