% @misc entry with no venue fields. The title is stored without a trailing
% period: bibliography styles supply their own terminal punctuation, so a
% hard-coded one renders as a doubled period.
@misc{trajpilot2026,
  title  = {How You Move Tells What You'll Do: Trajectory-Conditioned Egocentric Prediction},
  author = {Jun, Sejoon and Nguyen-Truong, Hai and Seminara, Luigi and Torresani, Lorenzo},
  year   = {2026},
}
RECIPE: Procedural Planning via Grounding in Instructional Video
Luigi Seminara, Antonino Furnari, and Lorenzo Torresani
% @misc entry with no venue fields. The acronym in the title is brace-protected
% so sentence-casing styles do not lowercase it.
@misc{recipe2026,
  title  = {{RECIPE}: Procedural Planning via Grounding in Instructional Video},
  author = {Seminara, Luigi and Furnari, Antonino and Torresani, Lorenzo},
  year   = {2026},
}
EvoGround: Self-Evolving Video Agents for Video Temporal Grounding
Minjoon Jung, Byoung-Tak Zhang, and Lorenzo Torresani
% @misc entry carrying arXiv classification fields. The camelCase system name
% in the title is brace-protected against sentence-casing styles.
% NOTE(review): no `eprint` identifier is present, so archiveprefix and
% primaryclass have nothing to attach to -- TODO add the arXiv id.
@misc{evoground2026,
  title         = {{EvoGround}: Self-Evolving Video Agents for Video Temporal Grounding},
  author        = {Jung, Minjoon and Zhang, Byoung-Tak and Torresani, Lorenzo},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CV},
  year          = {2026},
}
Publications
2025
PerceptionLM: Open-Access Data and Models for Detailed Visual Understanding
J. H. Cho, A. Madotto, E. Mavroudi, T. Afouras, T. Nagarajan, M. Maaz, Y. Song, and others
In Advances in Neural Information Processing Systems, Spotlight (<3.5%), 2025
% Conference paper; the author list is truncated with the literal `and others`
% token, which the style renders as "et al.". The camelCase model name in the
% title is brace-protected against sentence-casing styles.
@inproceedings{perceptionlm2025,
  title     = {{PerceptionLM}: Open-Access Data and Models for Detailed Visual Understanding},
  author    = {Cho, J. H. and Madotto, A. and Mavroudi, E. and Afouras, T. and Nagarajan, T. and Maaz, M. and Song, Y. and others},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2025},
}
BIMBA: Selective-Scan Compression for Long-Range Video Question Answering
M. M. Islam, T. Nagarajan, H. Wang, G. Bertasius, and Lorenzo Torresani
In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2025
% Conference paper. The acronym in the title is brace-protected so
% sentence-casing styles do not lowercase it.
@inproceedings{bimba2025,
  title     = {{BIMBA}: Selective-Scan Compression for Long-Range Video Question Answering},
  author    = {Islam, M. M. and Nagarajan, T. and Wang, H. and Bertasius, G. and Torresani, Lorenzo},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2025},
}
Enrich and Detect: Video Temporal Grounding with Multimodal LLMs
S. Pramanick, E. Mavroudi, Y. Song, R. Chellappa, Lorenzo Torresani, and T. Afouras
In IEEE/CVF International Conference on Computer Vision (ICCV), Highlight (<2.5%), 2025
% Conference paper. "LLMs" is brace-protected so sentence-casing styles do not
% turn it into "llms".
@inproceedings{enrichdetect2025,
  title     = {Enrich and Detect: Video Temporal Grounding with Multimodal {LLMs}},
  author    = {Pramanick, S. and Mavroudi, E. and Song, Y. and Chellappa, R. and Torresani, Lorenzo and Afouras, T.},
  booktitle = {IEEE/CVF International Conference on Computer Vision (ICCV)},
  year      = {2025},
}
VITED: Video Temporal Evidence Distillation
Y. Lu, Y. Song, W. Wang, Lorenzo Torresani, and T. Nagarajan
In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2025
% Conference paper. The acronym in the title is brace-protected so
% sentence-casing styles do not lowercase it.
@inproceedings{vited2025,
  title     = {{VITED}: Video Temporal Evidence Distillation},
  author    = {Lu, Y. and Song, Y. and Wang, W. and Torresani, Lorenzo and Nagarajan, T.},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2025},
}
2024
UNICORN: A Unified Causal Video-Oriented Language-Modeling Framework for Temporal Video-Language Tasks
Y. Xiong, Y. Nie, H. Liu, B. Wang, J. Chen, R. Jin, C.-J. Hsieh, Lorenzo Torresani, and J. Lei
In Conference on Empirical Methods in Natural Language Processing (EMNLP), 2024
% Conference paper. The acronym in the title is brace-protected so
% sentence-casing styles do not lowercase it.
@inproceedings{unicorn2024,
  title     = {{UNICORN}: A Unified Causal Video-Oriented Language-Modeling Framework for Temporal Video-Language Tasks},
  author    = {Xiong, Y. and Nie, Y. and Liu, H. and Wang, B. and Chen, J. and Jin, R. and Hsieh, C.-J. and Torresani, Lorenzo and Lei, J.},
  booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year      = {2024},
}
4Diff: 3D-Aware Diffusion Model for Third-to-First Viewpoint Translation
F. Cheng, M. Luo, H. Wang, A. Dimakis, Lorenzo Torresani, G. Bertasius, and others
In European Conference on Computer Vision (ECCV), 2024
% Conference paper; the author list is truncated with the literal `and others`
% token. "4Diff" and "3D" are brace-protected so sentence-casing styles do not
% produce "4diff" / "3d-aware".
@inproceedings{fourdiff2024,
  title     = {{4Diff}: {3D}-Aware Diffusion Model for Third-to-First Viewpoint Translation},
  author    = {Cheng, F. and Luo, M. and Wang, H. and Dimakis, A. and Torresani, Lorenzo and Bertasius, G. and others},
  booktitle = {European Conference on Computer Vision (ECCV)},
  year      = {2024},
}
Video ReCap: Recursive Captioning of Hour-Long Videos
M. M. Islam, N. Ho, X. Yang, T. Nagarajan, Lorenzo Torresani, and G. Bertasius
In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024
% Conference paper. The mixed-case name "ReCap" is brace-protected so
% sentence-casing styles do not flatten it to "recap".
@inproceedings{videorecap2024,
  title     = {Video {ReCap}: Recursive Captioning of Hour-Long Videos},
  author    = {Islam, M. M. and Ho, N. and Yang, X. and Nagarajan, T. and Torresani, Lorenzo and Bertasius, G.},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2024},
}
Learning to Segment Referred Objects from Narrated Egocentric Videos
Y. Shen, H. Wang, X. Yang, M. Feiszli, E. Elhamifar, Lorenzo Torresani, and E. Mavroudi
In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024
% Conference paper; same fields and values as before, laid out one per line.
@inproceedings{refseg2024,
  author    = {Shen, Y. and Wang, H. and Yang, X. and Feiszli, M. and Elhamifar, E. and Torresani, Lorenzo and Mavroudi, E.},
  title     = {Learning to Segment Referred Objects from Narrated Egocentric Videos},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2024},
}
Step Differences in Instructional Video
T. Nagarajan and Lorenzo Torresani
In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024
% Conference paper; same fields and values as before, laid out one per line.
@inproceedings{stepdifferences2024,
  author    = {Nagarajan, T. and Torresani, Lorenzo},
  title     = {Step Differences in Instructional Video},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2024},
}
Ego-Exo4D: Understanding Skilled Human Activity from First- and Third-Person Perspectives
K. Grauman, A. Westbury, Lorenzo Torresani, K. Kitani, J. Malik, T. Afouras, and others
In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024
% Conference paper; the author list is truncated with the literal `and others`
% token. The dataset name "Ego-Exo4D" is brace-protected so sentence-casing
% styles do not lowercase it.
@inproceedings{egoexo4d2024,
  title     = {{Ego-Exo4D}: Understanding Skilled Human Activity from First- and Third-Person Perspectives},
  author    = {Grauman, K. and Westbury, A. and Torresani, Lorenzo and Kitani, K. and Malik, J. and Afouras, T. and others},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2024},
}