Publications

Preprints

2026

How You Move Tells What You’ll Do: Trajectory-Conditioned Egocentric Prediction

Sejoon Jun, Hai Nguyen-Truong, Luigi Seminara, and Lorenzo Torresani

2026

@misc{trajpilot2026,
  title = {How You Move Tells What You'll Do: Trajectory-Conditioned Egocentric Prediction},
  author = {Jun, Sejoon and Nguyen-Truong, Hai and Seminara, Luigi and Torresani, Lorenzo},
  year = {2026},
}

RECIPE: Procedural Planning via Grounding in Instructional Video

Luigi Seminara, Antonino Furnari, and Lorenzo Torresani

2026

Bib Website

EvoGround: Self-Evolving Video Agents for Video Temporal Grounding

Minjoon Jung, Byoung-Tak Zhang, and Lorenzo Torresani

2026

Bib PDF Website

% NOTE(review): archiveprefix/primaryclass only take effect together with an
% eprint field holding the arXiv identifier; none is recorded here, so add it once known.
@misc{evoground2026,
  title = {{EvoGround}: Self-Evolving Video Agents for Video Temporal Grounding},
  author = {Jung, Minjoon and Zhang, Byoung-Tak and Torresani, Lorenzo},
  archiveprefix = {arXiv},
  primaryclass = {cs.CV},
  year = {2026},
}

2025

PerceptionLM: Open-Access Data and Models for Detailed Visual Understanding

J. H. Cho, A. Madotto, E. Mavroudi, T. Afouras, T. Nagarajan, M. Maaz, Y. Song, and others

In Advances in Neural Information Processing Systems, Spotlight (<3.5%), 2025

Bib PDF

@inproceedings{perceptionlm2025,
  title = {{PerceptionLM}: Open-Access Data and Models for Detailed Visual Understanding},
  author = {Cho, J. H. and Madotto, A. and Mavroudi, E. and Afouras, T. and Nagarajan, T. and Maaz, M. and Song, Y. and others},
  booktitle = {Advances in Neural Information Processing Systems},
  year = {2025},
}

BIMBA: Selective-Scan Compression for Long-Range Video Question Answering

M. M. Islam, T. Nagarajan, H. Wang, G. Bertasius, and Lorenzo Torresani

In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2025

Awarded Bib PDF

EgoSchema Challenge — 1st place

@inproceedings{bimba2025,
  title = {{BIMBA}: Selective-Scan Compression for Long-Range Video Question Answering},
  author = {Islam, M. M. and Nagarajan, T. and Wang, H. and Bertasius, G. and Torresani, Lorenzo},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2025},
}

Enrich and Detect: Video Temporal Grounding with Multimodal LLMs

S. Pramanick, E. Mavroudi, Y. Song, R. Chellappa, Lorenzo Torresani, and T. Afouras

In IEEE/CVF International Conference on Computer Vision (ICCV), Highlight (<2.5%), 2025

Bib PDF

@inproceedings{enrichdetect2025,
  title = {Enrich and Detect: Video Temporal Grounding with Multimodal {LLMs}},
  author = {Pramanick, S. and Mavroudi, E. and Song, Y. and Chellappa, R. and Torresani, Lorenzo and Afouras, T.},
  booktitle = {IEEE/CVF International Conference on Computer Vision (ICCV)},
  year = {2025},
}

VITED: Video Temporal Evidence Distillation

Y. Lu, Y. Song, W. Wang, Lorenzo Torresani, and T. Nagarajan

In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2025

Bib PDF

@inproceedings{vited2025,
  title = {{VITED}: Video Temporal Evidence Distillation},
  author = {Lu, Y. and Song, Y. and Wang, W. and Torresani, Lorenzo and Nagarajan, T.},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2025},
}

2024

UNICORN: A Unified Causal Video-Oriented Language-Modeling Framework for Temporal Video-Language Tasks

Y. Xiong, Y. Nie, H. Liu, B. Wang, J. Chen, R. Jin, C.-J. Hsieh, Lorenzo Torresani, and J. Lei

In Conference on Empirical Methods in Natural Language Processing (EMNLP), 2024

Bib PDF

@inproceedings{unicorn2024,
  title = {{UNICORN}: A Unified Causal Video-Oriented Language-Modeling Framework for Temporal Video-Language Tasks},
  author = {Xiong, Y. and Nie, Y. and Liu, H. and Wang, B. and Chen, J. and Jin, R. and Hsieh, C.-J. and Torresani, Lorenzo and Lei, J.},
  booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year = {2024},
}

4Diff: 3D-Aware Diffusion Model for Third-to-First Viewpoint Translation

F. Cheng, M. Luo, H. Wang, A. Dimakis, Lorenzo Torresani, G. Bertasius, and others

In European Conference on Computer Vision (ECCV), 2024

Bib PDF

@inproceedings{fourdiff2024,
  title = {{4Diff}: {3D}-Aware Diffusion Model for Third-to-First Viewpoint Translation},
  author = {Cheng, F. and Luo, M. and Wang, H. and Dimakis, A. and Torresani, Lorenzo and Bertasius, G. and others},
  booktitle = {European Conference on Computer Vision (ECCV)},
  year = {2024},
}

Video ReCap: Recursive Captioning of Hour-Long Videos

M. M. Islam, N. Ho, X. Yang, T. Nagarajan, Lorenzo Torresani, and G. Bertasius

In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024

Awarded Bib PDF

CVPR 2025 EgoVis Distinguished Paper

@inproceedings{videorecap2024,
  title = {Video {ReCap}: Recursive Captioning of Hour-Long Videos},
  author = {Islam, M. M. and Ho, N. and Yang, X. and Nagarajan, T. and Torresani, Lorenzo and Bertasius, G.},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2024},
}

Learning to Segment Referred Objects from Narrated Egocentric Videos

Y. Shen, H. Wang, X. Yang, M. Feiszli, E. Elhamifar, Lorenzo Torresani, and E. Mavroudi

In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024

Bib PDF

@inproceedings{refseg2024,
  author    = {Shen, Y. and Wang, H. and Yang, X. and Feiszli, M. and Elhamifar, E. and Torresani, Lorenzo and Mavroudi, E.},
  title     = {Learning to Segment Referred Objects from Narrated Egocentric Videos},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2024},
}

Step Differences in Instructional Video

T. Nagarajan and Lorenzo Torresani

In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024

Bib PDF

@inproceedings{stepdifferences2024,
  author    = {Nagarajan, T. and Torresani, Lorenzo},
  title     = {Step Differences in Instructional Video},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2024},
}

Ego-Exo4D: Understanding Skilled Human Activity from First- and Third-Person Perspectives

K. Grauman, A. Westbury, Lorenzo Torresani, K. Kitani, J. Malik, T. Afouras, and others

In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024

Bib PDF

@inproceedings{egoexo4d2024,
  title = {{Ego-Exo4D}: Understanding Skilled Human Activity from First- and Third-Person Perspectives},
  author = {Grauman, K. and Westbury, A. and Torresani, Lorenzo and Kitani, K. and Malik, J. and Afouras, T. and others},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2024},
}