{ "id": "2304.06708", "version": "v1", "published": "2023-04-13T17:57:01.000Z", "updated": "2023-04-13T17:57:01.000Z", "title": "Verbs in Action: Improving verb understanding in video-language models", "authors": [ "Liliane Momeni", "Mathilde Caron", "Arsha Nagrani", "Andrew Zisserman", "Cordelia Schmid" ], "categories": [ "cs.CV", "cs.AI", "cs.CL" ], "abstract": "Understanding verbs is crucial to modelling how people and objects interact with each other and the environment through space and time. Recently, state-of-the-art video-language models based on CLIP have been shown to have limited verb understanding and to rely extensively on nouns, restricting their performance in real-world video applications that require action and temporal understanding. In this work, we improve verb understanding for CLIP-based video-language models by proposing a new Verb-Focused Contrastive (VFC) framework. This consists of two main components: (1) leveraging pretrained large language models (LLMs) to create hard negatives for cross-modal contrastive learning, together with a calibration strategy to balance the occurrence of concepts in positive and negative pairs; and (2) enforcing a fine-grained, verb phrase alignment loss. Our method achieves state-of-the-art results for zero-shot performance on three downstream tasks that focus on verb understanding: video-text matching, video question-answering and video classification. To the best of our knowledge, this is the first work which proposes a method to alleviate the verb understanding problem, and does not simply highlight it.", "revisions": [ { "version": "v1", "updated": "2023-04-13T17:57:01.000Z" } ], "analyses": { "keywords": [ "improving verb understanding", "method achieves state-of-the-art results", "verb phrase alignment loss", "real-world video applications", "leveraging pretrained large language models" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }