{ "id": "2205.06160", "version": "v1", "published": "2022-05-12T15:34:37.000Z", "updated": "2022-05-12T15:34:37.000Z", "title": "Localized Vision-Language Matching for Open-vocabulary Object Detection", "authors": [ "Maria A. Bravo", "Sudhanshu Mittal", "Thomas Brox" ], "categories": [ "cs.CV", "cs.LG" ], "abstract": "In this work, we propose an open-world object detection method that, based on image-caption pairs, learns to detect novel object classes along with a given set of known classes. It is a two-stage training approach that first uses a location-guided image-caption matching technique to learn class labels for both novel and known classes in a weakly-supervised manner and second specializes the model for the object detection task using known class annotations. We show that a simple language model fits better than a large contextualized language model for detecting novel objects. Moreover, we introduce a consistency-regularization technique to better exploit image-caption pair information. Our method compares favorably to existing open-world detection approaches while being data-efficient.", "revisions": [ { "version": "v1", "updated": "2022-05-12T15:34:37.000Z" } ], "analyses": { "keywords": [ "open-vocabulary object detection", "localized vision-language matching", "better exploit image-caption pair information", "simple language model fits better", "detect novel object classes" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }