{ "id": "2302.01328", "version": "v1", "published": "2023-02-02T18:58:05.000Z", "updated": "2023-02-02T18:58:05.000Z", "title": "$IC^3$: Image Captioning by Committee Consensus", "authors": [ "David M. Chan", "Austin Myers", "Sudheendra Vijayanarasimhan", "David A. Ross", "John Canny" ], "categories": [ "cs.CV", "cs.AI", "cs.CL", "cs.LG" ], "abstract": "If you ask a human to describe an image, they might do so in a thousand different ways. Traditionally, image captioning models are trained to approximate the reference distribution of image captions, however, doing so encourages captions that are viewpoint-impoverished. Such captions often focus on only a subset of the possible details, while ignoring potentially useful information in the scene. In this work, we introduce a simple, yet novel, method: \"Image Captioning by Committee Consensus\" ($IC^3$), designed to generate a single caption that captures high-level details from several viewpoints. Notably, humans rate captions produced by $IC^3$ at least as helpful as baseline SOTA models more than two thirds of the time, and $IC^3$ captions can improve the performance of SOTA automated recall systems by up to 84%, indicating significant material improvements over existing SOTA approaches for visual description. Our code is publicly available at https://github.com/DavidMChan/caption-by-committee", "revisions": [ { "version": "v1", "updated": "2023-02-02T18:58:05.000Z" } ], "analyses": { "keywords": [ "committee consensus", "image captioning", "indicating significant material improvements", "baseline sota models", "humans rate captions" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }