{ "id": "2311.10093", "version": "v1", "published": "2023-11-16T18:59:51.000Z", "updated": "2023-11-16T18:59:51.000Z", "title": "The Chosen One: Consistent Characters in Text-to-Image Diffusion Models", "authors": [ "Omri Avrahami", "Amir Hertz", "Yael Vinker", "Moab Arar", "Shlomi Fruchter", "Ohad Fried", "Daniel Cohen-Or", "Dani Lischinski" ], "comment": "Project page is available at https://omriavrahami.com/the-chosen-one", "categories": [ "cs.CV", "cs.GR", "cs.LG" ], "abstract": "Recent advances in text-to-image generation models have unlocked vast potential for visual creativity. However, these models struggle with generation of consistent characters, a crucial aspect for numerous real-world applications such as story visualization, game development asset design, advertising, and more. Current methods typically rely on multiple pre-existing images of the target character or involve labor-intensive manual processes. In this work, we propose a fully automated solution for consistent character generation, with the sole input being a text prompt. We introduce an iterative procedure that, at each stage, identifies a coherent set of images sharing a similar identity and extracts a more consistent identity from this set. Our quantitative analysis demonstrates that our method strikes a better balance between prompt alignment and identity consistency compared to the baseline methods, and these findings are reinforced by a user study. To conclude, we showcase several practical applications of our approach. Project page is available at https://omriavrahami.com/the-chosen-one", "revisions": [ { "version": "v1", "updated": "2023-11-16T18:59:51.000Z" } ], "analyses": { "keywords": [ "text-to-image diffusion models", "game development asset design", "consistent character generation", "text-to-image generation models", "prompt alignment" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }