{ "id": "2212.07699", "version": "v1", "published": "2022-12-15T10:20:42.000Z", "updated": "2022-12-15T10:20:42.000Z", "title": "Retrieval-based Disentanglement with Distant Supervision", "authors": [ "Jiawei Zhou", "Xiaoguang Li", "Lifeng Shang", "Xin Jiang", "Qun Liu", "Lei Chen" ], "categories": [ "cs.CL", "cs.AI", "cs.CV" ], "abstract": "Disentangled representation learning remains challenging as ground truth factors of variation do not naturally exist. To address this, we present Vocabulary Disentanglement Retrieval~(VDR), a simple yet effective retrieval-based disentanglement framework that leverages natural language as distant supervision. Our approach is built upon the widely-used bi-encoder architecture with disentanglement heads and is trained on data-text pairs that are readily available on the web or in existing datasets. This makes our approach task- and modality-agnostic with potential for a wide range of downstream applications. We conduct experiments on 16 datasets in both text-to-text and cross-modal scenarios and evaluate VDR in a zero-shot setting. With the incorporation of disentanglement heads and a minor increase in parameters, VDR achieves significant improvements over the base retriever it is built upon, with 9% higher NDCG@10 scores in zero-shot text-to-text retrieval and an average of 13% higher recall in cross-modal retrieval. In comparison to other baselines, VDR outperforms them in most tasks, while also improving explainability and efficiency.", "revisions": [ { "version": "v1", "updated": "2022-12-15T10:20:42.000Z" } ], "analyses": { "keywords": [ "distant supervision", "representation learning remains challenging", "disentanglement heads", "vdr achieves significant improvements", "ground truth factors" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }