{ "id": "2206.10139", "version": "v1", "published": "2022-06-21T06:50:38.000Z", "updated": "2022-06-21T06:50:38.000Z", "title": "Insights into Pre-training via Simpler Synthetic Tasks", "authors": [ "Yuhuai Wu", "Felix Li", "Percy Liang" ], "comment": "30 pages", "categories": [ "cs.LG", "cs.AI" ], "abstract": "Pre-training produces representations that are effective for a wide range of downstream tasks, but it is still unclear what properties of pre-training are necessary for effective gains. Notably, recent work shows that even pre-training on synthetic tasks can achieve significant gains in downstream tasks. In this work, we perform three experiments that iteratively simplify pre-training and show that the simplifications still retain much of its gains. First, building on prior work, we perform a systematic evaluation of three existing synthetic pre-training methods on six downstream tasks. We find the best synthetic pre-training method, LIME, attains an average of $67\\%$ of the benefits of natural pre-training. Second, to our surprise, we find that pre-training on a simple and generic synthetic task defined by the Set function achieves $65\\%$ of the benefits, almost matching LIME. Third, we find that $39\\%$ of the benefits can be attained by using merely the parameter statistics of synthetic pre-training. We release the source code at https://github.com/felixzli/synthetic_pretraining.", "revisions": [ { "version": "v1", "updated": "2022-06-21T06:50:38.000Z" } ], "analyses": { "keywords": [ "simpler synthetic tasks", "downstream tasks", "achieve significant gains", "best synthetic pre-training method", "generic synthetic task" ], "note": { "typesetting": "TeX", "pages": 30, "language": "en", "license": "arXiv", "status": "editable" } } }