{ "id": "2206.10139", "version": "v1", "published": "2022-06-21T06:50:38.000Z", "updated": "2022-06-21T06:50:38.000Z", "title": "Insights into Pre-training via Simpler Synthetic Tasks", "authors": [ "Yuhuai Wu", "Felix Li", "Percy Liang" ], "comment": "30 pages", "categories": [ "cs.LG", "cs.AI" ], "abstract": "Pre-training produces representations that are effective for a wide range of downstream tasks, but it is still unclear what properties of pre-training are necessary for effective gains. Notably, recent work shows that even pre-training on synthetic tasks can achieve significant gains in downstream tasks. In this work, we perform three experiments that iteratively simplify pre-training and show that the simplifications still retain much of its gains. First, building on prior work, we perform a systematic evaluation of three existing synthetic pre-training methods on six downstream tasks. We find the best synthetic pre-training method, LIME, attains an average of $67\\%$ of the benefits of natural pre-training. Second, to our surprise, we find that pre-training on a simple and generic synthetic task defined by the Set function achieves $65\\%$ of the benefits, almost matching LIME. Third, we find that $39\\%$ of the benefits can be attained by using merely the parameter statistics of synthetic pre-training. We release the source code at https://github.com/felixzli/synthetic_pretraining.", "revisions": [ { "version": "v1", "updated": "2022-06-21T06:50:38.000Z" } ], "analyses": { "keywords": [ "simpler synthetic tasks", "downstream tasks", "achieve significant gains", "best synthetic pre-training method", "generic synthetic task" ], "note": { "typesetting": "TeX", "pages": 30, "language": "en", "license": "arXiv", "status": "editable" } } }