{ "id": "2006.04779", "version": "v1", "published": "2020-06-08T17:53:42.000Z", "updated": "2020-06-08T17:53:42.000Z", "title": "Conservative Q-Learning for Offline Reinforcement Learning", "authors": [ "Aviral Kumar", "Aurick Zhou", "George Tucker", "Sergey Levine" ], "comment": "Preprint. Website at: https://sites.google.com/view/cql-offline-rl", "categories": [ "cs.LG", "stat.ML" ], "abstract": "Effectively leveraging large, previously collected datasets in reinforcement learning (RL) is a key challenge for large-scale real-world applications. Offline RL algorithms promise to learn effective policies from previously-collected, static datasets without further interaction. However, in practice, offline RL presents a major challenge, and standard off-policy RL methods can fail due to overestimation of values induced by the distributional shift between the dataset and the learned policy, especially when training on complex and multi-modal data distributions. In this paper, we propose conservative Q-learning (CQL), which aims to address these limitations by learning a conservative Q-function such that the expected value of a policy under this Q-function lower-bounds its true value. We theoretically show that CQL produces a lower bound on the value of the current policy and that it can be incorporated into a principled policy improvement procedure. In practice, CQL augments the standard Bellman error objective with a simple Q-value regularizer which is straightforward to implement on top of existing deep Q-learning and actor-critic implementations. On both discrete and continuous control domains, we show that CQL substantially outperforms existing offline RL methods, often learning policies that attain 2-5 times higher final return, especially when learning from complex and multi-modal data distributions.", "revisions": [ { "version": "v1", "updated": "2020-06-08T17:53:42.000Z" } ], "analyses": { "keywords": [ "offline reinforcement learning", "existing offline rl methods", "outperforms existing offline rl", "conservative q-learning", "multi-modal data distributions" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }