{
  "id": "2210.08323",
  "version": "v1",
  "published": "2022-10-15T15:54:28.000Z",
  "updated": "2022-10-15T15:54:28.000Z",
  "title": "A Policy-Guided Imitation Approach for Offline Reinforcement Learning",
  "authors": [
    "Haoran Xu",
    "Li Jiang",
    "Jianxiong Li",
    "Xianyuan Zhan"
  ],
  "comment": "NeurIPS 2022, code at https://github.com/ryanxhr/POR",
  "categories": [
    "cs.LG",
    "cs.AI"
  ],
  "abstract": "Offline reinforcement learning (RL) methods can generally be categorized into two types: RL-based and Imitation-based. RL-based methods could in principle enjoy out-of-distribution generalization but suffer from erroneous off-policy evaluation. Imitation-based methods avoid off-policy evaluation but are too conservative to surpass the dataset. In this study, we propose an alternative approach, inheriting the training stability of imitation-style methods while still allowing logical out-of-distribution generalization. We decompose the conventional reward-maximizing policy in offline RL into a guide-policy and an execute-policy. During training, the guide-poicy and execute-policy are learned using only data from the dataset, in a supervised and decoupled manner. During evaluation, the guide-policy guides the execute-policy by telling where it should go so that the reward can be maximized, serving as the \\textit{Prophet}. By doing so, our algorithm allows \\textit{state-compositionality} from the dataset, rather than \\textit{action-compositionality} conducted in prior imitation-style methods. We dumb this new approach Policy-guided Offline RL (\\texttt{POR}). \\texttt{POR} demonstrates the state-of-the-art performance on D4RL, a standard benchmark for offline RL. We also highlight the benefits of \\texttt{POR} in terms of improving with supplementary suboptimal data and easily adapting to new tasks by only changing the guide-poicy.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2022-10-15T15:54:28.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "offline reinforcement learning",
      "policy-guided imitation approach",
      "offline rl",
      "imitation-style methods",
      "principle enjoy out-of-distribution generalization"
    ],
    "tags": [
      "github project"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}