{ "id": "1912.01192", "version": "v1", "published": "2019-12-03T05:04:40.000Z", "updated": "2019-12-03T05:04:40.000Z", "title": "Learning Adversarial MDPs with Bandit Feedback and Unknown Transition", "authors": [ "Tiancheng Jin", "Haipeng Luo" ], "comment": "14 pages", "categories": [ "cs.LG", "stat.ML" ], "abstract": "We consider the problem of learning in episodic finite-horizon Markov decision processes with unknown transition function, bandit feedback, and adversarial losses. We propose an efficient algorithm that achieves $\\mathcal{\\tilde{O}}(L|X|^2\\sqrt{|A|T})$ regret with high probability, where $L$ is the horizon, $|X|$ is the number of states, $|A|$ is the number of actions, and $T$ is the number of episodes. To the best of our knowledge, our algorithm is the first one to ensure sub-linear regret in this challenging setting. Our key technical contribution is to introduce an optimistic loss estimator that is inversely weighted by an $\\textit{upper occupancy bound}$.", "revisions": [ { "version": "v1", "updated": "2019-12-03T05:04:40.000Z" } ], "analyses": { "subjects": [ "I.2.6", "I.2.6" ], "keywords": [ "learning adversarial mdps", "bandit feedback", "episodic finite-horizon markov decision processes", "ensure sub-linear regret", "optimistic loss estimator" ], "note": { "typesetting": "TeX", "pages": 14, "language": "en", "license": "arXiv", "status": "editable" } } }