{ "id": "1706.00977", "version": "v1", "published": "2017-06-03T16:48:34.000Z", "updated": "2017-06-03T16:48:34.000Z", "title": "Thompson Sampling for the MNL-Bandit", "authors": [ "Shipra Agrawal", "Vashist Avadhanula", "Vineet Goyal", "Assaf Zeevi" ], "comment": "Accepted for presentation at Conference on Learning Theory (COLT) 2017", "categories": [ "cs.LG" ], "abstract": "We consider a sequential subset selection problem under parameter uncertainty, where at each time step, the decision maker selects a subset of cardinality $K$ from $N$ possible items (arms), and observes (bandit) feedback in the form of the index of one of the items in said subset, or none. Each item in the index set is ascribed a certain value (reward), and the feedback is governed by a Multinomial Logit (MNL) choice model whose parameters are a priori unknown. The objective of the decision maker is to maximize the expected cumulative reward over a finite horizon $T$, or alternatively, minimize the regret relative to an oracle that knows the MNL parameters. We refer to this as the MNL-Bandit problem. This problem is representative of a larger family of exploration-exploitation problems that involve a combinatorial objective and arise in several important application domains. We present an approach to adapt Thompson Sampling to this problem and show that it achieves near-optimal regret as well as attractive numerical performance.", "revisions": [ { "version": "v1", "updated": "2017-06-03T16:48:34.000Z" } ], "analyses": { "keywords": [ "thompson sampling", "mnl-bandit", "sequential subset selection problem", "achieves near-optimal regret", "decision maker selects" ], "tags": [ "conference paper" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }