{ "id": "2004.02857", "version": "v1", "published": "2020-04-06T17:49:12.000Z", "updated": "2020-04-06T17:49:12.000Z", "title": "Beyond the Nav-Graph: Vision-and-Language Navigation in Continuous Environments", "authors": [ "Jacob Krantz", "Erik Wijmans", "Arjun Majumdar", "Dhruv Batra", "Stefan Lee" ], "categories": [ "cs.CV", "cs.CL", "cs.RO" ], "abstract": "We develop a language-guided navigation task set in a continuous 3D environment where agents must execute low-level actions to follow natural language navigation directions. By being situated in continuous environments, this setting lifts a number of assumptions implicit in prior work that represents environments as a sparse graph of panoramas with edges corresponding to navigability. Specifically, our setting drops the presumptions of known environment topologies, short-range oracle navigation, and perfect agent localization. To contextualize this new task, we develop models that mirror many of the advances made in prior settings as well as single-modality baselines. While some of these techniques transfer, we find significantly lower absolute performance in the continuous setting -- suggesting that performance in prior `navigation-graph' settings may be inflated by the strong implicit assumptions.", "revisions": [ { "version": "v1", "updated": "2020-04-06T17:49:12.000Z" } ], "analyses": { "keywords": [ "continuous environments", "vision-and-language navigation", "natural language navigation directions", "strong implicit assumptions", "language-guided navigation task set" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }