{ "id": "1806.09278", "version": "v1", "published": "2018-06-25T04:11:03.000Z", "updated": "2018-06-25T04:11:03.000Z", "title": "Best Vision Technologies Submission to ActivityNet Challenge 2018-Task: Dense-Captioning Events in Videos", "authors": [ "Yuan Liu", "Moyini Yao" ], "comment": "Rank 2 in ActivityNet Captions Challenge 2018", "categories": [ "cs.CV" ], "abstract": "This note describes the details of our solution to the dense-captioning events in videos task of ActivityNet Challenge 2018. Specifically, we solve this problem in a two-stage way, i.e., first temporal event proposal and then sentence generation. For temporal event proposal, we directly leverage the three-stage workflow in [13, 16]. For sentence generation, we capitalize on an LSTM-based captioning framework with temporal attention mechanism (dubbed as LSTM-T). Moreover, the input visual sequence to the LSTM-based video captioning model is comprised of RGB and optical flow images. At inference, we adopt a late fusion scheme to fuse the two LSTM-based captioning models for sentence generation.", "revisions": [ { "version": "v1", "updated": "2018-06-25T04:11:03.000Z" } ], "analyses": { "keywords": [ "best vision technologies submission", "activitynet challenge", "dense-captioning events", "sentence generation", "first temporal event proposal" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }