{ "id": "1712.06317", "version": "v1", "published": "2017-12-18T10:02:23.000Z", "updated": "2017-12-18T10:02:23.000Z", "title": "Spatial-Temporal Memory Networks for Video Object Detection", "authors": [ "Fanyi Xiao", "Yong Jae Lee" ], "categories": [ "cs.CV" ], "abstract": "We introduce Spatial-Temporal Memory Networks (STMN) for video object detection. At its core, we propose a novel Spatial-Temporal Memory module (STMM) as the recurrent computation unit to model long-term temporal appearance and motion dynamics. The STMM's design enables the integration of ImageNet pre-trained backbone CNN weights for both the feature stack as well as the prediction head, which we find to be critical for accurate detection. Furthermore, in order to tackle object motion in videos, we propose a novel MatchTrans module to align the spatial-temporal memory from frame to frame. We compare our method to state-of-the-art detectors on ImageNet VID, and conduct ablative studies to dissect the contribution of our different design choices. We obtain state-of-the-art results with the VGG backbone, and competitive results with the ResNet backbone. To our knowledge, this is the first video object detector that is equipped with an explicit memory mechanism to model long-term temporal dynamics.", "revisions": [ { "version": "v1", "updated": "2017-12-18T10:02:23.000Z" } ], "analyses": { "keywords": [ "video object detection", "spatial-temporal memory networks", "imagenet pre-trained backbone cnn weights", "novel spatial-temporal memory module", "model long-term temporal appearance" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }