{ "id": "2410.11610", "version": "v1", "published": "2024-10-15T13:46:19.000Z", "updated": "2024-10-15T13:46:19.000Z", "title": "Depth Estimation From Monocular Images With Enhanced Encoder-Decoder Architecture", "authors": [ "Dabbrata Das", "Argho Deb Das", "Farhan Sadaf" ], "categories": [ "cs.CV", "eess.IV" ], "abstract": "Estimating depth from a single 2D image is a challenging task because of the need for stereo or multi-view data, which normally provides depth information. This paper deals with this challenge by introducing a novel deep learning-based approach using an encoder-decoder architecture, where the Inception-ResNet-v2 model is utilized as the encoder. According to the available literature, this is the first instance of using Inception-ResNet-v2 as an encoder for monocular depth estimation, illustrating better performance than previous models. The use of Inception-ResNet-v2 enables our model to capture complex objects and fine-grained details effectively that are generally difficult to predict. Besides, our model incorporates multi-scale feature extraction to enhance depth prediction accuracy across different kinds of object sizes and distances. We propose a composite loss function consisting of depth loss, gradient edge loss, and SSIM loss, where the weights are fine-tuned to optimize the weighted sum, ensuring better balance across different aspects of depth estimation. Experimental results on the NYU Depth V2 dataset show that our model achieves state-of-the-art performance, with an ARE of 0.064, RMSE of 0.228, and accuracy ($\\delta$ $<1.25$) of 89.3%. These metrics demonstrate that our model effectively predicts depth, even in challenging circumstances, providing a scalable solution for real-world applications in robotics, 3D reconstruction, and augmented reality.", "revisions": [ { "version": "v1", "updated": "2024-10-15T13:46:19.000Z" } ], "analyses": { "keywords": [ "depth estimation", "enhanced encoder-decoder architecture", "monocular images", "model incorporates multi-scale feature extraction", "nyu depth v2 dataset" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }