{
  "id": "2010.14920",
  "version": "v1",
  "published": "2020-10-28T12:33:04.000Z",
  "updated": "2020-10-28T12:33:04.000Z",
  "title": "Bridging the Modality Gap for Speech-to-Text Translation",
  "authors": [
    "Yuchen Liu",
    "Junnan Zhu",
    "Jiajun Zhang",
    "Chengqing Zong"
  ],
  "categories": [
    "cs.CL"
  ],
  "abstract": "End-to-end speech translation aims to translate speech in one language into text in another language via an end-to-end way. Most existing methods employ an encoder-decoder structure with a single encoder to learn acoustic representation and semantic information simultaneously, which ignores the speech-and-text modality differences and makes the encoder overloaded, leading to great difficulty in learning such a model. To address these issues, we propose a Speech-to-Text Adaptation for Speech Translation (STAST) model which aims to improve the end-to-end model performance by bridging the modality gap between speech and text. Specifically, we decouple the speech translation encoder into three parts and introduce a shrink mechanism to match the length of speech representation with that of the corresponding text transcription. To obtain better semantic representation, we completely integrate a text-based translation model into the STAST so that two tasks can be trained in the same latent space. Furthermore, we introduce a cross-modal adaptation method to close the distance between speech and text representation. Experimental results on English-French and English-German speech translation corpora have shown that our model significantly outperforms strong baselines, and achieves the new state-of-the-art performance.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2020-10-28T12:33:04.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "modality gap",
      "speech-to-text translation",
      "end-to-end speech translation aims",
      "representation",
      "model significantly outperforms strong baselines"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}