{ "id": "2309.15826", "version": "v1", "published": "2023-09-27T17:48:14.000Z", "updated": "2023-09-27T17:48:14.000Z", "title": "Cross-Modal Multi-Tasking for Speech-to-Text Translation via Hard Parameter Sharing", "authors": [ "Brian Yan", "Xuankai Chang", "Antonios Anastasopoulos", "Yuya Fujita", "Shinji Watanabe" ], "categories": [ "cs.CL", "cs.SD", "eess.AS" ], "abstract": "Recent works in end-to-end speech-to-text translation (ST) have proposed multi-tasking methods with soft parameter sharing which leverage machine translation (MT) data via secondary encoders that map text inputs to an eventual cross-modal representation. In this work, we instead propose a ST/MT multi-tasking framework with hard parameter sharing in which all model parameters are shared cross-modally. Our method reduces the speech-text modality gap via a pre-processing stage which converts speech and text inputs into two discrete token sequences of similar length -- this allows models to indiscriminately process both modalities simply using a joint vocabulary. With experiments on MuST-C, we demonstrate that our multi-tasking framework improves attentional encoder-decoder, Connectionist Temporal Classification (CTC), transducer, and joint CTC/attention models by an average of +0.5 BLEU without any external MT data. Further, we show that this framework incorporates external MT data, yielding +0.8 BLEU, and also improves transfer learning from pre-trained textual models, yielding +1.8 BLEU.", "revisions": [ { "version": "v1", "updated": "2023-09-27T17:48:14.000Z" } ], "analyses": { "keywords": [ "hard parameter sharing", "speech-to-text translation", "cross-modal multi-tasking", "framework incorporates external mt data", "text inputs" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }