{ "id": "1708.03312", "version": "v1", "published": "2017-08-10T17:46:28.000Z", "updated": "2017-08-10T17:46:28.000Z", "title": "Radical-level Ideograph Encoder for RNN-based Sentiment Analysis of Chinese and Japanese", "authors": [ "Yuanzhi Ke", "Masafumi Hagiwara" ], "comment": "12 pages, 4 figures", "categories": [ "cs.CL" ], "abstract": "The character vocabulary can be very large in non-alphabetic languages such as Chinese and Japanese, which makes neural network models huge to process such languages. We explored a model for sentiment classification that takes the embeddings of the radicals of the Chinese characters, i.e., hanzi of Chinese and kanji of Japanese. Our model is composed of a CNN word feature encoder and a bi-directional RNN document feature encoder. The results achieved are on par with the character embedding-based models, and close to the state-of-the-art word embedding-based models, with 90% smaller vocabulary, and at least 13% and 80% fewer parameters than the character embedding-based models and word embedding-based models respectively. The results suggest that the radical embedding-based approach is cost-effective for machine learning on Chinese and Japanese.", "revisions": [ { "version": "v1", "updated": "2017-08-10T17:46:28.000Z" } ], "analyses": { "keywords": [ "radical-level ideograph encoder", "rnn-based sentiment analysis", "bi-directional rnn document feature encoder", "word embedding-based models", "character embedding-based models" ], "note": { "typesetting": "TeX", "pages": 12, "language": "en", "license": "arXiv", "status": "editable" } } }