{
  "id": "1708.03312",
  "version": "v1",
  "published": "2017-08-10T17:46:28.000Z",
  "updated": "2017-08-10T17:46:28.000Z",
  "title": "Radical-level Ideograph Encoder for RNN-based Sentiment Analysis of Chinese and Japanese",
  "authors": [
    "Yuanzhi Ke",
    "Masafumi Hagiwara"
  ],
  "comment": "12 pages, 4 figures",
  "categories": [
    "cs.CL"
  ],
  "abstract": "The character vocabulary can be very large in non-alphabetic languages such as Chinese and Japanese, which makes neural network models huge to process such languages. We explored a model for sentiment classification that takes the embeddings of the radicals of the Chinese characters, i.e, hanzi of Chinese and kanji of Japanese. Our model is composed of a CNN word feature encoder and a bi-directional RNN document feature encoder. The results achieved are on par with the character embedding-based models, and close to the state-of-the-art word embedding-based models, with 90% smaller vocabulary, and at least 13% and 80% fewer parameters than the character embedding-based models and word embedding-based models respectively. The results suggest that the radical embedding-based approach is cost-effective for machine learning on Chinese and Japanese.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2017-08-10T17:46:28.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "radical-level ideograph encoder",
      "rnn-based sentiment analysis",
      "bi-directional rnn document feature encoder",
      "word embedding-based models",
      "character embedding-based models"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 12,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}