UnicodeCJK.js.flow 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. /**
  2. * Copyright (c) 2013-present, Facebook, Inc.
  3. *
  4. * This source code is licensed under the MIT license found in the
  5. * LICENSE file in the root directory of this source tree.
  6. *
  7. * @providesModule UnicodeCJK
  8. * @typechecks
  9. */
  10. /**
  11. * Unicode algorithms for CJK (Chinese, Japanese, Korean) writing systems.
  12. *
  13. * Utilities for Hanzi/Kanji/Hanja logographs and Kanas (Katakana and Hiragana)
  14. * syllables.
  15. *
  16. * For Korean Hangul see module `UnicodeHangulKorean`.
  17. */
  18. 'use strict';
  /**
   * Latin
   *
   * Every R_* constant in this file is a character-class fragment: it is meant
   * to be wrapped in `[...]` when a RegExp is built from it.
   *
   * NOTE: The code assumes these sets include only BMP characters.
   */
  const R_LATIN_ASCII = 'a-zA-Z';
  const R_LATIN_FULLWIDTH = '\uFF21-\uFF3A\uFF41-\uFF5A';
  const R_LATIN = R_LATIN_ASCII + R_LATIN_FULLWIDTH;

  /**
   * Hiragana & Katakana
   *
   * NOTE: Some ranges include non-BMP characters. We do not support those
   * ranges for now:
   *   - Kana Supplement: U+1B000-U+1B0FF
   */
  const R_HIRAGANA = '\u3040-\u309F';
  const R_KATAKANA = '\u30A0-\u30FF';
  const R_KATAKANA_PHONETIC = '\u31F0-\u31FF';
  const R_KATAKANA_HALFWIDTH = '\uFF65-\uFF9F';
  const R_KATAKANA_ALL = R_KATAKANA + R_KATAKANA_PHONETIC + R_KATAKANA_HALFWIDTH;
  const R_KANA = R_HIRAGANA + R_KATAKANA_ALL;

  // [first, last] code points of the Hiragana and Katakana blocks, and the
  // distance between the two block starts (0x60).
  const I_HIRAGANA = [0x3040, 0x309F];
  const I_KATAKANA = [0x30A0, 0x30FF];
  const I_HIRAGANA_TO_KATAKANA = I_KATAKANA[0] - I_HIRAGANA[0];

  /**
   * Hanzi/Kanji/Hanja
   *
   * NOTE: Some ranges include non-BMP characters. We do not support those
   * ranges for now:
   *   - CJK Unified Ideographs Extension B: U+20000-U+2A6DF
   *   - CJK Unified Ideographs Extension C: U+2A700-U+2B73F
   *   - CJK Unified Ideographs Extension D: U+2B740-U+2B81F
   */
  // The whole CJK Unified Ideographs block (U+4E00-U+9FFF) is covered, like the
  // other block-based ranges in this file, so that ideographs encoded after
  // U+9FCF are matched as well.
  const R_IDEO_MAIN = '\u4E00-\u9FFF';
  const R_IDEO_EXT_A = '\u3400-\u4DBF';
  const R_IDEO = R_IDEO_MAIN + R_IDEO_EXT_A;

  /**
   * Hangul
   *
   * Only precomposed Hangul syllables are included. The following ranges are
   * intentionally left out:
   *   - Hangul Jamo: U+1100-U+11FF
   *   - Hangul Jamo Extended-A: U+A960-U+A97F
   *   - Hangul Jamo Extended-B: U+D7B0-U+D7FF
   *   - Hangul Compatibility Jamo: U+3130-U+318F
   *   - Halfwidth Hangul variants: U+FFA0-U+FFDF
   */
  const R_HANGUL_SYLLABLES = '\uAC00-\uD7AF';

  /**
   * Globals
   *
   * The REGEX_* variables cache lazily compiled matchers; each stays `null`
   * until the function that needs it runs for the first time.
   */
  const R_IDEO_OR_SYLL = R_IDEO + R_KANA + R_HANGUL_SYLLABLES;
  let REGEX_IDEO = null;
  let REGEX_KANA = null;
  let REGEX_IDEO_OR_SYLL = null;
  let REGEX_IS_KANA_WITH_TRAILING_LATIN = null;
  /**
   * Tests whether `str` contains at least one character from the kana ranges
   * in `R_KANA`.
   *
   * The matcher is compiled on the first call and cached in `REGEX_KANA`.
   *
   * @param {string} str
   * @return {boolean}
   */
  function hasKana(str) {
    if (!REGEX_KANA) {
      REGEX_KANA = new RegExp(`[${R_KANA}]`);
    }
    return REGEX_KANA.test(str);
  }
  /**
   * Tests whether `str` contains at least one character from the CJK ideograph
   * ranges in `R_IDEO`.
   *
   * The matcher is compiled on the first call and cached in `REGEX_IDEO`.
   *
   * @param {string} str
   * @return {boolean}
   */
  function hasIdeograph(str) {
    if (!REGEX_IDEO) {
      REGEX_IDEO = new RegExp(`[${R_IDEO}]`);
    }
    return REGEX_IDEO.test(str);
  }
  /**
   * Tests whether `str` contains at least one character from the combined
   * ideograph/syllable ranges in `R_IDEO_OR_SYLL`.
   *
   * The matcher is compiled on the first call and cached in
   * `REGEX_IDEO_OR_SYLL`.
   *
   * @param {string} str
   * @return {boolean}
   */
  function hasIdeoOrSyll(str) {
    if (!REGEX_IDEO_OR_SYLL) {
      REGEX_IDEO_OR_SYLL = new RegExp(`[${R_IDEO_OR_SYLL}]`);
    }
    return REGEX_IDEO_OR_SYLL.test(str);
  }
  /**
   * Converts the first UTF-16 code unit of `chr` to Katakana when it is a
   * Hiragana character that has a Katakana counterpart; any other code unit is
   * returned unchanged.
   *
   * Only Hiragana letters (U+3041-U+3096) and the iteration marks
   * (U+309D, U+309E) are shifted by `I_HIRAGANA_TO_KATAKANA`. The remaining
   * code points of the Hiragana block (unassigned slots, the voicing marks
   * U+3099-U+309C and the digraph U+309F) have no counterpart at that offset,
   * so shifting them would produce unrelated Katakana characters.
   *
   * @param {string} chr A one-code-unit string; an empty string is returned
   *   as is.
   * @return {string}
   */
  function charCodeToKatakana(chr) {
    if (chr.length === 0) {
      // charCodeAt(0) would be NaN here and fromCharCode(NaN) yields '\u0000'.
      return chr;
    }
    const charCode = chr.charCodeAt(0);
    const isHiraganaLetter = charCode >= 0x3041 && charCode <= 0x3096;
    const isIterationMark = charCode === 0x309D || charCode === 0x309E;
    const isConvertible = isHiraganaLetter || isIterationMark;
    return String.fromCharCode(
      isConvertible ? charCode + I_HIRAGANA_TO_KATAKANA : charCode
    );
  }
  /**
   * Returns a copy of `str` in which every UTF-16 code unit has been passed
   * through `charCodeToKatakana`.
   *
   * When `hasKana` reports that `str` contains no kana at all, `str` itself is
   * returned without being split.
   *
   * @param {string} str
   * @return {string}
   */
  function hiraganaToKatakana(str) {
    if (hasKana(str)) {
      const converted = [];
      for (const codeUnit of str.split('')) {
        converted.push(charCodeToKatakana(codeUnit));
      }
      return converted.join('');
    }
    return str;
  }
  /**
   * Tests whether `str` consists, from start to end, of one or more characters
   * from `R_KANA` followed by exactly one character from `R_LATIN`.
   *
   * The matcher is compiled on the first call and cached in
   * `REGEX_IS_KANA_WITH_TRAILING_LATIN`.
   *
   * @param {string} str
   * @return {boolean}
   */
  function isKanaWithTrailingLatin(str) {
    if (!REGEX_IS_KANA_WITH_TRAILING_LATIN) {
      const pattern = `^[${R_KANA}]+[${R_LATIN}]$`;
      REGEX_IS_KANA_WITH_TRAILING_LATIN = new RegExp(pattern);
    }
    return REGEX_IS_KANA_WITH_TRAILING_LATIN.test(str);
  }
  /**
   * Drops the trailing Latin character from a string that is exactly a
   * sequence of Kana characters followed by one Latin character (as decided by
   * `isKanaWithTrailingLatin`). Any other string is returned unchanged.
   *
   * @param {string} str
   * @return {string}
   */
  function kanaRemoveTrailingLatin(str) {
    if (!isKanaWithTrailingLatin(str)) {
      return str;
    }
    // slice() is used instead of the legacy Annex B substr(); the trailing
    // Latin character is a single UTF-16 code unit, so dropping one unit is
    // enough.
    return str.slice(0, -1);
  }
  // Public API of the module; every entry is the function of the same name
  // defined in this file.
  const UnicodeCJK = {
    hasKana,
    hasIdeograph,
    hasIdeoOrSyll,
    hiraganaToKatakana,
    isKanaWithTrailingLatin,
    kanaRemoveTrailingLatin,
  };

  module.exports = UnicodeCJK;