UnicodeHangulKorean.js.flow 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. /**
  2. * Copyright (c) 2013-present, Facebook, Inc.
  3. *
  4. * This source code is licensed under the MIT license found in the
  5. * LICENSE file in the root directory of this source tree.
  6. *
  7. * @providesModule UnicodeHangulKorean
  8. * @typechecks
  9. */
  10. /**
  11. * Unicode algorithms for Hangul script, the Korean writing system
  12. *
  13. * Hangul script has three encoded models in Unicode:
  14. *
  15. * A) Conjoining Jamo (covers modern and historic elements)
  16. * * U+1100..U+11FF ; Hangul Jamo
  17. * * U+A960..U+A97F ; Hangul Jamo Extended-A
  18. * * U+D7B0..U+D7FF ; Hangul Jamo Extended-B
  19. *
  20. * B) Conjoined Syllables (only covers modern Korean language)
  21. * * U+AC00..U+D7AF ; Hangul Syllables
  22. *
  23. * C) Compatibility Jamo (one code-point for each "shape")
  24. * * U+3130..U+318F ; Hangul Compatibility Jamo
  25. *
  26. * This module helps you convert characters from one model to another.
  27. * Primary functionalities are:
  28. *
  29. * 1) Convert from any encoding to Conjoining Jamo characters (A),
  30. * e.g. for prefix matching
  31. *
  32. * 2) Convert from any encoding to Syllable characters, when possible (B),
  33. * e.g. to reach the normal Unicode form (NFC)
  34. */
  35. 'use strict';
  // Matches a single UTF-16 code unit in U+3130..U+318F (Hangul Compatibility
  // Jamo block) or U+AC00..U+D7AF (Hangul Syllables block). The pattern has no
  // `g`/`y` flag, so repeated `.test()` calls carry no `lastIndex` state.
  const HANGUL_COMPATIBILITY_OR_SYLLABLE_REGEX = /[\u3130-\u318F\uAC00-\uD7AF]/;

  /**
   * Tells whether the given text contains at least one Hangul Compatibility
   * Jamo or Hangul Conjoined Syllable.
   *
   * @param {string} str Text to scan
   * @returns {boolean} `true` when such a character is present
   */
  function hasCompatibilityOrSyllable(str) {
    const foundHangul = HANGUL_COMPATIBILITY_OR_SYLLABLE_REGEX.test(str);
    return foundHangul;
  }
  /* Compatibility Jamo -> Conjoining Jamo
   *
   * Lookup table from a Hangul Compatibility Jamo to its Conjoining Jamo
   * counterpart. The entry for a compatibility code point lives at index
   * (compatibilityCodePoint - 0x3131).
   *
   * Generated by:
   * $ grep '^31[3-8].;' UnicodeData.txt |\
   * awk -F';' '{print $6}' | awk '{print " 0x"$2","}'
   */
  const CMAP = [
    0x1100, 0x1101, 0x11AA, 0x1102, 0x11AC, 0x11AD, 0x1103, 0x1104, 0x1105, 0x11B0,
    0x11B1, 0x11B2, 0x11B3, 0x11B4, 0x11B5, 0x111A, 0x1106, 0x1107, 0x1108, 0x1121,
    0x1109, 0x110A, 0x110B, 0x110C, 0x110D, 0x110E, 0x110F, 0x1110, 0x1111, 0x1112,
    0x1161, 0x1162, 0x1163, 0x1164, 0x1165, 0x1166, 0x1167, 0x1168, 0x1169, 0x116A,
    0x116B, 0x116C, 0x116D, 0x116E, 0x116F, 0x1170, 0x1171, 0x1172, 0x1173, 0x1174,
    0x1175, 0x1160, 0x1114, 0x1115, 0x11C7, 0x11C8, 0x11CC, 0x11CE, 0x11D3, 0x11D7,
    0x11D9, 0x111C, 0x11DD, 0x11DF, 0x111D, 0x111E, 0x1120, 0x1122, 0x1123, 0x1127,
    0x1129, 0x112B, 0x112C, 0x112D, 0x112E, 0x112F, 0x1132, 0x1136, 0x1140, 0x1147,
    0x114C, 0x11F1, 0x11F2, 0x1157, 0x1158, 0x1159, 0x1184, 0x1185, 0x1188, 0x1191,
    0x1192, 0x1194, 0x119E, 0x11A1
  ];
  // First compatibility code point covered by CMAP.
  const CBASE = 0x3131;
  // Number of compatibility code points covered by CMAP.
  const CCOUNT = CMAP.length;
  // Exclusive upper bound of the covered range: valid inputs are [CBASE, CTOP).
  const CTOP = CBASE + CCOUNT;

  /**
   * Converts a single Hangul Compatibility Jamo code point into the matching
   * Hangul Conjoining Jamo character, using the CMAP table derived from
   * UnicodeData.txt.
   *
   * No range check is performed here; the result is only meaningful for code
   * points in [CBASE, CTOP).
   *
   * @param {number} codePoint One Unicode code-point
   * @returns {string} The one-character Conjoining Jamo string
   */
  function fromCompatibility(codePoint) {
    const tableIndex = codePoint - CBASE;
    const conjoiningCodeUnit = CMAP[tableIndex];
    return String.fromCharCode(conjoiningCodeUnit);
  }
  /**
   * Conjoined Syllable -> Conjoining Jamo
   *
   * Based on the "Hangul Syllable Decomposition" algorithm provided in
   * 3.12 Conjoining Jamo Behavior, The Unicode Standard, Version 6.3.0.
   * <http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf>
   */
  // Base code points for the leading (L), vowel (V) and trailing (T) jamo and
  // for the precomposed syllables (S). TBASE sits one below the first trailing
  // jamo because a trailing index of 0 means "no trailing jamo" (see the
  // `sylTIndex > 0` check in decomposeSyllable).
  const LBASE = 0x1100;
  const VBASE = 0x1161;
  const TBASE = 0x11A7;
  const SBASE = 0xAC00;
  // Number of L, V and T slots that make up the syllable grid.
  const LCOUNT = 19;
  const VCOUNT = 21;
  const TCOUNT = 28;
  // Syllables per leading jamo, and total number of precomposed syllables.
  const NCOUNT = VCOUNT * TCOUNT;
  const SCOUNT = LCOUNT * NCOUNT;
  // Exclusive upper bound of the syllable range: valid inputs are [SBASE, STOP).
  const STOP = SBASE + SCOUNT;

  /**
   * Splits one precomposed Hangul Syllable code-point into its Conjoining Jamo
   * sequence (leading + vowel, plus a trailing jamo when the syllable has one),
   * using the arithmetic decomposition referenced above.
   *
   * No range check is performed here; the result is only meaningful for code
   * points in [SBASE, STOP).
   *
   * @param {number} codePoint One Unicode character
   * @returns {string} Two or three Conjoining Jamo characters
   */
  function decomposeSyllable(codePoint) {
    const sylSIndex = codePoint - SBASE;
    // Integer division made explicit instead of relying on
    // String.fromCharCode() silently truncating a fractional code unit.
    const sylLIndex = Math.floor(sylSIndex / NCOUNT);
    const sylVIndex = Math.floor((sylSIndex % NCOUNT) / TCOUNT);
    const sylTIndex = sylSIndex % TCOUNT;

    const leadingAndVowel = String.fromCharCode(LBASE + sylLIndex, VBASE + sylVIndex);
    // Index 0 in the trailing slot means the syllable has no final consonant.
    return sylTIndex > 0
      ? leadingAndVowel + String.fromCharCode(TBASE + sylTIndex)
      : leadingAndVowel;
  }
  /* To Conjoining Jamo */

  /**
   * Rewrites every Hangul Compatibility Jamo and Hangul Syllable in the input
   * as Conjoining Jamo; every other UTF-16 code unit is copied through as is.
   * When the input contains none of those characters it is returned untouched.
   *
   * @param {string} string Text to convert
   * @returns {string} The text in Conjoining Jamo form
   */
  function toConjoiningJamo(string) {
    // Fast path: nothing to convert, so hand the caller's value straight back.
    if (!hasCompatibilityOrSyllable(string)) {
      return string;
    }

    let converted = '';
    for (let index = 0; index < string.length; index++) {
      const unit = string.charAt(index);
      const code = unit.charCodeAt(0);
      if (code >= CBASE && code < CTOP) {
        // Compatibility Jamo: one-to-one table lookup.
        converted += fromCompatibility(code);
      } else if (code >= SBASE && code < STOP) {
        // Precomposed syllable: expands to two or three jamo.
        converted += decomposeSyllable(code);
      } else {
        converted += unit;
      }
    }
    return converted;
  }
  // Public surface of the module: only the Conjoining Jamo conversion is
  // exported.
  const UnicodeHangulKorean = {
    toConjoiningJamo,
  };

  module.exports = UnicodeHangulKorean;