UnicodeUtils.js 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. /**
  2. * Copyright (c) 2013-present, Facebook, Inc.
  3. *
  4. * This source code is licensed under the MIT license found in the
  5. * LICENSE file in the root directory of this source tree.
  6. *
  7. * @typechecks
  8. */
  9. /**
  10. * Unicode-enabled replacements for basic String functions.
  11. *
  12. * All the functions in this module assume that the input string is a valid
  13. * UTF-16 encoding of a Unicode sequence. If it's not the case, the behavior
  14. * will be undefined.
  15. *
  16. * WARNING: Since this module is typechecks-enforced, you may find new bugs
  17. * when replacing normal String functions with ones provided here.
  18. */
  19. 'use strict';
  20. var invariant = require('./invariant');
  21. // These two ranges are consecutive so anything in [HIGH_START, LOW_END] is a
  22. // surrogate code unit.
  23. var SURROGATE_HIGH_START = 0xD800;
  24. var SURROGATE_HIGH_END = 0xDBFF;
  25. var SURROGATE_LOW_START = 0xDC00;
  26. var SURROGATE_LOW_END = 0xDFFF;
  27. var SURROGATE_UNITS_REGEX = /[\uD800-\uDFFF]/;
  28. /**
  29. * @param {number} codeUnit A Unicode code-unit, in range [0, 0x10FFFF]
  30. * @return {boolean} Whether code-unit is in a surrogate (hi/low) range
  31. */
  32. function isCodeUnitInSurrogateRange(codeUnit) {
  33. return SURROGATE_HIGH_START <= codeUnit && codeUnit <= SURROGATE_LOW_END;
  34. }
  35. /**
  36. * Returns whether the two characters starting at `index` form a surrogate pair.
  37. * For example, given the string s = "\uD83D\uDE0A", (s, 0) returns true and
  38. * (s, 1) returns false.
  39. *
  40. * @param {string} str
  41. * @param {number} index
  42. * @return {boolean}
  43. */
  44. function isSurrogatePair(str, index) {
  45. !(0 <= index && index < str.length) ? process.env.NODE_ENV !== 'production' ? invariant(false, 'isSurrogatePair: Invalid index %s for string length %s.', index, str.length) : invariant(false) : void 0;
  46. if (index + 1 === str.length) {
  47. return false;
  48. }
  49. var first = str.charCodeAt(index);
  50. var second = str.charCodeAt(index + 1);
  51. return SURROGATE_HIGH_START <= first && first <= SURROGATE_HIGH_END && SURROGATE_LOW_START <= second && second <= SURROGATE_LOW_END;
  52. }
  53. /**
  54. * @param {string} str Non-empty string
  55. * @return {boolean} True if the input includes any surrogate code units
  56. */
  57. function hasSurrogateUnit(str) {
  58. return SURROGATE_UNITS_REGEX.test(str);
  59. }
  60. /**
  61. * Return the length of the original Unicode character at given position in the
  62. * String by looking into the UTF-16 code unit; that is equal to 1 for any
  63. * non-surrogate characters in BMP ([U+0000..U+D7FF] and [U+E000, U+FFFF]); and
  64. * returns 2 for the hi/low surrogates ([U+D800..U+DFFF]), which are in fact
  65. * representing non-BMP characters ([U+10000..U+10FFFF]).
  66. *
  67. * Examples:
  68. * - '\u0020' => 1
  69. * - '\u3020' => 1
  70. * - '\uD835' => 2
  71. * - '\uD835\uDDEF' => 2
  72. * - '\uDDEF' => 2
  73. *
  74. * @param {string} str Non-empty string
  75. * @param {number} pos Position in the string to look for one code unit
  76. * @return {number} Number 1 or 2
  77. */
  78. function getUTF16Length(str, pos) {
  79. return 1 + isCodeUnitInSurrogateRange(str.charCodeAt(pos));
  80. }
  81. /**
  82. * Fully Unicode-enabled replacement for String#length
  83. *
  84. * @param {string} str Valid Unicode string
  85. * @return {number} The number of Unicode characters in the string
  86. */
  87. function strlen(str) {
  88. // Call the native functions if there's no surrogate char
  89. if (!hasSurrogateUnit(str)) {
  90. return str.length;
  91. }
  92. var len = 0;
  93. for (var pos = 0; pos < str.length; pos += getUTF16Length(str, pos)) {
  94. len++;
  95. }
  96. return len;
  97. }
  98. /**
  99. * Fully Unicode-enabled replacement for String#substr()
  100. *
  101. * @param {string} str Valid Unicode string
  102. * @param {number} start Location in Unicode sequence to begin extracting
  103. * @param {?number} length The number of Unicode characters to extract
  104. * (default: to the end of the string)
  105. * @return {string} Extracted sub-string
  106. */
  107. function substr(str, start, length) {
  108. start = start || 0;
  109. length = length === undefined ? Infinity : length || 0;
  110. // Call the native functions if there's no surrogate char
  111. if (!hasSurrogateUnit(str)) {
  112. return str.substr(start, length);
  113. }
  114. // Obvious cases
  115. var size = str.length;
  116. if (size <= 0 || start > size || length <= 0) {
  117. return '';
  118. }
  119. // Find the actual starting position
  120. var posA = 0;
  121. if (start > 0) {
  122. for (; start > 0 && posA < size; start--) {
  123. posA += getUTF16Length(str, posA);
  124. }
  125. if (posA >= size) {
  126. return '';
  127. }
  128. } else if (start < 0) {
  129. for (posA = size; start < 0 && 0 < posA; start++) {
  130. posA -= getUTF16Length(str, posA - 1);
  131. }
  132. if (posA < 0) {
  133. posA = 0;
  134. }
  135. }
  136. // Find the actual ending position
  137. var posB = size;
  138. if (length < size) {
  139. for (posB = posA; length > 0 && posB < size; length--) {
  140. posB += getUTF16Length(str, posB);
  141. }
  142. }
  143. return str.substring(posA, posB);
  144. }
  145. /**
  146. * Fully Unicode-enabled replacement for String#substring()
  147. *
  148. * @param {string} str Valid Unicode string
  149. * @param {number} start Location in Unicode sequence to begin extracting
  150. * @param {?number} end Location in Unicode sequence to end extracting
  151. * (default: end of the string)
  152. * @return {string} Extracted sub-string
  153. */
  154. function substring(str, start, end) {
  155. start = start || 0;
  156. end = end === undefined ? Infinity : end || 0;
  157. if (start < 0) {
  158. start = 0;
  159. }
  160. if (end < 0) {
  161. end = 0;
  162. }
  163. var length = Math.abs(end - start);
  164. start = start < end ? start : end;
  165. return substr(str, start, length);
  166. }
  167. /**
  168. * Get a list of Unicode code-points from a String
  169. *
  170. * @param {string} str Valid Unicode string
  171. * @return {array<number>} A list of code-points in [0..0x10FFFF]
  172. */
  173. function getCodePoints(str) {
  174. var codePoints = [];
  175. for (var pos = 0; pos < str.length; pos += getUTF16Length(str, pos)) {
  176. codePoints.push(str.codePointAt(pos));
  177. }
  178. return codePoints;
  179. }
  180. var UnicodeUtils = {
  181. getCodePoints: getCodePoints,
  182. getUTF16Length: getUTF16Length,
  183. hasSurrogateUnit: hasSurrogateUnit,
  184. isCodeUnitInSurrogateRange: isCodeUnitInSurrogateRange,
  185. isSurrogatePair: isSurrogatePair,
  186. strlen: strlen,
  187. substring: substring,
  188. substr: substr
  189. };
  190. module.exports = UnicodeUtils;