123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212 |
- /**
- * Copyright (c) 2013-present, Facebook, Inc.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- *
- * @typechecks
- */
- /**
- * Unicode-enabled replacements for basic String functions.
- *
- * All the functions in this module assume that the input string is a valid
- * UTF-16 encoding of a Unicode sequence. If it's not the case, the behavior
- * will be undefined.
- *
- * WARNING: Since this module is typechecks-enforced, you may find new bugs
- * when replacing normal String functions with ones provided here.
- */
- 'use strict';
- var invariant = require('./invariant');
// UTF-16 sets aside the code units 0xD800..0xDFFF for surrogates. The high
// (lead) half and the low (trail) half sit back to back, so one range check
// from SURROGATE_HIGH_START through SURROGATE_LOW_END covers every surrogate.
var SURROGATE_HIGH_START = 0xd800; // first high (lead) surrogate
var SURROGATE_HIGH_END = 0xdbff; // last high (lead) surrogate
var SURROGATE_LOW_START = 0xdc00; // first low (trail) surrogate
var SURROGATE_LOW_END = 0xdfff; // last low (trail) surrogate

// Matches a single surrogate code unit anywhere in a string.
var SURROGATE_UNITS_REGEX = /[\uD800-\uDFFF]/;
/**
 * Tells whether a UTF-16 code unit is a surrogate, i.e. lies between
 * SURROGATE_HIGH_START and SURROGATE_LOW_END inclusive.
 *
 * @param {number} codeUnit A UTF-16 code unit, e.g. from String#charCodeAt
 * @return {boolean} True for high and low surrogates alike, false otherwise
 */
function isCodeUnitInSurrogateRange(codeUnit) {
  // Written as a positive test so that NaN (charCodeAt past the end of a
  // string) is rejected here rather than slipping through.
  if (!(codeUnit >= SURROGATE_HIGH_START)) {
    return false;
  }
  return codeUnit <= SURROGATE_LOW_END;
}
/**
 * Checks whether the code unit at `index` and the one right after it make up
 * a surrogate pair (a high surrogate followed by a low surrogate).
 * With s = "\uD83D\uDE0A", (s, 0) is true and (s, 1) is false.
 *
 * Calls `invariant(false, ...)` when `index` is not inside [0, str.length).
 *
 * @param {string} str
 * @param {number} index
 * @return {boolean}
 */
function isSurrogatePair(str, index) {
  var isIndexInBounds = 0 <= index && index < str.length;
  if (!isIndexInBounds) {
    // The descriptive message is only passed outside production builds.
    if (process.env.NODE_ENV !== 'production') {
      invariant(false, 'isSurrogatePair: Invalid index %s for string length %s.', index, str.length);
    } else {
      invariant(false);
    }
  }

  // The final code unit has nothing after it, so it cannot start a pair.
  if (index + 1 === str.length) {
    return false;
  }

  var lead = str.charCodeAt(index);
  var trail = str.charCodeAt(index + 1);
  var leadIsHigh = SURROGATE_HIGH_START <= lead && lead <= SURROGATE_HIGH_END;
  var trailIsLow = SURROGATE_LOW_START <= trail && trail <= SURROGATE_LOW_END;
  return leadIsHigh && trailIsLow;
}
/**
 * Reports whether a string contains at least one surrogate code unit.
 *
 * @param {string} str String to scan
 * @return {boolean} True if the input includes any surrogate code units
 */
function hasSurrogateUnit(str) {
  // The pattern has no `g`/`y` flag, so matching keeps no state between calls.
  var firstMatch = SURROGATE_UNITS_REGEX.exec(str);
  return firstMatch !== null;
}
/**
 * Gives the number of UTF-16 code units taken up by the Unicode character
 * found at `pos`, judged from the single code unit at that position:
 * 1 for a non-surrogate BMP unit ([U+0000..U+D7FF] and [U+E000..U+FFFF]),
 * 2 for any surrogate unit ([U+D800..U+DFFF]), since surrogates stand for
 * non-BMP characters ([U+10000..U+10FFFF]).
 *
 * Only the unit at `pos` is inspected, so a lone or trailing surrogate also
 * reports 2.
 *
 * Examples:
 * - '\u0020' => 1
 * - '\u3020' => 1
 * - '\uD835' => 2
 * - '\uD835\uDDEF' => 2
 * - '\uDDEF' => 2
 *
 * @param {string} str Non-empty string
 * @param {number} pos Position in the string to look for one code unit
 * @return {number} Number 1 or 2
 */
function getUTF16Length(str, pos) {
  var codeUnit = str.charCodeAt(pos);
  return isCodeUnitInSurrogateRange(codeUnit) ? 2 : 1;
}
/**
 * Fully Unicode-enabled replacement for String#length: counts Unicode
 * characters, so a surrogate pair contributes 1 instead of 2.
 *
 * @param {string} str Valid Unicode string
 * @return {number} The number of Unicode characters in the string
 */
function strlen(str) {
  // Without surrogates every character is exactly one code unit.
  if (!hasSurrogateUnit(str)) {
    return str.length;
  }

  var count = 0;
  var offset = 0;
  while (offset < str.length) {
    // Advance by one whole character (1 or 2 code units) per iteration.
    offset += getUTF16Length(str, offset);
    count++;
  }
  return count;
}
/**
 * Fully Unicode-enabled replacement for String#substr(): `start` and `length`
 * count Unicode characters (code points) rather than UTF-16 code units, so a
 * surrogate pair is never split.
 *
 * As with the native method, non-integer arguments are truncated toward zero.
 * Doing this up front keeps the result the same whether or not `str` contains
 * surrogates.
 *
 * @param {string} str Valid Unicode string
 * @param {number} start Location in Unicode sequence to begin extracting;
 *                       a negative value counts back from the end
 * @param {?number} length The number of Unicode characters to extract
 *                         (default when undefined: to the end of the string)
 * @return {string} Extracted sub-string
 */
function substr(str, start, length) {
  // `|| 0` folds NaN (from undefined or non-numeric input) and -0 into 0.
  var skip = Math.trunc(start) || 0;
  var count = length === undefined ? Infinity : Math.trunc(length) || 0;

  // Call the native function if there's no surrogate char: code units and
  // characters coincide.
  if (!hasSurrogateUnit(str)) {
    return str.substr(skip, count);
  }

  // Obvious cases. A string has at most `total` characters, so a start beyond
  // that is certainly out of range.
  var total = str.length;
  if (total <= 0 || skip > total || count <= 0) {
    return '';
  }

  // Translate the character offset into a code-unit offset.
  var begin = 0;
  if (skip > 0) {
    while (skip > 0 && begin < total) {
      begin += getUTF16Length(str, begin);
      skip--;
    }
    if (begin >= total) {
      return '';
    }
  } else if (skip < 0) {
    // Walk backwards from the end; the unit just before `begin` tells whether
    // the preceding character is one or two units wide.
    begin = total;
    while (skip < 0 && begin > 0) {
      begin -= getUTF16Length(str, begin - 1);
      skip++;
    }
    if (begin < 0) {
      begin = 0;
    }
  }

  // Translate the character count into an end offset. A count of at least
  // `total` characters always reaches the end of the string.
  var end = total;
  if (count < total) {
    end = begin;
    while (count > 0 && end < total) {
      end += getUTF16Length(str, end);
      count--;
    }
  }

  return str.substring(begin, end);
}
/**
 * Fully Unicode-enabled replacement for String#substring(): `start` and `end`
 * are positions counted in Unicode characters (code points).
 *
 * Follows the native method's argument handling: each bound is truncated
 * toward zero, negative or NaN bounds become 0, and the bounds are swapped
 * when `start` is greater than `end`. The extraction itself is delegated to
 * `substr`.
 *
 * @param {string} str Valid Unicode string
 * @param {number} start Location in Unicode sequence to begin extracting
 * @param {?number} end Location in Unicode sequence to end extracting
 *                      (default when undefined: end of the string)
 * @return {string} Extracted sub-string
 */
function substring(str, start, end) {
  // Truncate before measuring the span: taking the difference of the raw
  // fractional bounds would round differently from truncating each bound.
  // `|| 0` folds NaN and -0 into 0.
  var from = Math.trunc(start) || 0;
  var to = end === undefined ? Infinity : Math.trunc(end) || 0;
  if (from < 0) {
    from = 0;
  }
  if (to < 0) {
    to = 0;
  }

  var low = Math.min(from, to);
  var high = Math.max(from, to);
  // Equal bounds (including Infinity and Infinity, whose difference is NaN)
  // select nothing.
  var span = high === low ? 0 : high - low;
  return substr(str, low, span);
}
/**
 * Lists the Unicode code points of a string, one entry per character
 * (a surrogate pair yields a single entry).
 *
 * @param {string} str Valid Unicode string
 * @return {array<number>} A list of code-points in [0..0x10FFFF]
 */
function getCodePoints(str) {
  var result = [];
  var offset = 0;
  while (offset < str.length) {
    result.push(str.codePointAt(offset));
    // Step over the whole character: 2 code units for a surrogate, else 1.
    offset += getUTF16Length(str, offset);
  }
  return result;
}
// Public surface of the module. Properties are attached in the same order as
// before so that key enumeration order stays unchanged.
var UnicodeUtils = {};
UnicodeUtils.getCodePoints = getCodePoints;
UnicodeUtils.getUTF16Length = getUTF16Length;
UnicodeUtils.hasSurrogateUnit = hasSurrogateUnit;
UnicodeUtils.isCodeUnitInSurrogateRange = isCodeUnitInSurrogateRange;
UnicodeUtils.isSurrogatePair = isSurrogatePair;
UnicodeUtils.strlen = strlen;
UnicodeUtils.substring = substring;
UnicodeUtils.substr = substr;

module.exports = UnicodeUtils;
|