utf8.js 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. /*! https://mths.be/utf8js v2.1.2 by @mathias */
  // Module-local alias for String.fromCharCode, used by the helpers in this
  // file that turn numeric code units / byte values into one-character strings.
  var stringFromCharCode = String.fromCharCode;
  3. // Taken from https://mths.be/punycode
  /**
   * Splits a JavaScript string into an array of numeric code points.
   *
   * A high surrogate that is immediately followed by a low surrogate is
   * combined into the single supplementary code point the pair represents.
   * Every other code unit, including an unpaired surrogate, is emitted as-is.
   *
   * @param {string} string - The string to split.
   * @returns {number[]} The code points, in input order.
   */
  function ucs2decode(string) {
    var codePoints = [];
    var total = string.length;
    var position = 0;
    while (position < total) {
      var unit = string.charCodeAt(position);
      position += 1;
      var isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;
      if (isHighSurrogate && position < total) {
        var next = string.charCodeAt(position);
        if ((next & 0xFC00) === 0xDC00) {
          // Well-formed pair: consume the low surrogate as well.
          position += 1;
          codePoints.push(((unit & 0x3FF) << 10) + (next & 0x3FF) + 0x10000);
          continue;
        }
        // Not a low surrogate: leave `next` unread, because it may itself be
        // the high surrogate that starts a valid pair.
      }
      codePoints.push(unit);
    }
    return codePoints;
  }
  31. // Taken from https://mths.be/punycode
  /**
   * Builds a JavaScript string from an array of numeric code points.
   *
   * Values above 0xFFFF are written as a high/low surrogate pair; all other
   * values are written as a single code unit.
   *
   * @param {number[]} array - The code points to join.
   * @returns {string} The resulting string.
   */
  function ucs2encode(array) {
    var result = '';
    for (var i = 0, count = array.length; i < count; i++) {
      var codePoint = array[i];
      if (codePoint > 0xFFFF) {
        // Supplementary plane: split the 20-bit offset into two 10-bit halves.
        var offset = codePoint - 0x10000;
        var lead = ((offset >>> 10) & 0x3FF) | 0xD800;
        var trail = (offset & 0x3FF) | 0xDC00;
        result += stringFromCharCode(lead) + stringFromCharCode(trail);
      } else {
        result += stringFromCharCode(codePoint);
      }
    }
    return result;
  }
  /**
   * Tells whether a code point is a Unicode scalar value, i.e. lies outside
   * the surrogate range U+D800..U+DFFF.
   *
   * @param {number} codePoint - The code point to test.
   * @param {boolean} strict - When truthy, a surrogate raises instead of
   *   yielding `false`.
   * @returns {boolean} `true` for a scalar value, `false` for a surrogate in
   *   non-strict mode.
   * @throws {Error} If `codePoint` is a surrogate and `strict` is truthy.
   */
  function checkScalarValue(codePoint, strict) {
    var isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
    if (!isSurrogate) {
      return true;
    }
    if (strict) {
      var hex = codePoint.toString(16).toUpperCase();
      throw new Error('Lone surrogate U+' + hex + ' is not a scalar value');
    }
    return false;
  }
  58. /*--------------------------------------------------------------------------*/
  /**
   * Produces one UTF-8 continuation byte (bit pattern 10xxxxxx) as a
   * one-character string.
   *
   * @param {number} codePoint - The code point being encoded.
   * @param {number} shift - How many bits to shift right before taking the
   *   low six bits as the payload.
   * @returns {string} A single character whose code is the continuation byte.
   */
  function createByte(codePoint, shift) {
    var payload = (codePoint >> shift) & 0x3F;
    return stringFromCharCode(0x80 | payload);
  }
  /**
   * Encodes one code point as a UTF-8 byte string (one character per byte).
   *
   * The sequence length is chosen by testing which high bits of the code
   * point are set. In the 3-byte range the code point is validated with
   * `checkScalarValue`; a surrogate is replaced by U+FFFD in non-strict mode
   * and raises in strict mode.
   *
   * @param {number} codePoint - The code point to encode.
   * @param {boolean} strict - Forwarded to `checkScalarValue`.
   * @returns {string} The encoded bytes.
   * @throws {Error} Propagated from `checkScalarValue` in strict mode.
   */
  function encodeCodePoint(codePoint, strict) {
    // 1-byte sequence: the value is its own encoding.
    if ((codePoint & 0xFFFFFF80) === 0) {
      return stringFromCharCode(codePoint);
    }

    var scalar = codePoint;
    var prefix = '';
    if ((scalar & 0xFFFFF800) === 0) {
      // 2-byte sequence: lead byte only; the trailing byte is added below.
      prefix = stringFromCharCode(0xC0 | ((scalar >> 6) & 0x1F));
    } else if ((scalar & 0xFFFF0000) === 0) {
      // 3-byte sequence: the only range that can contain surrogates.
      if (!checkScalarValue(scalar, strict)) {
        scalar = 0xFFFD;
      }
      prefix = stringFromCharCode(0xE0 | ((scalar >> 12) & 0x0F)) +
        createByte(scalar, 6);
    } else if ((scalar & 0xFFE00000) === 0) {
      // 4-byte sequence.
      prefix = stringFromCharCode(0xF0 | ((scalar >> 18) & 0x07)) +
        createByte(scalar, 12) +
        createByte(scalar, 6);
    }

    // Every multi-byte form ends with a continuation byte carrying the low
    // six bits.
    return prefix + stringFromCharCode(0x80 | (scalar & 0x3F));
  }
  /**
   * Encodes a JavaScript string as a UTF-8 byte string, i.e. a string in
   * which each character stands for one encoded byte.
   *
   * @param {string} string - The text to encode.
   * @param {Object} [opts] - Options.
   * @param {boolean} [opts.strict] - Strict mode is on unless this is exactly
   *   `false`; the flag is passed through to `encodeCodePoint`.
   * @returns {string} The concatenated encodings of all code points.
   * @throws {Error} Propagated from `encodeCodePoint`.
   */
  function utf8encode(string, opts) {
    var options = opts || {};
    var strict = options.strict !== false;
    var codePoints = ucs2decode(string);
    var byteString = '';
    for (var i = 0, count = codePoints.length; i < count; i++) {
      byteString += encodeCodePoint(codePoints[i], strict);
    }
    return byteString;
  }
  99. /*--------------------------------------------------------------------------*/
  /**
   * Consumes the next byte from the shared decoder state (`byteArray`,
   * `byteIndex`, `byteCount`) and returns its six payload bits, requiring it
   * to be a UTF-8 continuation byte (bit pattern 10xxxxxx).
   *
   * The cursor is advanced before the byte is validated, so it has moved past
   * the byte even when the "Invalid continuation byte" error is raised.
   *
   * @returns {number} The low six bits of the continuation byte.
   * @throws {Error} "Invalid byte index" when no byte is left to read.
   * @throws {Error} "Invalid continuation byte" when the byte read does not
   *   have the 10xxxxxx form.
   */
  function readContinuationByte() {
    if (byteIndex >= byteCount) {
      throw new Error('Invalid byte index');
    }
    var octet = byteArray[byteIndex++] & 0xFF;
    if ((octet & 0xC0) !== 0x80) {
      throw new Error('Invalid continuation byte');
    }
    return octet & 0x3F;
  }
  /**
   * Decodes the next UTF-8 sequence from the shared decoder state
   * (`byteArray`, `byteIndex`, `byteCount`) and advances the cursor past it.
   *
   * @param {boolean} strict - Forwarded to `checkScalarValue` for 3-byte
   *   sequences; in non-strict mode a decoded surrogate becomes U+FFFD.
   * @returns {number|boolean} The decoded code point, or `false` once every
   *   byte has been consumed.
   * @throws {Error} "Invalid byte index" if the cursor is past the end.
   * @throws {Error} "Invalid continuation byte" for an overlong 2- or 3-byte
   *   sequence (also raised by `readContinuationByte`).
   * @throws {Error} "Invalid UTF-8 detected" for an unrecognised lead byte or
   *   a 4-byte sequence outside U+10000..U+10FFFF.
   */
  function decodeSymbol(strict) {
    if (byteIndex > byteCount) {
      throw new Error('Invalid byte index');
    }
    if (byteIndex === byteCount) {
      // Clean end of input.
      return false;
    }

    var lead = byteArray[byteIndex] & 0xFF;
    byteIndex++;

    var second;
    var third;
    var fourth;
    var codePoint;

    // 0xxxxxxx: single byte, no continuation bytes.
    if ((lead & 0x80) === 0) {
      return lead;
    }

    // 110xxxxx: 2-byte sequence.
    if ((lead & 0xE0) === 0xC0) {
      second = readContinuationByte();
      codePoint = ((lead & 0x1F) << 6) | second;
      if (codePoint < 0x80) {
        // Overlong: the value would have fit in one byte.
        throw new Error('Invalid continuation byte');
      }
      return codePoint;
    }

    // 1110xxxx: 3-byte sequence (the only form that can yield surrogates).
    if ((lead & 0xF0) === 0xE0) {
      second = readContinuationByte();
      third = readContinuationByte();
      codePoint = ((lead & 0x0F) << 12) | (second << 6) | third;
      if (codePoint < 0x0800) {
        // Overlong: the value would have fit in two bytes.
        throw new Error('Invalid continuation byte');
      }
      return checkScalarValue(codePoint, strict) ? codePoint : 0xFFFD;
    }

    // 11110xxx: 4-byte sequence.
    if ((lead & 0xF8) === 0xF0) {
      second = readContinuationByte();
      third = readContinuationByte();
      fourth = readContinuationByte();
      codePoint = ((lead & 0x07) << 0x12) | (second << 0x0C) |
        (third << 0x06) | fourth;
      if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
        return codePoint;
      }
    }

    // Unknown lead byte, or a 4-byte value outside the permitted range.
    throw new Error('Invalid UTF-8 detected');
  }
  // Decoder state shared by utf8decode, decodeSymbol and
  // readContinuationByte: the input as an array of numeric units, its length,
  // and the read cursor. All three are reset at the start of every
  // utf8decode call.
  var byteArray, byteCount, byteIndex;

  /**
   * Decodes a UTF-8 byte string (one character per byte) into a JavaScript
   * string.
   *
   * @param {string} byteString - The encoded input.
   * @param {Object} [opts] - Options.
   * @param {boolean} [opts.strict] - Strict mode is on unless this is exactly
   *   `false`; the flag is passed through to `decodeSymbol`.
   * @returns {string} The decoded text.
   * @throws {Error} Propagated from `decodeSymbol` on malformed input.
   */
  function utf8decode(byteString, opts) {
    var options = opts || {};
    var strict = options.strict !== false;

    byteArray = ucs2decode(byteString);
    byteCount = byteArray.length;
    byteIndex = 0;

    var codePoints = [];
    // decodeSymbol signals end of input with `false`; a decoded 0 is a
    // legitimate code point, hence the strict comparison.
    for (var symbol = decodeSymbol(strict); symbol !== false; symbol = decodeSymbol(strict)) {
      codePoints.push(symbol);
    }
    return ucs2encode(codePoints);
  }
  // Public API of the module: the library version string plus the two codec
  // entry points.
  var utf8Api = {};
  utf8Api.version = '2.1.2';
  utf8Api.encode = utf8encode;
  utf8Api.decode = utf8decode;
  module.exports = utf8Api;