lexer.js 17 KB


  1. "use strict";
  2. Object.defineProperty(exports, "__esModule", {
  3. value: true
  4. });
  5. exports.isPunctuatorTokenKind = isPunctuatorTokenKind;
  6. exports.Lexer = void 0;
  7. var _syntaxError = require("../error/syntaxError.js");
  8. var _ast = require("./ast.js");
  9. var _tokenKind = require("./tokenKind.js");
  10. var _blockString = require("./blockString.js");
  /**
   * Stateful token stream over a Source object.
   *
   * Every call to `advance()` moves the lexer to the next non-ignored token of
   * the source and returns it. When the source lexes successfully the last
   * token produced has kind EOF; from then on the same EOF token is returned
   * on every call.
   */
  var Lexer = /*#__PURE__*/function () {
    /**
     * @param {object} source - Source whose `body` is tokenized by `readToken`.
     */
    function Lexer(source) {
      var sofToken = new _ast.Token(_tokenKind.TokenKind.SOF, 0, 0, 0, 0, null);

      this.source = source;
      // The previously focused non-ignored token.
      this.lastToken = sofToken;
      // The currently focused non-ignored token.
      this.token = sofToken;
      // The (1-indexed) line containing the current token.
      this.line = 1;
      // The character offset at which the current line begins.
      this.lineStart = 0;
    }

    /**
     * Advances the token stream to the next non-ignored token.
     *
     * @returns {object} The token that is now focused.
     */
    Lexer.prototype.advance = function advance() {
      this.lastToken = this.token;
      var upcoming = this.lookahead();
      this.token = upcoming;
      return upcoming;
    };

    /**
     * Looks ahead and returns the next non-ignored token without moving
     * `token` or `lastToken`.
     *
     * Tokens are linked through their `next` property: a token that was
     * already read is reused, otherwise `readToken` produces it and the result
     * is stored on the preceding token. COMMENT tokens are skipped.
     *
     * @returns {object} The next non-comment token, or the current token when
     *   it is already EOF.
     */
    Lexer.prototype.lookahead = function lookahead() {
      var cursor = this.token;

      if (cursor.kind === _tokenKind.TokenKind.EOF) {
        return cursor;
      }

      do {
        var following = cursor.next;

        if (following === null || following === undefined) {
          // Note: next is only mutable during parsing.
          following = readToken(this, cursor);
          cursor.next = following;
        }

        cursor = following;
      } while (cursor.kind === _tokenKind.TokenKind.COMMENT);

      return cursor;
    };

    return Lexer;
  }();
  67. /**
  68. * @internal
  69. */
  70. exports.Lexer = Lexer;
  /**
   * Tells whether the given token kind is one of the punctuator kinds.
   *
   * @internal
   * @param {*} kind - Token kind to test.
   * @returns {boolean} True when `kind` is strictly equal to a punctuator kind.
   */
  function isPunctuatorTokenKind(kind) {
    var kinds = _tokenKind.TokenKind;

    switch (kind) {
      case kinds.BANG:
      case kinds.DOLLAR:
      case kinds.AMP:
      case kinds.PAREN_L:
      case kinds.PAREN_R:
      case kinds.SPREAD:
      case kinds.COLON:
      case kinds.EQUALS:
      case kinds.AT:
      case kinds.BRACKET_L:
      case kinds.BRACKET_R:
      case kinds.BRACE_L:
      case kinds.PIPE:
      case kinds.BRACE_R:
        return true;

      default:
        return false;
    }
  }
  /**
   * Renders a character code for use in an error message.
   *
   * @param {number} code - Character code; NaN/undefined stands for a read
   *   beyond the end of the file.
   * @returns {string} The EOF token kind for NaN/undefined, the JSON-quoted
   *   character for codes below 0x007f, otherwise a quoted `\uXXXX` escape.
   */
  function printCharCode(code) {
    // NaN/undefined represents access beyond the end of the file.
    if (isNaN(code)) {
      return _tokenKind.TokenKind.EOF;
    }

    // Trust JSON for ASCII.
    if (code < 0x007f) {
      return JSON.stringify(String.fromCharCode(code));
    }

    // Otherwise print the escaped form, left-padded to four hex digits.
    var hex = code.toString(16).toUpperCase();
    return '"\\u' + ('00' + hex).slice(-4) + '"';
  }
  /**
   * Maps the character code of a single-character punctuator to its token
   * kind. Returns null for every other code.
   */
  function singleCharPunctuatorKind(code) {
    var kinds = _tokenKind.TokenKind;

    switch (code) {
      case 33: // !
        return kinds.BANG;
      case 36: // $
        return kinds.DOLLAR;
      case 38: // &
        return kinds.AMP;
      case 40: // (
        return kinds.PAREN_L;
      case 41: // )
        return kinds.PAREN_R;
      case 58: // :
        return kinds.COLON;
      case 61: // =
        return kinds.EQUALS;
      case 64: // @
        return kinds.AT;
      case 91: // [
        return kinds.BRACKET_L;
      case 93: // ]
        return kinds.BRACKET_R;
      case 123: // {
        return kinds.BRACE_L;
      case 124: // |
        return kinds.PIPE;
      case 125: // }
        return kinds.BRACE_R;
      default:
        return null;
    }
  }

  /**
   * Gets the next token from the source, starting right after `prev`.
   *
   * Ignored characters (BOM, tab, space, comma and line terminators) are
   * skipped; each line terminator bumps `lexer.line` and moves
   * `lexer.lineStart`. Punctuators are lexed in place, everything more
   * involved is delegated to the matching read* helper.
   *
   * @param {object} lexer - Lexer providing `source`, `line` and `lineStart`.
   * @param {object} prev - Previous token; scanning starts at `prev.end`.
   * @returns {object} The next token, or an EOF token once the body is used up.
   * @throws A syntax error when the next character cannot start any token.
   */
  function readToken(lexer, prev) {
    var source = lexer.source;
    var body = source.body;
    var length = body.length;
    var pos = prev.end;

    while (pos < length) {
      var code = body.charCodeAt(pos);
      var line = lexer.line;
      var col = 1 + pos - lexer.lineStart;

      // <BOM>, \t, <space> and , are insignificant.
      if (code === 0xfeff || code === 9 || code === 32 || code === 44) {
        ++pos;
        continue;
      }

      // \n, \r and \r\n each terminate the current line.
      if (code === 10 || code === 13) {
        pos += (code === 13 && body.charCodeAt(pos + 1) === 10) ? 2 : 1;
        ++lexer.line;
        lexer.lineStart = pos;
        continue;
      }

      var punctuator = singleCharPunctuatorKind(code);

      if (punctuator !== null) {
        return new _ast.Token(punctuator, pos, pos + 1, line, col, prev);
      }

      // #
      if (code === 35) {
        return readComment(source, pos, line, col, prev);
      }

      // ... (a lone or doubled dot falls through to the error below)
      if (code === 46 && body.charCodeAt(pos + 1) === 46 && body.charCodeAt(pos + 2) === 46) {
        return new _ast.Token(_tokenKind.TokenKind.SPREAD, pos, pos + 3, line, col, prev);
      }

      // " or """
      if (code === 34) {
        var opensBlock = body.charCodeAt(pos + 1) === 34 && body.charCodeAt(pos + 2) === 34;

        if (opensBlock) {
          return readBlockString(source, pos, line, col, prev, lexer);
        }

        return readString(source, pos, line, col, prev);
      }

      // - or 0-9
      if (code === 45 || (code >= 48 && code <= 57)) {
        return readNumber(source, pos, code, line, col, prev);
      }

      // _ A-Z a-z
      if (code === 95 || (code >= 65 && code <= 90) || (code >= 97 && code <= 122)) {
        return readName(source, pos, line, col, prev);
      }

      throw (0, _syntaxError.syntaxError)(source, pos, unexpectedCharacterMessage(code));
    }

    return new _ast.Token(_tokenKind.TokenKind.EOF, length, length, lexer.line, 1 + pos - lexer.lineStart, prev);
  }
  /**
   * Builds the error message for a character that cannot start a token.
   *
   * @param {number} code - Character code of the offending character.
   * @returns {string} A message specific to control characters, to the single
   *   quote, or a generic one for everything else.
   */
  function unexpectedCharacterMessage(code) {
    // '
    if (code === 39) {
      return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
    }

    // Control characters other than \t, \n and \r.
    var isDisallowedControl = code < 0x0020 && code !== 0x0009 && code !== 0x000a && code !== 0x000d;
    var prefix = isDisallowedControl ? 'Cannot contain the invalid character ' : 'Cannot parse the unexpected character ';

    return prefix + printCharCode(code) + '.';
  }
  /**
   * Reads a comment token from the source file.
   *
   * #[\u0009\u0020-\uFFFF]*
   *
   * @param {object} source - Source whose `body` string is scanned.
   * @param {number} start - Offset of the leading `#`.
   * @param {number} line - Line number recorded on the token.
   * @param {number} col - Column number recorded on the token.
   * @param {object} prev - Previous token recorded on the new token.
   * @returns {object} A COMMENT token whose value is the text after the `#`.
   */
  function readComment(source, start, line, col, prev) {
    var body = source.body;
    var end = start + 1;
    var ch = body.charCodeAt(end);

    // SourceCharacter but not LineTerminator; NaN marks the end of the body.
    while (!isNaN(ch) && (ch > 0x001f || ch === 0x0009)) {
      end += 1;
      ch = body.charCodeAt(end);
    }

    var text = body.slice(start + 1, end);
    return new _ast.Token(_tokenKind.TokenKind.COMMENT, start, end, line, col, prev, text);
  }
  /**
   * Reads a number token from the source file. The token is a float when a
   * fractional part or an exponent is present, otherwise an int.
   *
   * Int: -?(0|[1-9][0-9]*)
   * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
   *
   * @param {object} source - Source whose `body` string is scanned.
   * @param {number} start - Offset of the first character of the number.
   * @param {number} firstCode - Character code found at `start`.
   * @param {number} line - Line number recorded on the token.
   * @param {number} col - Column number recorded on the token.
   * @param {object} prev - Previous token recorded on the new token.
   * @returns {object} An INT or FLOAT token whose value is the raw text.
   * @throws A syntax error for a digit after a leading 0, a missing digit, or
   *   a number directly followed by `.` or a name-start character.
   */
  function readNumber(source, start, firstCode, line, col, prev) {
    var body = source.body;
    var cursor = start;
    var ch = firstCode;
    var sawFractionOrExponent = false;

    // -
    if (ch === 45) {
      cursor += 1;
      ch = body.charCodeAt(cursor);
    }

    if (ch !== 48) {
      cursor = readDigits(source, cursor, ch);
      ch = body.charCodeAt(cursor);
    } else {
      // A leading 0 must stand alone in the integer part.
      cursor += 1;
      ch = body.charCodeAt(cursor);

      if (ch >= 48 && ch <= 57) {
        throw (0, _syntaxError.syntaxError)(source, cursor, 'Invalid number, unexpected digit after 0: ' + printCharCode(ch) + '.');
      }
    }

    // .
    if (ch === 46) {
      sawFractionOrExponent = true;
      cursor += 1;
      ch = body.charCodeAt(cursor);
      cursor = readDigits(source, cursor, ch);
      ch = body.charCodeAt(cursor);
    }

    // E e
    if (ch === 69 || ch === 101) {
      sawFractionOrExponent = true;
      cursor += 1;
      ch = body.charCodeAt(cursor);

      // + -
      if (ch === 43 || ch === 45) {
        cursor += 1;
        ch = body.charCodeAt(cursor);
      }

      cursor = readDigits(source, cursor, ch);
      ch = body.charCodeAt(cursor);
    }

    // Numbers cannot be followed by . or NameStart
    if (ch === 46 || isNameStart(ch)) {
      throw (0, _syntaxError.syntaxError)(source, cursor, 'Invalid number, expected digit but got: ' + printCharCode(ch) + '.');
    }

    var kind = sawFractionOrExponent ? _tokenKind.TokenKind.FLOAT : _tokenKind.TokenKind.INT;
    return new _ast.Token(kind, start, cursor, line, col, prev, body.slice(start, cursor));
  }
  /**
   * Consumes a run of one or more decimal digits.
   *
   * @param {object} source - Source whose `body` string is scanned.
   * @param {number} start - Offset of the first expected digit.
   * @param {number} firstCode - Character code found at `start`.
   * @returns {number} The offset just past the last digit of the run.
   * @throws A syntax error at `start` when `firstCode` is not a digit.
   */
  function readDigits(source, start, firstCode) {
    var body = source.body;
    var startsWithDigit = firstCode >= 48 && firstCode <= 57; // 0 - 9

    if (!startsWithDigit) {
      throw (0, _syntaxError.syntaxError)(source, start, 'Invalid number, expected digit but got: ' + printCharCode(firstCode) + '.');
    }

    var end = start + 1;
    var ch = body.charCodeAt(end);

    // 0 - 9
    while (ch >= 48 && ch <= 57) {
      end += 1;
      ch = body.charCodeAt(end);
    }

    return end;
  }
  /**
   * Decodes the character code following a backslash for the single-character
   * escapes. Returns null when the code is not one of them.
   */
  function simpleEscapeValue(code) {
    switch (code) {
      case 34:
        return '"';
      case 47:
        return '/';
      case 92:
        return '\\';
      case 98:
        return '\b';
      case 102:
        return '\f';
      case 110:
        return '\n';
      case 114:
        return '\r';
      case 116:
        return '\t';
      default:
        return null;
    }
  }

  /**
   * Reads a string token from the source file.
   *
   * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
   *
   * @param {object} source - Source whose `body` string is scanned.
   * @param {number} start - Offset of the opening quote.
   * @param {number} line - Line number recorded on the token.
   * @param {number} col - Column number recorded on the token.
   * @param {object} prev - Previous token recorded on the new token.
   * @returns {object} A STRING token whose value has its escapes decoded.
   * @throws A syntax error for a disallowed control character, an invalid
   *   escape sequence, or a string that is not closed on its line.
   */
  function readString(source, start, line, col, prev) {
    var body = source.body;
    var cursor = start + 1;
    // Start of the text not yet copied into `text`.
    var segmentStart = cursor;
    var text = '';

    while (cursor < body.length) {
      var ch = body.charCodeAt(cursor);

      // A LineTerminator (or a read past the end) leaves the string open.
      if (isNaN(ch) || ch === 0x000a || ch === 0x000d) {
        break;
      }

      // Closing Quote (")
      if (ch === 34) {
        text += body.slice(segmentStart, cursor);
        return new _ast.Token(_tokenKind.TokenKind.STRING, start, cursor + 1, line, col, prev, text);
      }

      // SourceCharacter
      if (ch < 0x0020 && ch !== 0x0009) {
        throw (0, _syntaxError.syntaxError)(source, cursor, 'Invalid character within String: ' + printCharCode(ch) + '.');
      }

      // Anything but a backslash is copied later as part of the segment.
      if (ch !== 92) {
        cursor += 1;
        continue;
      }

      // \ : flush the pending segment, then decode the escape.
      text += body.slice(segmentStart, cursor);
      var escapeAt = cursor + 1;
      var escaped = body.charCodeAt(escapeAt);

      if (escaped === 117) {
        // uXXXX
        var unit = uniCharCode(body.charCodeAt(escapeAt + 1), body.charCodeAt(escapeAt + 2), body.charCodeAt(escapeAt + 3), body.charCodeAt(escapeAt + 4));

        if (unit < 0) {
          var invalidSequence = body.slice(escapeAt + 1, escapeAt + 5);
          throw (0, _syntaxError.syntaxError)(source, escapeAt, 'Invalid character escape sequence: \\u' + invalidSequence + '.');
        }

        text += String.fromCharCode(unit);
        cursor = escapeAt + 5;
      } else {
        var decoded = simpleEscapeValue(escaped);

        if (decoded === null) {
          throw (0, _syntaxError.syntaxError)(source, escapeAt, 'Invalid character escape sequence: \\' + String.fromCharCode(escaped) + '.');
        }

        text += decoded;
        cursor = escapeAt + 1;
      }

      segmentStart = cursor;
    }

    throw (0, _syntaxError.syntaxError)(source, cursor, 'Unterminated string.');
  }
  415. /**
  416. * Reads a block string token from the source file.
  417. *
  418. * """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
  419. */
  420. function readBlockString(source, start, line, col, prev, lexer) {
  421. var body = source.body;
  422. var position = start + 3;
  423. var chunkStart = position;
  424. var code = 0;
  425. var rawValue = '';
  426. while (position < body.length && !isNaN(code = body.charCodeAt(position))) {
  427. // Closing Triple-Quote (""")
  428. if (code === 34 && body.charCodeAt(position + 1) === 34 && body.charCodeAt(position + 2) === 34) {
  429. rawValue += body.slice(chunkStart, position);
  430. return new _ast.Token(_tokenKind.TokenKind.BLOCK_STRING, start, position + 3, line, col, prev, (0, _blockString.dedentBlockStringValue)(rawValue));
  431. } // SourceCharacter
  432. if (code < 0x0020 && code !== 0x0009 && code !== 0x000a && code !== 0x000d) {
  433. throw (0, _syntaxError.syntaxError)(source, position, "Invalid character within String: ".concat(printCharCode(code), "."));
  434. }
  435. if (code === 10) {
  436. // new line
  437. ++position;
  438. ++lexer.line;
  439. lexer.lineStart = position;
  440. } else if (code === 13) {
  441. // carriage return
  442. if (body.charCodeAt(position + 1) === 10) {
  443. position += 2;
  444. } else {
  445. ++position;
  446. }
  447. ++lexer.line;
  448. lexer.lineStart = position;
  449. } else if ( // Escape Triple-Quote (\""")
  450. code === 92 && body.charCodeAt(position + 1) === 34 && body.charCodeAt(position + 2) === 34 && body.charCodeAt(position + 3) === 34) {
  451. rawValue += body.slice(chunkStart, position) + '"""';
  452. position += 4;
  453. chunkStart = position;
  454. } else {
  455. ++position;
  456. }
  457. }
  458. throw (0, _syntaxError.syntaxError)(source, position, 'Unterminated string.');
  459. }
  /**
   * Combines the character codes of four hexadecimal digits into the integer
   * they spell, most significant digit first. For example the codes of
   * '0','0','0','f' give 15 and those of '0','0','f','f' give 255.
   *
   * char2hex() yields -1 for a code that is not a hex digit, and ORing a
   * shifted -1 into the result keeps it negative, so a negative return value
   * signals that at least one digit was invalid.
   *
   * @param {number} a - Character code of the first (highest) digit.
   * @param {number} b - Character code of the second digit.
   * @param {number} c - Character code of the third digit.
   * @param {number} d - Character code of the fourth (lowest) digit.
   * @returns {number} The combined value, or a negative number on error.
   */
  function uniCharCode(a, b, c, d) {
    var nibble3 = char2hex(a) << 12;
    var nibble2 = char2hex(b) << 8;
    var nibble1 = char2hex(c) << 4;
    var nibble0 = char2hex(d);

    return nibble3 | nibble2 | nibble1 | nibble0;
  }
  /**
   * Converts the character code of a hexadecimal digit to its numeric value.
   * The codes of '0'..'9' map to 0..9, and those of 'A'..'F' and 'a'..'f'
   * map to 10..15.
   *
   * @param {number} a - Character code to convert.
   * @returns {number} The digit value, or -1 when `a` is not a hex digit.
   */
  function char2hex(a) {
    // 0-9
    if (a >= 48 && a <= 57) {
      return a - 48;
    }

    // A-F
    if (a >= 65 && a <= 70) {
      return a - 55;
    }

    // a-f
    if (a >= 97 && a <= 102) {
      return a - 87;
    }

    return -1;
  }
  /**
   * Reads an alphanumeric + underscore name from the source.
   *
   * [_A-Za-z][_0-9A-Za-z]*
   *
   * The character at `start` is taken as part of the name without being
   * checked here; only the characters after it are tested.
   *
   * @param {object} source - Source whose `body` string is scanned.
   * @param {number} start - Offset of the first character of the name.
   * @param {number} line - Line number recorded on the token.
   * @param {number} col - Column number recorded on the token.
   * @param {object} prev - Previous token recorded on the new token.
   * @returns {object} A NAME token whose value is the name text.
   */
  function readName(source, start, line, col, prev) {
    var body = source.body;
    var limit = body.length;
    var end = start + 1;

    while (end !== limit) {
      var ch = body.charCodeAt(end);
      var continuesName =
        ch === 95 || // _
        (ch >= 48 && ch <= 57) || // 0-9
        (ch >= 65 && ch <= 90) || // A-Z
        (ch >= 97 && ch <= 122); // a-z

      // NaN (a read past the end) fails every comparison above as well.
      if (!continuesName) {
        break;
      }

      end += 1;
    }

    return new _ast.Token(_tokenKind.TokenKind.NAME, start, end, line, col, prev, body.slice(start, end));
  }
  /**
   * Tells whether a character code may begin a name: _ A-Z a-z
   *
   * @param {number} code - Character code to test.
   * @returns {boolean} True for an underscore or an ASCII letter.
   */
  function isNameStart(code) {
    // _
    if (code === 95) {
      return true;
    }

    var isUpperCase = code >= 65 && code <= 90; // A-Z
    var isLowerCase = code >= 97 && code <= 122; // a-z

    return isUpperCase || isLowerCase;
  }