lexer.js 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631
  1. "use strict";
  2. Object.defineProperty(exports, "__esModule", {
  3. value: true
  4. });
  5. exports.isPunctuatorTokenKind = isPunctuatorTokenKind;
  6. exports.Lexer = void 0;
  7. var _syntaxError = require("../error/syntaxError");
  8. var _ast = require("./ast");
  9. var _tokenKind = require("./tokenKind");
  10. var _blockString = require("./blockString");
  /**
   * Stateful token stream over a Source object.
   *
   * Every call to `advance()` moves the lexer to the next non-ignored token and
   * returns it. Provided the source lexes, the last token produced is of kind
   * EOF; once EOF is the current token, further calls keep returning that same
   * token.
   */
  var Lexer = /*#__PURE__*/function () {
    /**
     * @param source  Source object to tokenize; stored unchanged on the lexer.
     */
    function Lexer(source) {
      var sof = new _ast.Token(_tokenKind.TokenKind.SOF, 0, 0, 0, 0, null);
      this.source = source;
      // The previously focused non-ignored token.
      this.lastToken = sof;
      // The currently focused non-ignored token.
      this.token = sof;
      // The (1-indexed) line containing the current token.
      this.line = 1;
      // The character offset at which the current line begins.
      this.lineStart = 0;
    }

    var proto = Lexer.prototype;

    /**
     * Advances the token stream to the next non-ignored token and returns it.
     * The token that was current before the call becomes `lastToken`.
     */
    proto.advance = function advance() {
      this.lastToken = this.token;
      var upcoming = this.lookahead();
      this.token = upcoming;
      return upcoming;
    };

    /**
     * Returns the next non-ignored token without changing `token` or
     * `lastToken`. Tokens are linked through their `next` property, so a token
     * that has already been read is reused instead of being lexed again.
     */
    proto.lookahead = function lookahead() {
      var cursor = this.token;
      if (cursor.kind === _tokenKind.TokenKind.EOF) {
        return cursor;
      }
      for (;;) {
        var following = cursor.next;
        if (following === null || following === undefined) {
          // Not lexed yet: read it and remember it on the preceding token.
          following = readToken(this, cursor);
          cursor.next = following;
        }
        cursor = following;
        // Comments are ignored tokens; keep going until something else shows up.
        if (cursor.kind !== _tokenKind.TokenKind.COMMENT) {
          return cursor;
        }
      }
    };

    return Lexer;
  }();
  67. /**
  68. * @internal
  69. */
  70. exports.Lexer = Lexer;
  /**
   * Tells whether the given token kind is one of the punctuator kinds.
   *
   * @param kind  A token kind value, compared strictly against TokenKind members.
   * @returns `true` for a punctuator kind, `false` for anything else.
   */
  function isPunctuatorTokenKind(kind) {
    var kinds = _tokenKind.TokenKind;
    switch (kind) {
      case kinds.BANG:
      case kinds.DOLLAR:
      case kinds.AMP:
      case kinds.PAREN_L:
      case kinds.PAREN_R:
      case kinds.SPREAD:
      case kinds.COLON:
      case kinds.EQUALS:
      case kinds.AT:
      case kinds.BRACKET_L:
      case kinds.BRACKET_R:
      case kinds.BRACE_L:
      case kinds.PIPE:
      case kinds.BRACE_R:
        return true;
      default:
        return false;
    }
  }
  /**
   * Renders a character code for use inside an error message.
   *
   * @param code  A UTF-16 code unit, or NaN/undefined for a read past the end
   *              of the body.
   * @returns The EOF token kind for NaN/undefined, the JSON-quoted character
   *          for codes below 0x7F, and a quoted `\uXXXX` escape otherwise.
   */
  function printCharCode(code) {
    // NaN/undefined represents access beyond the end of the file.
    if (isNaN(code)) {
      return _tokenKind.TokenKind.EOF;
    }
    // Trust JSON for ASCII.
    if (code < 0x007f) {
      return JSON.stringify(String.fromCharCode(code));
    }
    // Otherwise print the escaped form, zero-padded to four hex digits.
    var hex = code.toString(16).toUpperCase();
    return '"\\u' + ('00' + hex).slice(-4) + '"';
  }
  /**
   * Produces the token that follows `prev` in the lexer's source.
   *
   * Ignored characters after `prev.end` are skipped first (which may update the
   * lexer's line bookkeeping). Punctuators are built right here; names,
   * numbers, comments and strings are handed to their dedicated readers.
   *
   * @param lexer  The lexer whose `source`, `line` and `lineStart` are used.
   * @param prev   The token preceding the one to read.
   * @returns The next token, or an EOF token when the body is exhausted.
   * @throws A syntax error when the next character cannot start any token.
   */
  function readToken(lexer, prev) {
    var source = lexer.source;
    var body = source.body;
    var length = body.length;
    var pos = positionAfterWhitespace(body, prev.end, lexer);
    // Read the line bookkeeping only after the skip above, which may move it.
    var line = lexer.line;
    var col = 1 + pos - lexer.lineStart;
    var kinds = _tokenKind.TokenKind;

    // Builds a fixed-width punctuator token at the current position.
    function punctuator(kind, width) {
      return new _ast.Token(kind, pos, pos + width, line, col, prev);
    }

    if (pos >= length) {
      return new _ast.Token(kinds.EOF, length, length, line, col, prev);
    }

    var code = body.charCodeAt(pos); // SourceCharacter

    // A-Z _ a-z
    if (code === 95 || code >= 65 && code <= 90 || code >= 97 && code <= 122) {
      return readName(source, pos, line, col, prev);
    }

    // - 0-9
    if (code === 45 || code >= 48 && code <= 57) {
      return readNumber(source, pos, code, line, col, prev);
    }

    switch (code) {
      case 33: // !
        return punctuator(kinds.BANG, 1);
      case 34: // "
        if (body.charCodeAt(pos + 1) === 34 && body.charCodeAt(pos + 2) === 34) {
          return readBlockString(source, pos, line, col, prev, lexer);
        }
        return readString(source, pos, line, col, prev);
      case 35: // #
        return readComment(source, pos, line, col, prev);
      case 36: // $
        return punctuator(kinds.DOLLAR, 1);
      case 38: // &
        return punctuator(kinds.AMP, 1);
      case 40: // (
        return punctuator(kinds.PAREN_L, 1);
      case 41: // )
        return punctuator(kinds.PAREN_R, 1);
      case 46: // .
        // Only a full "..." is a token; a lone dot falls through to the error.
        if (body.charCodeAt(pos + 1) === 46 && body.charCodeAt(pos + 2) === 46) {
          return punctuator(kinds.SPREAD, 3);
        }
        break;
      case 58: // :
        return punctuator(kinds.COLON, 1);
      case 61: // =
        return punctuator(kinds.EQUALS, 1);
      case 64: // @
        return punctuator(kinds.AT, 1);
      case 91: // [
        return punctuator(kinds.BRACKET_L, 1);
      case 93: // ]
        return punctuator(kinds.BRACKET_R, 1);
      case 123: // {
        return punctuator(kinds.BRACE_L, 1);
      case 124: // |
        return punctuator(kinds.PIPE, 1);
      case 125: // }
        return punctuator(kinds.BRACE_R, 1);
    }

    throw (0, _syntaxError.syntaxError)(source, pos, unexpectedCharacterMessage(code));
  }
  /**
   * Builds the error text for a character that cannot start any token.
   *
   * @param code  The offending character code.
   * @returns A message tailored to control characters, to a single quote, or
   *          to any other unexpected character.
   */
  function unexpectedCharacterMessage(code) {
    var isPermittedControl = code === 0x0009 || code === 0x000a || code === 0x000d;

    // Control characters other than tab, line feed and carriage return.
    if (code < 0x0020 && !isPermittedControl) {
      return 'Cannot contain the invalid character ' + printCharCode(code) + '.';
    }

    // ' -- a likely attempt at a single-quoted string.
    if (code === 39) {
      return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
    }

    return 'Cannot parse the unexpected character ' + printCharCode(code) + '.';
  }
  /**
   * Skips ignored characters and reports where the next lexable one sits.
   *
   * Tabs, spaces, commas and the BOM are stepped over. Each line terminator
   * (LF, CR, or CR LF counted once) additionally increments `lexer.line` and
   * moves `lexer.lineStart` to the offset just past it.
   *
   * @param body           The text being lexed.
   * @param startPosition  Offset at which to start scanning.
   * @param lexer          Object whose `line` / `lineStart` are updated.
   * @returns Offset of the first non-ignored character, or the offset where
   *          scanning ran off the end of `body`.
   */
  function positionAfterWhitespace(body, startPosition, lexer) {
    var end = body.length;
    var cursor = startPosition;

    while (cursor < end) {
      switch (body.charCodeAt(cursor)) {
        case 9: // tab
        case 32: // space
        case 44: // comma
        case 0xfeff: // BOM
          ++cursor;
          break;

        case 10: // new line
          ++cursor;
          ++lexer.line;
          lexer.lineStart = cursor;
          break;

        case 13: // carriage return, optionally followed by a new line
          cursor += body.charCodeAt(cursor + 1) === 10 ? 2 : 1;
          ++lexer.line;
          lexer.lineStart = cursor;
          break;

        default:
          return cursor;
      }
    }

    return cursor;
  }
  /**
   * Reads a comment token; `start` is the offset of its `#`.
   *
   * #[\u0009\u0020-\uFFFF]*
   *
   * The comment runs until the end of the body or the first character below
   * U+0020 other than a tab. The token value is the text after the `#`.
   */
  function readComment(source, start, line, col, prev) {
    var body = source.body;
    var end = start + 1;
    var code = body.charCodeAt(end);

    // SourceCharacter but not LineTerminator
    while (!isNaN(code) && (code > 0x001f || code === 0x0009)) {
      end += 1;
      code = body.charCodeAt(end);
    }

    var text = body.slice(start + 1, end);
    return new _ast.Token(_tokenKind.TokenKind.COMMENT, start, end, line, col, prev, text);
  }
  /**
   * Reads an Int or Float token beginning at `start`. The token is a Float as
   * soon as a fractional part or an exponent is present.
   *
   * Int:   -?(0|[1-9][0-9]*)
   * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
   *
   * @param firstCode  Character code found at `start`.
   * @throws A syntax error for a digit right after a leading 0, for a missing
   *         digit (raised by readDigits), or when the number is directly
   *         followed by `.` or a name-start character.
   */
  function readNumber(source, start, firstCode, line, col, prev) {
    var text = source.body;
    var at = start;
    var ch = firstCode;
    var sawFractionOrExponent = false;

    // Optional sign.
    if (ch === 45) {
      at += 1;
      ch = text.charCodeAt(at);
    }

    // Integer part: either a lone 0 or a run of digits.
    if (ch !== 48) {
      at = readDigits(source, at, ch);
      ch = text.charCodeAt(at);
    } else {
      at += 1;
      ch = text.charCodeAt(at);
      if (ch >= 48 && ch <= 57) {
        throw (0, _syntaxError.syntaxError)(source, at, 'Invalid number, unexpected digit after 0: ' + printCharCode(ch) + '.');
      }
    }

    // Fractional part.
    if (ch === 46) {
      sawFractionOrExponent = true;
      at += 1;
      ch = text.charCodeAt(at);
      at = readDigits(source, at, ch);
      ch = text.charCodeAt(at);
    }

    // Exponent part: E or e, an optional sign, then digits.
    if (ch === 69 || ch === 101) {
      sawFractionOrExponent = true;
      at += 1;
      ch = text.charCodeAt(at);
      if (ch === 43 || ch === 45) {
        at += 1;
        ch = text.charCodeAt(at);
      }
      at = readDigits(source, at, ch);
      ch = text.charCodeAt(at);
    }

    // Numbers cannot be followed by . or NameStart
    if (ch === 46 || isNameStart(ch)) {
      throw (0, _syntaxError.syntaxError)(source, at, 'Invalid number, expected digit but got: ' + printCharCode(ch) + '.');
    }

    var kind = sawFractionOrExponent ? _tokenKind.TokenKind.FLOAT : _tokenKind.TokenKind.INT;
    return new _ast.Token(kind, start, at, line, col, prev, text.slice(start, at));
  }
  /**
   * Consumes a run of one or more decimal digits.
   *
   * @param source     Source whose `body` is scanned.
   * @param start      Offset of the first expected digit.
   * @param firstCode  Character code found at `start`.
   * @returns The offset just past the last digit of the run.
   * @throws A syntax error at `start` when `firstCode` is not a digit.
   */
  function readDigits(source, start, firstCode) {
    var body = source.body;

    var isDigit = function (code) {
      return code >= 48 && code <= 57; // 0 - 9
    };

    if (!isDigit(firstCode)) {
      throw (0, _syntaxError.syntaxError)(source, start, 'Invalid number, expected digit but got: ' + printCharCode(firstCode) + '.');
    }

    var end = start + 1;
    while (isDigit(body.charCodeAt(end))) {
      end += 1;
    }
    return end;
  }
  /**
   * Reads a single-line string token; `start` is the offset of the opening
   * quote.
   *
   * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
   *
   * Unescaped text is copied in chunks; each escape sequence is decoded and
   * appended to the token value.
   *
   * @param source  Source whose `body` is scanned.
   * @param start   Offset of the opening `"`.
   * @param line    Line recorded on the token.
   * @param col     Column recorded on the token.
   * @param prev    Preceding token recorded on the token.
   * @returns A STRING token spanning both quotes, whose value is the decoded
   *          content.
   * @throws A syntax error for a control character other than tab, for an
   *         unknown or malformed escape sequence, and when the body ends or a
   *         line terminator appears before the closing quote.
   */
  function readString(source, start, line, col, prev) {
    var body = source.body;
    var position = start + 1;
    var chunkStart = position;
    var code = 0;
    var value = '';

    while (position < body.length && !isNaN(code = body.charCodeAt(position)) && // not LineTerminator
    code !== 0x000a && code !== 0x000d) {
      // Closing Quote (")
      if (code === 34) {
        value += body.slice(chunkStart, position);
        return new _ast.Token(_tokenKind.TokenKind.STRING, start, position + 1, line, col, prev, value);
      }

      // SourceCharacter
      if (code < 0x0020 && code !== 0x0009) {
        throw (0, _syntaxError.syntaxError)(source, position, 'Invalid character within String: ' + printCharCode(code) + '.');
      }

      ++position;
      if (code !== 92) {
        continue;
      }

      // Backslash: flush the literal text before it, then decode the escape.
      value += body.slice(chunkStart, position - 1);
      code = body.charCodeAt(position);
      switch (code) {
        case 34:
          value += '"';
          break;
        case 47:
          value += '/';
          break;
        case 92:
          value += '\\';
          break;
        case 98:
          value += '\b';
          break;
        case 102:
          value += '\f';
          break;
        case 110:
          value += '\n';
          break;
        case 114:
          value += '\r';
          break;
        case 116:
          value += '\t';
          break;
        case 117:
          {
            // uXXXX
            var charCode = uniCharCode(body.charCodeAt(position + 1), body.charCodeAt(position + 2), body.charCodeAt(position + 3), body.charCodeAt(position + 4));
            if (charCode < 0) {
              var invalidSequence = body.slice(position + 1, position + 5);
              throw (0, _syntaxError.syntaxError)(source, position, 'Invalid character escape sequence: \\u' + invalidSequence + '.');
            }
            value += String.fromCharCode(charCode);
            position += 4;
            break;
          }
        default:
          // `code` is NaN when the backslash is the last character of the body.
          // String.fromCharCode(NaN) yields U+0000, which would put a NUL into
          // the message, so print just the backslash in that case.
          throw (0, _syntaxError.syntaxError)(source, position, 'Invalid character escape sequence: \\' + (isNaN(code) ? '' : String.fromCharCode(code)) + '.');
      }
      ++position;
      chunkStart = position;
    }

    throw (0, _syntaxError.syntaxError)(source, position, 'Unterminated string.');
  }
  /**
   * Reads a block string token; `start` is the offset of the opening `"""`.
   *
   * """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
   *
   * Line terminators inside the block string advance `lexer.line` and
   * `lexer.lineStart`. An escaped triple quote (\""") contributes a literal
   * `"""` to the raw text. The raw text is passed through
   * dedentBlockStringValue to obtain the token value.
   *
   * @throws A syntax error for a control character other than tab, LF or CR,
   *         and when the body ends before the closing triple quote.
   */
  function readBlockString(source, start, line, col, prev, lexer) {
    var body = source.body;
    var cursor = start + 3;
    var segmentStart = cursor;
    var ch = 0;
    var raw = '';

    while (cursor < body.length) {
      ch = body.charCodeAt(cursor);
      if (isNaN(ch)) {
        break;
      }

      // Closing Triple-Quote (""")
      if (ch === 34 && body.charCodeAt(cursor + 1) === 34 && body.charCodeAt(cursor + 2) === 34) {
        raw += body.slice(segmentStart, cursor);
        return new _ast.Token(_tokenKind.TokenKind.BLOCK_STRING, start, cursor + 3, line, col, prev, (0, _blockString.dedentBlockStringValue)(raw));
      }

      // SourceCharacter
      if (ch < 0x0020 && ch !== 0x0009 && ch !== 0x000a && ch !== 0x000d) {
        throw (0, _syntaxError.syntaxError)(source, cursor, 'Invalid character within String: ' + printCharCode(ch) + '.');
      }

      switch (ch) {
        case 10: // new line
          ++cursor;
          ++lexer.line;
          lexer.lineStart = cursor;
          break;

        case 13: // carriage return, optionally followed by a new line
          cursor += body.charCodeAt(cursor + 1) === 10 ? 2 : 1;
          ++lexer.line;
          lexer.lineStart = cursor;
          break;

        case 92: // backslash
          if (body.charCodeAt(cursor + 1) === 34 && body.charCodeAt(cursor + 2) === 34 && body.charCodeAt(cursor + 3) === 34) {
            // Escape Triple-Quote (\"""): keep the quotes, drop the backslash.
            raw += body.slice(segmentStart, cursor) + '"""';
            cursor += 4;
            segmentStart = cursor;
          } else {
            ++cursor;
          }
          break;

        default:
          ++cursor;
      }
    }

    throw (0, _syntaxError.syntaxError)(source, cursor, 'Unterminated string.');
  }
  /**
   * Combines the character codes of four hexadecimal digits into the integer
   * they spell. For example, the codes of "000f" give 15 and the codes of
   * "00ff" give 255.
   *
   * Returns a negative number if any of the four is not a hexadecimal digit:
   * char2hex() returns -1 for such input, and OR-ing -1 (shifted or not) into
   * the result leaves it negative.
   */
  function uniCharCode(a, b, c, d) {
    var nibble3 = char2hex(a) << 12;
    var nibble2 = char2hex(b) << 8;
    var nibble1 = char2hex(c) << 4;
    var nibble0 = char2hex(d);
    return nibble3 | nibble2 | nibble1 | nibble0;
  }
  /**
   * Maps the character code of a hexadecimal digit to its numeric value.
   * '0'..'9' give 0..9, 'A'..'F' give 10..15, 'a'..'f' give 10..15.
   *
   * Returns -1 for any other input.
   */
  function char2hex(a) {
    // 0-9
    if (a >= 48 && a <= 57) {
      return a - 48;
    }
    // A-F
    if (a >= 65 && a <= 70) {
      return a - 55;
    }
    // a-f
    if (a >= 97 && a <= 102) {
      return a - 87;
    }
    return -1;
  }
  /**
   * Reads a name token beginning at `start`. The character at `start` is taken
   * as part of the name without being checked here; scanning continues over
   * letters, digits and underscores.
   *
   * [_A-Za-z][_0-9A-Za-z]*
   *
   * @returns A NAME token whose value is the matched text.
   */
  function readName(source, start, line, col, prev) {
    var body = source.body;
    var length = body.length;
    var end = start + 1;

    for (; end !== length; ++end) {
      var code = body.charCodeAt(end);
      if (isNaN(code)) {
        break;
      }
      var isNameContinue = code === 95 || // _
      code >= 48 && code <= 57 || // 0-9
      code >= 65 && code <= 90 || // A-Z
      code >= 97 && code <= 122; // a-z
      if (!isNameContinue) {
        break;
      }
    }

    return new _ast.Token(_tokenKind.TokenKind.NAME, start, end, line, col, prev, body.slice(start, end));
  }
  // _ A-Z a-z
  /**
   * Tells whether a character code may begin a name: `_`, `A`-`Z` or `a`-`z`.
   */
  function isNameStart(code) {
    // _
    if (code === 95) {
      return true;
    }
    // A-Z
    if (code >= 65 && code <= 90) {
      return true;
    }
    // a-z
    return code >= 97 && code <= 122;
  }