lexer.mjs 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676
  1. import { syntaxError } from "../error/syntaxError.mjs";
  2. import { Token } from "./ast.mjs";
  3. import { TokenKind } from "./tokenKind.mjs";
  4. import { dedentBlockStringValue } from "./blockString.mjs";
  /**
   * A stateful token stream over a Source object.
   *
   * Each call to `advance()` moves the lexer to the next non-ignored token of
   * the source and returns it. Once a token of kind EOF has been reached, the
   * lexer keeps returning that same EOF token on every further call.
   */
  export var Lexer = /*#__PURE__*/ (function () {
    /**
     * @param source - object whose `body` string is lexed on demand.
     */
    function Lexer(source) {
      const startOfFileToken = new Token(TokenKind.SOF, 0, 0, 0, 0, null);

      this.source = source;
      // The previously focused non-ignored token.
      this.lastToken = startOfFileToken;
      // The currently focused non-ignored token.
      this.token = startOfFileToken;
      // The (1-indexed) line containing the current token.
      this.line = 1;
      // The character offset at which the current line begins.
      this.lineStart = 0;
    }

    /**
     * Advances the token stream to the next non-ignored token and returns it.
     */
    Lexer.prototype.advance = function advance() {
      // lastToken is updated first, so it is already moved even when the
      // lookahead below throws.
      this.lastToken = this.token;
      this.token = this.lookahead();
      return this.token;
    };

    /**
     * Returns the next non-ignored token without changing `token` or
     * `lastToken`. Tokens are read lazily and cached on `next` of the token
     * preceding them, so repeated lookaheads reuse the same Token objects.
     */
    Lexer.prototype.lookahead = function lookahead() {
      let current = this.token;
      if (current.kind === TokenKind.EOF) {
        return current;
      }
      do {
        // Read the following token only when it has not been linked yet.
        if (current.next === null || current.next === undefined) {
          current.next = readToken(this, current);
        }
        current = current.next;
      } while (current.kind === TokenKind.COMMENT);
      return current;
    };

    return Lexer;
  })();
  /**
   * Tells whether the given token kind is one of the punctuator kinds.
   *
   * @internal
   */
  export function isPunctuatorTokenKind(kind) {
    // `switch` compares with strict equality, like a chain of `===` tests.
    switch (kind) {
      case TokenKind.BANG:
      case TokenKind.DOLLAR:
      case TokenKind.AMP:
      case TokenKind.PAREN_L:
      case TokenKind.PAREN_R:
      case TokenKind.SPREAD:
      case TokenKind.COLON:
      case TokenKind.EQUALS:
      case TokenKind.AT:
      case TokenKind.BRACKET_L:
      case TokenKind.BRACKET_R:
      case TokenKind.BRACE_L:
      case TokenKind.PIPE:
      case TokenKind.BRACE_R:
        return true;
      default:
        return false;
    }
  }
  /**
   * Renders a UTF-16 code unit for use in an error message.
   *
   * - NaN/undefined (a read beyond the end of the file) prints as the EOF kind.
   * - Codes below 0x7F print as the JSON-quoted character.
   * - Everything else prints as a quoted, upper-case `\uXXXX` escape.
   */
  function printCharCode(code) {
    if (isNaN(code)) {
      return TokenKind.EOF;
    }
    if (code < 0x007f) {
      // Trust JSON for ASCII.
      return JSON.stringify(String.fromCharCode(code));
    }
    const hex = code.toString(16).toUpperCase();
    const padded = `00${hex}`.slice(-4);
    return `"\\u${padded}"`;
  }
  /**
   * Maps the char code of a single-character punctuator to its token kind.
   * Returns undefined for every other code.
   */
  function punctuatorKindForCode(code) {
    switch (code) {
      case 33: // !
        return TokenKind.BANG;
      case 36: // $
        return TokenKind.DOLLAR;
      case 38: // &
        return TokenKind.AMP;
      case 40: // (
        return TokenKind.PAREN_L;
      case 41: // )
        return TokenKind.PAREN_R;
      case 58: // :
        return TokenKind.COLON;
      case 61: // =
        return TokenKind.EQUALS;
      case 64: // @
        return TokenKind.AT;
      case 91: // [
        return TokenKind.BRACKET_L;
      case 93: // ]
        return TokenKind.BRACKET_R;
      case 123: // {
        return TokenKind.BRACE_L;
      case 124: // |
        return TokenKind.PIPE;
      case 125: // }
        return TokenKind.BRACE_R;
      default:
        return undefined;
    }
  }

  /**
   * Reads the token that follows `prev` in the lexer's source.
   *
   * Scanning starts at `prev.end`. BOM, tab, space and comma are skipped;
   * line terminators are skipped too and update `lexer.line` and
   * `lexer.lineStart`. Punctuators are turned into tokens right here, while
   * comments, strings, numbers and names are delegated to their reader
   * functions. When the end of the body is reached an EOF token is returned.
   *
   * Throws the error built by `syntaxError` when the next character cannot
   * start any token.
   */
  function readToken(lexer, prev) {
    const source = lexer.source;
    const body = source.body;
    const end = body.length;
    let pos = prev.end;

    while (pos < end) {
      const code = body.charCodeAt(pos);

      // Ignored characters: <BOM>, \t, <space> and ","
      if (code === 0xfeff || code === 9 || code === 32 || code === 44) {
        pos += 1;
        continue;
      }

      // Line terminators: \n, \r and \r\n (the pair counts as one line).
      if (code === 10 || code === 13) {
        pos += code === 13 && body.charCodeAt(pos + 1) === 10 ? 2 : 1;
        lexer.line += 1;
        lexer.lineStart = pos;
        continue;
      }

      const line = lexer.line;
      const column = 1 + pos - lexer.lineStart;

      const punctuator = punctuatorKindForCode(code);
      if (punctuator !== undefined) {
        return new Token(punctuator, pos, pos + 1, line, column, prev);
      }

      if (code === 35) {
        // #
        return readComment(source, pos, line, column, prev);
      }

      if (code === 34) {
        // " - three quotes in a row open a block string.
        const opensBlock =
          body.charCodeAt(pos + 1) === 34 && body.charCodeAt(pos + 2) === 34;
        return opensBlock
          ? readBlockString(source, pos, line, column, prev, lexer)
          : readString(source, pos, line, column, prev);
      }

      if (
        code === 46 && // .
        body.charCodeAt(pos + 1) === 46 &&
        body.charCodeAt(pos + 2) === 46
      ) {
        return new Token(TokenKind.SPREAD, pos, pos + 3, line, column, prev);
      }

      if (code === 45 || (code >= 48 && code <= 57)) {
        // - or 0-9
        return readNumber(source, pos, code, line, column, prev);
      }

      if (
        code === 95 || // _
        (code >= 65 && code <= 90) || // A-Z
        (code >= 97 && code <= 122) // a-z
      ) {
        return readName(source, pos, line, column, prev);
      }

      // Anything else, including a "." that does not start "...".
      throw syntaxError(source, pos, unexpectedCharacterMessage(code));
    }

    return new Token(
      TokenKind.EOF,
      end,
      end,
      lexer.line,
      1 + pos - lexer.lineStart,
      prev,
    );
  }
  /**
   * Builds the error message for a character that cannot start any token.
   *
   * Control characters other than tab, line feed and carriage return are
   * reported as invalid; a single quote gets a hint about double quotes;
   * every other character is reported as unexpected.
   */
  function unexpectedCharacterMessage(code) {
    const isAllowedControl =
      code === 0x0009 || code === 0x000a || code === 0x000d;

    if (code < 0x0020 && !isAllowedControl) {
      return `Cannot contain the invalid character ${printCharCode(code)}.`;
    }

    // '
    if (code === 39) {
      return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
    }

    return `Cannot parse the unexpected character ${printCharCode(code)}.`;
  }
  /**
   * Reads a comment token that begins with the "#" at `start`.
   *
   * #[\u0009\u0020-\uFFFF]*
   *
   * The comment runs up to (not including) the first control character other
   * than tab, or to the end of the body. The token value is the comment text
   * without the leading "#".
   */
  function readComment(source, start, line, col, prev) {
    const body = source.body;
    let end = start + 1;

    for (;;) {
      const code = body.charCodeAt(end);
      // Stop at the end of the body or at anything that is not a
      // SourceCharacter other than a LineTerminator.
      const continues = !isNaN(code) && (code > 0x001f || code === 0x0009);
      if (!continues) {
        break;
      }
      end += 1;
    }

    const text = body.slice(start + 1, end);
    return new Token(TokenKind.COMMENT, start, end, line, col, prev, text);
  }
  /**
   * Reads a number token starting at `start`; `firstCode` is the char code
   * found there. The token is a FLOAT when a fraction or an exponent is
   * present and an INT otherwise, and its value is the raw source text.
   *
   * Int:   -?(0|[1-9][0-9]*)
   * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
   *
   * Throws the error built by `syntaxError` when a digit follows a leading
   * zero, when an expected digit is missing, or when the number is directly
   * followed by "." or a name-start character.
   */
  function readNumber(source, start, firstCode, line, col, prev) {
    const body = source.body;
    let cursor = start;
    let current = firstCode;
    let sawFloatPart = false;

    // Optional sign.
    if (current === 45) {
      cursor += 1;
      current = body.charCodeAt(cursor);
    }

    // Integer part: a lone zero, or a run of digits.
    if (current === 48) {
      cursor += 1;
      current = body.charCodeAt(cursor);
      if (current >= 48 && current <= 57) {
        throw syntaxError(
          source,
          cursor,
          `Invalid number, unexpected digit after 0: ${printCharCode(current)}.`,
        );
      }
    } else {
      cursor = readDigits(source, cursor, current);
      current = body.charCodeAt(cursor);
    }

    // Optional fraction: "." followed by digits.
    if (current === 46) {
      sawFloatPart = true;
      cursor += 1;
      current = body.charCodeAt(cursor);
      cursor = readDigits(source, cursor, current);
      current = body.charCodeAt(cursor);
    }

    // Optional exponent: E or e, an optional sign, then digits.
    if (current === 69 || current === 101) {
      sawFloatPart = true;
      cursor += 1;
      current = body.charCodeAt(cursor);
      if (current === 43 || current === 45) {
        cursor += 1;
        current = body.charCodeAt(cursor);
      }
      cursor = readDigits(source, cursor, current);
      current = body.charCodeAt(cursor);
    }

    // Numbers cannot be followed by . or NameStart
    if (current === 46 || isNameStart(current)) {
      throw syntaxError(
        source,
        cursor,
        `Invalid number, expected digit but got: ${printCharCode(current)}.`,
      );
    }

    const kind = sawFloatPart ? TokenKind.FLOAT : TokenKind.INT;
    return new Token(kind, start, cursor, line, col, prev, body.slice(start, cursor));
  }
  /**
   * Skips a run of decimal digits and returns the position right after it.
   *
   * `firstCode` is taken as the char code at `start`. When it is not a digit,
   * the error built by `syntaxError` is thrown for position `start`.
   */
  function readDigits(source, start, firstCode) {
    const body = source.body;
    const startsWithDigit = firstCode >= 48 && firstCode <= 57; // 0 - 9

    if (!startsWithDigit) {
      throw syntaxError(
        source,
        start,
        `Invalid number, expected digit but got: ${printCharCode(firstCode)}.`,
      );
    }

    let position = start + 1;
    for (;;) {
      const code = body.charCodeAt(position);
      if (!(code >= 48 && code <= 57)) {
        return position;
      }
      position += 1;
    }
  }
  /**
   * Reads a string token that begins with the double quote at `start`.
   *
   * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
   *
   * The token spans both quotes; its value is the string content with every
   * escape sequence replaced by the character it denotes.
   *
   * Throws the error built by `syntaxError` when:
   * - the string holds a control character other than tab,
   * - a backslash is followed by an unsupported escape character or by an
   *   invalid \uXXXX sequence, or
   * - a line terminator or the end of the body comes before the closing quote.
   */
  function readString(source, start, line, col, prev) {
    const body = source.body;
    let position = start + 1;
    // Start of the not yet copied run of plain characters.
    let chunkStart = position;
    let code = 0;
    let value = '';

    while (
      position < body.length &&
      !isNaN((code = body.charCodeAt(position))) &&
      // not LineTerminator
      code !== 0x000a &&
      code !== 0x000d
    ) {
      // Closing Quote (")
      if (code === 34) {
        value += body.slice(chunkStart, position);
        return new Token(TokenKind.STRING, start, position + 1, line, col, prev, value);
      }

      // SourceCharacter
      if (code < 0x0020 && code !== 0x0009) {
        throw syntaxError(
          source,
          position,
          `Invalid character within String: ${printCharCode(code)}.`,
        );
      }

      ++position;
      if (code === 92) {
        // \ - flush the plain run before it, then decode the escape.
        value += body.slice(chunkStart, position - 1);
        code = body.charCodeAt(position);
        switch (code) {
          case 34:
            value += '"';
            break;
          case 47:
            value += '/';
            break;
          case 92:
            value += '\\';
            break;
          case 98:
            value += '\b';
            break;
          case 102:
            value += '\f';
            break;
          case 110:
            value += '\n';
            break;
          case 114:
            value += '\r';
            break;
          case 116:
            value += '\t';
            break;
          case 117: {
            // uXXXX
            const charCode = uniCharCode(
              body.charCodeAt(position + 1),
              body.charCodeAt(position + 2),
              body.charCodeAt(position + 3),
              body.charCodeAt(position + 4),
            );
            if (charCode < 0) {
              const invalidSequence = body.slice(position + 1, position + 5);
              throw syntaxError(
                source,
                position,
                `Invalid character escape sequence: \\u${invalidSequence}.`,
              );
            }
            value += String.fromCharCode(charCode);
            position += 4;
            break;
          }
          default:
            // Print the escaped character straight from the source. When the
            // backslash is the last character of the body this yields an
            // empty string, whereas String.fromCharCode(NaN) would put a NUL
            // character into the message.
            throw syntaxError(
              source,
              position,
              `Invalid character escape sequence: \\${body.slice(position, position + 1)}.`,
            );
        }
        ++position;
        chunkStart = position;
      }
    }

    throw syntaxError(source, position, 'Unterminated string.');
  }
  /**
   * Reads a block string token that begins with the triple quote at `start`.
   *
   * """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
   *
   * The raw content has every \""" replaced by """ and is then passed through
   * `dedentBlockStringValue` to produce the token value. Each line terminator
   * inside the string (\n, \r or \r\n) advances `lexer.line` and moves
   * `lexer.lineStart` to the position following it.
   *
   * Throws the error built by `syntaxError` for a control character other
   * than tab, line feed or carriage return, and when the body ends before
   * the closing triple quote.
   */
  function readBlockString(source, start, line, col, prev, lexer) {
    const body = source.body;
    const tripleQuoteAt = (index) =>
      body.charCodeAt(index) === 34 &&
      body.charCodeAt(index + 1) === 34 &&
      body.charCodeAt(index + 2) === 34;

    let cursor = start + 3;
    // Start of the raw text not yet appended to `raw`.
    let pendingFrom = cursor;
    let raw = '';

    while (cursor < body.length) {
      const code = body.charCodeAt(cursor);
      if (isNaN(code)) {
        break;
      }

      // Closing Triple-Quote (""")
      if (tripleQuoteAt(cursor)) {
        raw += body.slice(pendingFrom, cursor);
        return new Token(
          TokenKind.BLOCK_STRING,
          start,
          cursor + 3,
          line,
          col,
          prev,
          dedentBlockStringValue(raw),
        );
      }

      const isLineTerminator = code === 0x000a || code === 0x000d;

      // SourceCharacter
      if (code < 0x0020 && code !== 0x0009 && !isLineTerminator) {
        throw syntaxError(
          source,
          cursor,
          `Invalid character within String: ${printCharCode(code)}.`,
        );
      }

      if (isLineTerminator) {
        // \r\n counts as a single line break.
        const isCrLf = code === 13 && body.charCodeAt(cursor + 1) === 10;
        cursor += isCrLf ? 2 : 1;
        lexer.line += 1;
        lexer.lineStart = cursor;
      } else if (code === 92 && tripleQuoteAt(cursor + 1)) {
        // Escape Triple-Quote (\""")
        raw += body.slice(pendingFrom, cursor) + '"""';
        cursor += 4;
        pendingFrom = cursor;
      } else {
        cursor += 1;
      }
    }

    throw syntaxError(source, cursor, 'Unterminated string.');
  }
  /**
   * Combines the char codes of four hexadecimal digits into the integer they
   * spell. For example, the codes of "000f" give 15 and those of "00ff" give
   * 255.
   *
   * Returns a negative number when any of the four is not a hex digit:
   * char2hex() yields -1 for it, and once that -1 has been OR-ed in, the sign
   * bit stays set through the remaining shifts.
   */
  function uniCharCode(a, b, c, d) {
    let result = 0;
    for (const unit of [a, b, c, d]) {
      // Shift the digits gathered so far one nibble left, then add this one.
      result = (result << 4) | char2hex(unit);
    }
    return result;
  }
  /**
   * Converts the char code of one hexadecimal digit to its numeric value.
   *
   * "0"-"9" give 0-9, while "A"-"F" and "a"-"f" both give 10-15.
   *
   * Returns -1 when the code is not a hexadecimal digit.
   */
  function char2hex(a) {
    if (a >= 48 && a <= 57) {
      return a - 48; // 0-9
    }
    if (a >= 65 && a <= 70) {
      return a - 55; // A-F
    }
    if (a >= 97 && a <= 102) {
      return a - 87; // a-f
    }
    return -1;
  }
  /**
   * Reads a name token starting at `start`: letters, digits and underscores.
   *
   * [_A-Za-z][_0-9A-Za-z]*
   *
   * The character at `start` is accepted without being inspected; scanning
   * for further name characters begins right after it. The token value is the
   * name's source text.
   */
  function readName(source, start, line, col, prev) {
    const body = source.body;
    const limit = body.length;
    let end = start + 1;

    while (end !== limit) {
      const code = body.charCodeAt(end);
      const isNameCharacter =
        !isNaN(code) &&
        (code === 95 || // _
          (code >= 48 && code <= 57) || // 0-9
          (code >= 65 && code <= 90) || // A-Z
          (code >= 97 && code <= 122)); // a-z
      if (!isNameCharacter) {
        break;
      }
      end += 1;
    }

    return new Token(TokenKind.NAME, start, end, line, col, prev, body.slice(start, end));
  }
  /**
   * Tells whether the char code may begin a name: _ A-Z a-z
   */
  function isNameStart(code) {
    if (code === 95) {
      return true; // _
    }
    const isUpper = code >= 65 && code <= 90; // A-Z
    const isLower = code >= 97 && code <= 122; // a-z
    return isUpper || isLower;
  }