lexer.js.flow 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697
  1. // @flow strict
  2. import { syntaxError } from '../error/syntaxError';
  3. import type { Source } from './source';
  4. import type { TokenKindEnum } from './tokenKind';
  5. import { Token } from './ast';
  6. import { TokenKind } from './tokenKind';
  7. import { dedentBlockStringValue } from './blockString';
  8. /**
  9. * Given a Source object, creates a Lexer for that source.
  10. * A Lexer is a stateful stream generator in that every time
  11. * it is advanced, it returns the next token in the Source. Assuming the
  12. * source lexes, the final Token emitted by the lexer will be of kind
  13. * EOF, after which the lexer will repeatedly return the same EOF token
  14. * whenever called.
  15. */
  16. export class Lexer {
  17. source: Source;
  18. /**
  19. * The previously focused non-ignored token.
  20. */
  21. lastToken: Token;
  22. /**
  23. * The currently focused non-ignored token.
  24. */
  25. token: Token;
  26. /**
  27. * The (1-indexed) line containing the current token.
  28. */
  29. line: number;
  30. /**
  31. * The character offset at which the current line begins.
  32. */
  33. lineStart: number;
  34. constructor(source: Source) {
  35. const startOfFileToken = new Token(TokenKind.SOF, 0, 0, 0, 0, null);
  36. this.source = source;
  37. this.lastToken = startOfFileToken;
  38. this.token = startOfFileToken;
  39. this.line = 1;
  40. this.lineStart = 0;
  41. }
  42. /**
  43. * Advances the token stream to the next non-ignored token.
  44. */
  45. advance(): Token {
  46. this.lastToken = this.token;
  47. const token = (this.token = this.lookahead());
  48. return token;
  49. }
  50. /**
  51. * Looks ahead and returns the next non-ignored token, but does not change
  52. * the state of Lexer.
  53. */
  54. lookahead(): Token {
  55. let token = this.token;
  56. if (token.kind !== TokenKind.EOF) {
  57. do {
  58. // Note: next is only mutable during parsing, so we cast to allow this.
  59. token = token.next ?? ((token: any).next = readToken(this, token));
  60. } while (token.kind === TokenKind.COMMENT);
  61. }
  62. return token;
  63. }
  64. }
  65. /**
  66. * @internal
  67. */
  68. export function isPunctuatorTokenKind(kind: TokenKindEnum): boolean %checks {
  69. return (
  70. kind === TokenKind.BANG ||
  71. kind === TokenKind.DOLLAR ||
  72. kind === TokenKind.AMP ||
  73. kind === TokenKind.PAREN_L ||
  74. kind === TokenKind.PAREN_R ||
  75. kind === TokenKind.SPREAD ||
  76. kind === TokenKind.COLON ||
  77. kind === TokenKind.EQUALS ||
  78. kind === TokenKind.AT ||
  79. kind === TokenKind.BRACKET_L ||
  80. kind === TokenKind.BRACKET_R ||
  81. kind === TokenKind.BRACE_L ||
  82. kind === TokenKind.PIPE ||
  83. kind === TokenKind.BRACE_R
  84. );
  85. }
  86. function printCharCode(code) {
  87. return (
  88. // NaN/undefined represents access beyond the end of the file.
  89. isNaN(code)
  90. ? TokenKind.EOF
  91. : // Trust JSON for ASCII.
  92. code < 0x007f
  93. ? JSON.stringify(String.fromCharCode(code))
  94. : // Otherwise print the escaped form.
  95. `"\\u${('00' + code.toString(16).toUpperCase()).slice(-4)}"`
  96. );
  97. }
  98. /**
  99. * Gets the next token from the source starting at the given position.
  100. *
  101. * This skips over whitespace until it finds the next lexable token, then lexes
  102. * punctuators immediately or calls the appropriate helper function for more
  103. * complicated tokens.
  104. */
  105. function readToken(lexer: Lexer, prev: Token): Token {
  106. const source = lexer.source;
  107. const body = source.body;
  108. const bodyLength = body.length;
  109. const pos = positionAfterWhitespace(body, prev.end, lexer);
  110. const line = lexer.line;
  111. const col = 1 + pos - lexer.lineStart;
  112. if (pos >= bodyLength) {
  113. return new Token(TokenKind.EOF, bodyLength, bodyLength, line, col, prev);
  114. }
  115. const code = body.charCodeAt(pos);
  116. // SourceCharacter
  117. switch (code) {
  118. // !
  119. case 33:
  120. return new Token(TokenKind.BANG, pos, pos + 1, line, col, prev);
  121. // #
  122. case 35:
  123. return readComment(source, pos, line, col, prev);
  124. // $
  125. case 36:
  126. return new Token(TokenKind.DOLLAR, pos, pos + 1, line, col, prev);
  127. // &
  128. case 38:
  129. return new Token(TokenKind.AMP, pos, pos + 1, line, col, prev);
  130. // (
  131. case 40:
  132. return new Token(TokenKind.PAREN_L, pos, pos + 1, line, col, prev);
  133. // )
  134. case 41:
  135. return new Token(TokenKind.PAREN_R, pos, pos + 1, line, col, prev);
  136. // .
  137. case 46:
  138. if (body.charCodeAt(pos + 1) === 46 && body.charCodeAt(pos + 2) === 46) {
  139. return new Token(TokenKind.SPREAD, pos, pos + 3, line, col, prev);
  140. }
  141. break;
  142. // :
  143. case 58:
  144. return new Token(TokenKind.COLON, pos, pos + 1, line, col, prev);
  145. // =
  146. case 61:
  147. return new Token(TokenKind.EQUALS, pos, pos + 1, line, col, prev);
  148. // @
  149. case 64:
  150. return new Token(TokenKind.AT, pos, pos + 1, line, col, prev);
  151. // [
  152. case 91:
  153. return new Token(TokenKind.BRACKET_L, pos, pos + 1, line, col, prev);
  154. // ]
  155. case 93:
  156. return new Token(TokenKind.BRACKET_R, pos, pos + 1, line, col, prev);
  157. // {
  158. case 123:
  159. return new Token(TokenKind.BRACE_L, pos, pos + 1, line, col, prev);
  160. // |
  161. case 124:
  162. return new Token(TokenKind.PIPE, pos, pos + 1, line, col, prev);
  163. // }
  164. case 125:
  165. return new Token(TokenKind.BRACE_R, pos, pos + 1, line, col, prev);
  166. // A-Z _ a-z
  167. case 65:
  168. case 66:
  169. case 67:
  170. case 68:
  171. case 69:
  172. case 70:
  173. case 71:
  174. case 72:
  175. case 73:
  176. case 74:
  177. case 75:
  178. case 76:
  179. case 77:
  180. case 78:
  181. case 79:
  182. case 80:
  183. case 81:
  184. case 82:
  185. case 83:
  186. case 84:
  187. case 85:
  188. case 86:
  189. case 87:
  190. case 88:
  191. case 89:
  192. case 90:
  193. case 95:
  194. case 97:
  195. case 98:
  196. case 99:
  197. case 100:
  198. case 101:
  199. case 102:
  200. case 103:
  201. case 104:
  202. case 105:
  203. case 106:
  204. case 107:
  205. case 108:
  206. case 109:
  207. case 110:
  208. case 111:
  209. case 112:
  210. case 113:
  211. case 114:
  212. case 115:
  213. case 116:
  214. case 117:
  215. case 118:
  216. case 119:
  217. case 120:
  218. case 121:
  219. case 122:
  220. return readName(source, pos, line, col, prev);
  221. // - 0-9
  222. case 45:
  223. case 48:
  224. case 49:
  225. case 50:
  226. case 51:
  227. case 52:
  228. case 53:
  229. case 54:
  230. case 55:
  231. case 56:
  232. case 57:
  233. return readNumber(source, pos, code, line, col, prev);
  234. // "
  235. case 34:
  236. if (body.charCodeAt(pos + 1) === 34 && body.charCodeAt(pos + 2) === 34) {
  237. return readBlockString(source, pos, line, col, prev, lexer);
  238. }
  239. return readString(source, pos, line, col, prev);
  240. }
  241. throw syntaxError(source, pos, unexpectedCharacterMessage(code));
  242. }
  243. /**
  244. * Report a message that an unexpected character was encountered.
  245. */
  246. function unexpectedCharacterMessage(code) {
  247. if (code < 0x0020 && code !== 0x0009 && code !== 0x000a && code !== 0x000d) {
  248. return `Cannot contain the invalid character ${printCharCode(code)}.`;
  249. }
  250. if (code === 39) {
  251. // '
  252. return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
  253. }
  254. return `Cannot parse the unexpected character ${printCharCode(code)}.`;
  255. }
  256. /**
  257. * Reads from body starting at startPosition until it finds a non-whitespace
  258. * character, then returns the position of that character for lexing.
  259. */
  260. function positionAfterWhitespace(
  261. body: string,
  262. startPosition: number,
  263. lexer: Lexer,
  264. ): number {
  265. const bodyLength = body.length;
  266. let position = startPosition;
  267. while (position < bodyLength) {
  268. const code = body.charCodeAt(position);
  269. // tab | space | comma | BOM
  270. if (code === 9 || code === 32 || code === 44 || code === 0xfeff) {
  271. ++position;
  272. } else if (code === 10) {
  273. // new line
  274. ++position;
  275. ++lexer.line;
  276. lexer.lineStart = position;
  277. } else if (code === 13) {
  278. // carriage return
  279. if (body.charCodeAt(position + 1) === 10) {
  280. position += 2;
  281. } else {
  282. ++position;
  283. }
  284. ++lexer.line;
  285. lexer.lineStart = position;
  286. } else {
  287. break;
  288. }
  289. }
  290. return position;
  291. }
  292. /**
  293. * Reads a comment token from the source file.
  294. *
  295. * #[\u0009\u0020-\uFFFF]*
  296. */
  297. function readComment(source, start, line, col, prev): Token {
  298. const body = source.body;
  299. let code;
  300. let position = start;
  301. do {
  302. code = body.charCodeAt(++position);
  303. } while (
  304. !isNaN(code) &&
  305. // SourceCharacter but not LineTerminator
  306. (code > 0x001f || code === 0x0009)
  307. );
  308. return new Token(
  309. TokenKind.COMMENT,
  310. start,
  311. position,
  312. line,
  313. col,
  314. prev,
  315. body.slice(start + 1, position),
  316. );
  317. }
  318. /**
  319. * Reads a number token from the source file, either a float
  320. * or an int depending on whether a decimal point appears.
  321. *
  322. * Int: -?(0|[1-9][0-9]*)
  323. * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
  324. */
  325. function readNumber(source, start, firstCode, line, col, prev): Token {
  326. const body = source.body;
  327. let code = firstCode;
  328. let position = start;
  329. let isFloat = false;
  330. if (code === 45) {
  331. // -
  332. code = body.charCodeAt(++position);
  333. }
  334. if (code === 48) {
  335. // 0
  336. code = body.charCodeAt(++position);
  337. if (code >= 48 && code <= 57) {
  338. throw syntaxError(
  339. source,
  340. position,
  341. `Invalid number, unexpected digit after 0: ${printCharCode(code)}.`,
  342. );
  343. }
  344. } else {
  345. position = readDigits(source, position, code);
  346. code = body.charCodeAt(position);
  347. }
  348. if (code === 46) {
  349. // .
  350. isFloat = true;
  351. code = body.charCodeAt(++position);
  352. position = readDigits(source, position, code);
  353. code = body.charCodeAt(position);
  354. }
  355. if (code === 69 || code === 101) {
  356. // E e
  357. isFloat = true;
  358. code = body.charCodeAt(++position);
  359. if (code === 43 || code === 45) {
  360. // + -
  361. code = body.charCodeAt(++position);
  362. }
  363. position = readDigits(source, position, code);
  364. code = body.charCodeAt(position);
  365. }
  366. // Numbers cannot be followed by . or NameStart
  367. if (code === 46 || isNameStart(code)) {
  368. throw syntaxError(
  369. source,
  370. position,
  371. `Invalid number, expected digit but got: ${printCharCode(code)}.`,
  372. );
  373. }
  374. return new Token(
  375. isFloat ? TokenKind.FLOAT : TokenKind.INT,
  376. start,
  377. position,
  378. line,
  379. col,
  380. prev,
  381. body.slice(start, position),
  382. );
  383. }
  384. /**
  385. * Returns the new position in the source after reading digits.
  386. */
  387. function readDigits(source, start, firstCode) {
  388. const body = source.body;
  389. let position = start;
  390. let code = firstCode;
  391. if (code >= 48 && code <= 57) {
  392. // 0 - 9
  393. do {
  394. code = body.charCodeAt(++position);
  395. } while (code >= 48 && code <= 57); // 0 - 9
  396. return position;
  397. }
  398. throw syntaxError(
  399. source,
  400. position,
  401. `Invalid number, expected digit but got: ${printCharCode(code)}.`,
  402. );
  403. }
  404. /**
  405. * Reads a string token from the source file.
  406. *
  407. * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
  408. */
  409. function readString(source, start, line, col, prev): Token {
  410. const body = source.body;
  411. let position = start + 1;
  412. let chunkStart = position;
  413. let code = 0;
  414. let value = '';
  415. while (
  416. position < body.length &&
  417. !isNaN((code = body.charCodeAt(position))) &&
  418. // not LineTerminator
  419. code !== 0x000a &&
  420. code !== 0x000d
  421. ) {
  422. // Closing Quote (")
  423. if (code === 34) {
  424. value += body.slice(chunkStart, position);
  425. return new Token(
  426. TokenKind.STRING,
  427. start,
  428. position + 1,
  429. line,
  430. col,
  431. prev,
  432. value,
  433. );
  434. }
  435. // SourceCharacter
  436. if (code < 0x0020 && code !== 0x0009) {
  437. throw syntaxError(
  438. source,
  439. position,
  440. `Invalid character within String: ${printCharCode(code)}.`,
  441. );
  442. }
  443. ++position;
  444. if (code === 92) {
  445. // \
  446. value += body.slice(chunkStart, position - 1);
  447. code = body.charCodeAt(position);
  448. switch (code) {
  449. case 34:
  450. value += '"';
  451. break;
  452. case 47:
  453. value += '/';
  454. break;
  455. case 92:
  456. value += '\\';
  457. break;
  458. case 98:
  459. value += '\b';
  460. break;
  461. case 102:
  462. value += '\f';
  463. break;
  464. case 110:
  465. value += '\n';
  466. break;
  467. case 114:
  468. value += '\r';
  469. break;
  470. case 116:
  471. value += '\t';
  472. break;
  473. case 117: {
  474. // uXXXX
  475. const charCode = uniCharCode(
  476. body.charCodeAt(position + 1),
  477. body.charCodeAt(position + 2),
  478. body.charCodeAt(position + 3),
  479. body.charCodeAt(position + 4),
  480. );
  481. if (charCode < 0) {
  482. const invalidSequence = body.slice(position + 1, position + 5);
  483. throw syntaxError(
  484. source,
  485. position,
  486. `Invalid character escape sequence: \\u${invalidSequence}.`,
  487. );
  488. }
  489. value += String.fromCharCode(charCode);
  490. position += 4;
  491. break;
  492. }
  493. default:
  494. throw syntaxError(
  495. source,
  496. position,
  497. `Invalid character escape sequence: \\${String.fromCharCode(
  498. code,
  499. )}.`,
  500. );
  501. }
  502. ++position;
  503. chunkStart = position;
  504. }
  505. }
  506. throw syntaxError(source, position, 'Unterminated string.');
  507. }
  508. /**
  509. * Reads a block string token from the source file.
  510. *
  511. * """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
  512. */
  513. function readBlockString(source, start, line, col, prev, lexer): Token {
  514. const body = source.body;
  515. let position = start + 3;
  516. let chunkStart = position;
  517. let code = 0;
  518. let rawValue = '';
  519. while (position < body.length && !isNaN((code = body.charCodeAt(position)))) {
  520. // Closing Triple-Quote (""")
  521. if (
  522. code === 34 &&
  523. body.charCodeAt(position + 1) === 34 &&
  524. body.charCodeAt(position + 2) === 34
  525. ) {
  526. rawValue += body.slice(chunkStart, position);
  527. return new Token(
  528. TokenKind.BLOCK_STRING,
  529. start,
  530. position + 3,
  531. line,
  532. col,
  533. prev,
  534. dedentBlockStringValue(rawValue),
  535. );
  536. }
  537. // SourceCharacter
  538. if (
  539. code < 0x0020 &&
  540. code !== 0x0009 &&
  541. code !== 0x000a &&
  542. code !== 0x000d
  543. ) {
  544. throw syntaxError(
  545. source,
  546. position,
  547. `Invalid character within String: ${printCharCode(code)}.`,
  548. );
  549. }
  550. if (code === 10) {
  551. // new line
  552. ++position;
  553. ++lexer.line;
  554. lexer.lineStart = position;
  555. } else if (code === 13) {
  556. // carriage return
  557. if (body.charCodeAt(position + 1) === 10) {
  558. position += 2;
  559. } else {
  560. ++position;
  561. }
  562. ++lexer.line;
  563. lexer.lineStart = position;
  564. } else if (
  565. // Escape Triple-Quote (\""")
  566. code === 92 &&
  567. body.charCodeAt(position + 1) === 34 &&
  568. body.charCodeAt(position + 2) === 34 &&
  569. body.charCodeAt(position + 3) === 34
  570. ) {
  571. rawValue += body.slice(chunkStart, position) + '"""';
  572. position += 4;
  573. chunkStart = position;
  574. } else {
  575. ++position;
  576. }
  577. }
  578. throw syntaxError(source, position, 'Unterminated string.');
  579. }
  580. /**
  581. * Converts four hexadecimal chars to the integer that the
  582. * string represents. For example, uniCharCode('0','0','0','f')
  583. * will return 15, and uniCharCode('0','0','f','f') returns 255.
  584. *
  585. * Returns a negative number on error, if a char was invalid.
  586. *
  587. * This is implemented by noting that char2hex() returns -1 on error,
  588. * which means the result of ORing the char2hex() will also be negative.
  589. */
  590. function uniCharCode(a, b, c, d) {
  591. return (
  592. (char2hex(a) << 12) | (char2hex(b) << 8) | (char2hex(c) << 4) | char2hex(d)
  593. );
  594. }
  595. /**
  596. * Converts a hex character to its integer value.
  597. * '0' becomes 0, '9' becomes 9
  598. * 'A' becomes 10, 'F' becomes 15
  599. * 'a' becomes 10, 'f' becomes 15
  600. *
  601. * Returns -1 on error.
  602. */
  603. function char2hex(a) {
  604. return a >= 48 && a <= 57
  605. ? a - 48 // 0-9
  606. : a >= 65 && a <= 70
  607. ? a - 55 // A-F
  608. : a >= 97 && a <= 102
  609. ? a - 87 // a-f
  610. : -1;
  611. }
  612. /**
  613. * Reads an alphanumeric + underscore name from the source.
  614. *
  615. * [_A-Za-z][_0-9A-Za-z]*
  616. */
  617. function readName(source, start, line, col, prev): Token {
  618. const body = source.body;
  619. const bodyLength = body.length;
  620. let position = start + 1;
  621. let code = 0;
  622. while (
  623. position !== bodyLength &&
  624. !isNaN((code = body.charCodeAt(position))) &&
  625. (code === 95 || // _
  626. (code >= 48 && code <= 57) || // 0-9
  627. (code >= 65 && code <= 90) || // A-Z
  628. (code >= 97 && code <= 122)) // a-z
  629. ) {
  630. ++position;
  631. }
  632. return new Token(
  633. TokenKind.NAME,
  634. start,
  635. position,
  636. line,
  637. col,
  638. prev,
  639. body.slice(start, position),
  640. );
  641. }
  642. // _ A-Z a-z
  643. function isNameStart(code): boolean {
  644. return (
  645. code === 95 || (code >= 65 && code <= 90) || (code >= 97 && code <= 122)
  646. );
  647. }