lexer.js.flow 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701
  1. // @flow strict
  2. import { syntaxError } from '../error/syntaxError';
  3. import type { Source } from './source';
  4. import type { TokenKindEnum } from './tokenKind';
  5. import { Token } from './ast';
  6. import { TokenKind } from './tokenKind';
  7. import { dedentBlockStringValue } from './blockString';
  8. /**
  9. * Given a Source object, creates a Lexer for that source.
  10. * A Lexer is a stateful stream generator in that every time
  11. * it is advanced, it returns the next token in the Source. Assuming the
  12. * source lexes, the final Token emitted by the lexer will be of kind
  13. * EOF, after which the lexer will repeatedly return the same EOF token
  14. * whenever called.
  15. */
  16. export class Lexer {
  17. source: Source;
  18. /**
  19. * The previously focused non-ignored token.
  20. */
  21. lastToken: Token;
  22. /**
  23. * The currently focused non-ignored token.
  24. */
  25. token: Token;
  26. /**
  27. * The (1-indexed) line containing the current token.
  28. */
  29. line: number;
  30. /**
  31. * The character offset at which the current line begins.
  32. */
  33. lineStart: number;
  34. constructor(source: Source) {
  35. const startOfFileToken = new Token(TokenKind.SOF, 0, 0, 0, 0, null);
  36. this.source = source;
  37. this.lastToken = startOfFileToken;
  38. this.token = startOfFileToken;
  39. this.line = 1;
  40. this.lineStart = 0;
  41. }
  42. /**
  43. * Advances the token stream to the next non-ignored token.
  44. */
  45. advance(): Token {
  46. this.lastToken = this.token;
  47. const token = (this.token = this.lookahead());
  48. return token;
  49. }
  50. /**
  51. * Looks ahead and returns the next non-ignored token, but does not change
  52. * the state of Lexer.
  53. */
  54. lookahead(): Token {
  55. let token = this.token;
  56. if (token.kind !== TokenKind.EOF) {
  57. do {
  58. // Note: next is only mutable during parsing, so we cast to allow this.
  59. token = token.next ?? ((token: any).next = readToken(this, token));
  60. } while (token.kind === TokenKind.COMMENT);
  61. }
  62. return token;
  63. }
  64. }
  65. /**
  66. * @internal
  67. */
  68. export function isPunctuatorTokenKind(kind: TokenKindEnum): boolean %checks {
  69. return (
  70. kind === TokenKind.BANG ||
  71. kind === TokenKind.DOLLAR ||
  72. kind === TokenKind.AMP ||
  73. kind === TokenKind.PAREN_L ||
  74. kind === TokenKind.PAREN_R ||
  75. kind === TokenKind.SPREAD ||
  76. kind === TokenKind.COLON ||
  77. kind === TokenKind.EQUALS ||
  78. kind === TokenKind.AT ||
  79. kind === TokenKind.BRACKET_L ||
  80. kind === TokenKind.BRACKET_R ||
  81. kind === TokenKind.BRACE_L ||
  82. kind === TokenKind.PIPE ||
  83. kind === TokenKind.BRACE_R
  84. );
  85. }
  86. function printCharCode(code: number): string {
  87. return (
  88. // NaN/undefined represents access beyond the end of the file.
  89. isNaN(code)
  90. ? TokenKind.EOF
  91. : // Trust JSON for ASCII.
  92. code < 0x007f
  93. ? JSON.stringify(String.fromCharCode(code))
  94. : // Otherwise print the escaped form.
  95. `"\\u${('00' + code.toString(16).toUpperCase()).slice(-4)}"`
  96. );
  97. }
  98. /**
  99. * Gets the next token from the source starting at the given position.
  100. *
  101. * This skips over whitespace until it finds the next lexable token, then lexes
  102. * punctuators immediately or calls the appropriate helper function for more
  103. * complicated tokens.
  104. */
  105. function readToken(lexer: Lexer, prev: Token): Token {
  106. const source = lexer.source;
  107. const body = source.body;
  108. const bodyLength = body.length;
  109. let pos = prev.end;
  110. while (pos < bodyLength) {
  111. const code = body.charCodeAt(pos);
  112. const line = lexer.line;
  113. const col = 1 + pos - lexer.lineStart;
  114. // SourceCharacter
  115. switch (code) {
  116. case 0xfeff: // <BOM>
  117. case 9: // \t
  118. case 32: // <space>
  119. case 44: // ,
  120. ++pos;
  121. continue;
  122. case 10: // \n
  123. ++pos;
  124. ++lexer.line;
  125. lexer.lineStart = pos;
  126. continue;
  127. case 13: // \r
  128. if (body.charCodeAt(pos + 1) === 10) {
  129. pos += 2;
  130. } else {
  131. ++pos;
  132. }
  133. ++lexer.line;
  134. lexer.lineStart = pos;
  135. continue;
  136. case 33: // !
  137. return new Token(TokenKind.BANG, pos, pos + 1, line, col, prev);
  138. case 35: // #
  139. return readComment(source, pos, line, col, prev);
  140. case 36: // $
  141. return new Token(TokenKind.DOLLAR, pos, pos + 1, line, col, prev);
  142. case 38: // &
  143. return new Token(TokenKind.AMP, pos, pos + 1, line, col, prev);
  144. case 40: // (
  145. return new Token(TokenKind.PAREN_L, pos, pos + 1, line, col, prev);
  146. case 41: // )
  147. return new Token(TokenKind.PAREN_R, pos, pos + 1, line, col, prev);
  148. case 46: // .
  149. if (
  150. body.charCodeAt(pos + 1) === 46 &&
  151. body.charCodeAt(pos + 2) === 46
  152. ) {
  153. return new Token(TokenKind.SPREAD, pos, pos + 3, line, col, prev);
  154. }
  155. break;
  156. case 58: // :
  157. return new Token(TokenKind.COLON, pos, pos + 1, line, col, prev);
  158. case 61: // =
  159. return new Token(TokenKind.EQUALS, pos, pos + 1, line, col, prev);
  160. case 64: // @
  161. return new Token(TokenKind.AT, pos, pos + 1, line, col, prev);
  162. case 91: // [
  163. return new Token(TokenKind.BRACKET_L, pos, pos + 1, line, col, prev);
  164. case 93: // ]
  165. return new Token(TokenKind.BRACKET_R, pos, pos + 1, line, col, prev);
  166. case 123: // {
  167. return new Token(TokenKind.BRACE_L, pos, pos + 1, line, col, prev);
  168. case 124: // |
  169. return new Token(TokenKind.PIPE, pos, pos + 1, line, col, prev);
  170. case 125: // }
  171. return new Token(TokenKind.BRACE_R, pos, pos + 1, line, col, prev);
  172. case 34: // "
  173. if (
  174. body.charCodeAt(pos + 1) === 34 &&
  175. body.charCodeAt(pos + 2) === 34
  176. ) {
  177. return readBlockString(source, pos, line, col, prev, lexer);
  178. }
  179. return readString(source, pos, line, col, prev);
  180. case 45: // -
  181. case 48: // 0
  182. case 49: // 1
  183. case 50: // 2
  184. case 51: // 3
  185. case 52: // 4
  186. case 53: // 5
  187. case 54: // 6
  188. case 55: // 7
  189. case 56: // 8
  190. case 57: // 9
  191. return readNumber(source, pos, code, line, col, prev);
  192. case 65: // A
  193. case 66: // B
  194. case 67: // C
  195. case 68: // D
  196. case 69: // E
  197. case 70: // F
  198. case 71: // G
  199. case 72: // H
  200. case 73: // I
  201. case 74: // J
  202. case 75: // K
  203. case 76: // L
  204. case 77: // M
  205. case 78: // N
  206. case 79: // O
  207. case 80: // P
  208. case 81: // Q
  209. case 82: // R
  210. case 83: // S
  211. case 84: // T
  212. case 85: // U
  213. case 86: // V
  214. case 87: // W
  215. case 88: // X
  216. case 89: // Y
  217. case 90: // Z
  218. case 95: // _
  219. case 97: // a
  220. case 98: // b
  221. case 99: // c
  222. case 100: // d
  223. case 101: // e
  224. case 102: // f
  225. case 103: // g
  226. case 104: // h
  227. case 105: // i
  228. case 106: // j
  229. case 107: // k
  230. case 108: // l
  231. case 109: // m
  232. case 110: // n
  233. case 111: // o
  234. case 112: // p
  235. case 113: // q
  236. case 114: // r
  237. case 115: // s
  238. case 116: // t
  239. case 117: // u
  240. case 118: // v
  241. case 119: // w
  242. case 120: // x
  243. case 121: // y
  244. case 122: // z
  245. return readName(source, pos, line, col, prev);
  246. }
  247. throw syntaxError(source, pos, unexpectedCharacterMessage(code));
  248. }
  249. const line = lexer.line;
  250. const col = 1 + pos - lexer.lineStart;
  251. return new Token(TokenKind.EOF, bodyLength, bodyLength, line, col, prev);
  252. }
  253. /**
  254. * Report a message that an unexpected character was encountered.
  255. */
  256. function unexpectedCharacterMessage(code: number): string {
  257. if (code < 0x0020 && code !== 0x0009 && code !== 0x000a && code !== 0x000d) {
  258. return `Cannot contain the invalid character ${printCharCode(code)}.`;
  259. }
  260. if (code === 39) {
  261. // '
  262. return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
  263. }
  264. return `Cannot parse the unexpected character ${printCharCode(code)}.`;
  265. }
  266. /**
  267. * Reads a comment token from the source file.
  268. *
  269. * #[\u0009\u0020-\uFFFF]*
  270. */
  271. function readComment(
  272. source: Source,
  273. start: number,
  274. line: number,
  275. col: number,
  276. prev: Token | null,
  277. ): Token {
  278. const body = source.body;
  279. let code;
  280. let position = start;
  281. do {
  282. code = body.charCodeAt(++position);
  283. } while (
  284. !isNaN(code) &&
  285. // SourceCharacter but not LineTerminator
  286. (code > 0x001f || code === 0x0009)
  287. );
  288. return new Token(
  289. TokenKind.COMMENT,
  290. start,
  291. position,
  292. line,
  293. col,
  294. prev,
  295. body.slice(start + 1, position),
  296. );
  297. }
  298. /**
  299. * Reads a number token from the source file, either a float
  300. * or an int depending on whether a decimal point appears.
  301. *
  302. * Int: -?(0|[1-9][0-9]*)
  303. * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
  304. */
  305. function readNumber(
  306. source: Source,
  307. start: number,
  308. firstCode: number,
  309. line: number,
  310. col: number,
  311. prev: Token | null,
  312. ): Token {
  313. const body = source.body;
  314. let code = firstCode;
  315. let position = start;
  316. let isFloat = false;
  317. if (code === 45) {
  318. // -
  319. code = body.charCodeAt(++position);
  320. }
  321. if (code === 48) {
  322. // 0
  323. code = body.charCodeAt(++position);
  324. if (code >= 48 && code <= 57) {
  325. throw syntaxError(
  326. source,
  327. position,
  328. `Invalid number, unexpected digit after 0: ${printCharCode(code)}.`,
  329. );
  330. }
  331. } else {
  332. position = readDigits(source, position, code);
  333. code = body.charCodeAt(position);
  334. }
  335. if (code === 46) {
  336. // .
  337. isFloat = true;
  338. code = body.charCodeAt(++position);
  339. position = readDigits(source, position, code);
  340. code = body.charCodeAt(position);
  341. }
  342. if (code === 69 || code === 101) {
  343. // E e
  344. isFloat = true;
  345. code = body.charCodeAt(++position);
  346. if (code === 43 || code === 45) {
  347. // + -
  348. code = body.charCodeAt(++position);
  349. }
  350. position = readDigits(source, position, code);
  351. code = body.charCodeAt(position);
  352. }
  353. // Numbers cannot be followed by . or NameStart
  354. if (code === 46 || isNameStart(code)) {
  355. throw syntaxError(
  356. source,
  357. position,
  358. `Invalid number, expected digit but got: ${printCharCode(code)}.`,
  359. );
  360. }
  361. return new Token(
  362. isFloat ? TokenKind.FLOAT : TokenKind.INT,
  363. start,
  364. position,
  365. line,
  366. col,
  367. prev,
  368. body.slice(start, position),
  369. );
  370. }
  371. /**
  372. * Returns the new position in the source after reading digits.
  373. */
  374. function readDigits(source: Source, start: number, firstCode: number): number {
  375. const body = source.body;
  376. let position = start;
  377. let code = firstCode;
  378. if (code >= 48 && code <= 57) {
  379. // 0 - 9
  380. do {
  381. code = body.charCodeAt(++position);
  382. } while (code >= 48 && code <= 57); // 0 - 9
  383. return position;
  384. }
  385. throw syntaxError(
  386. source,
  387. position,
  388. `Invalid number, expected digit but got: ${printCharCode(code)}.`,
  389. );
  390. }
  391. /**
  392. * Reads a string token from the source file.
  393. *
  394. * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
  395. */
  396. function readString(
  397. source: Source,
  398. start: number,
  399. line: number,
  400. col: number,
  401. prev: Token | null,
  402. ): Token {
  403. const body = source.body;
  404. let position = start + 1;
  405. let chunkStart = position;
  406. let code = 0;
  407. let value = '';
  408. while (
  409. position < body.length &&
  410. !isNaN((code = body.charCodeAt(position))) &&
  411. // not LineTerminator
  412. code !== 0x000a &&
  413. code !== 0x000d
  414. ) {
  415. // Closing Quote (")
  416. if (code === 34) {
  417. value += body.slice(chunkStart, position);
  418. return new Token(
  419. TokenKind.STRING,
  420. start,
  421. position + 1,
  422. line,
  423. col,
  424. prev,
  425. value,
  426. );
  427. }
  428. // SourceCharacter
  429. if (code < 0x0020 && code !== 0x0009) {
  430. throw syntaxError(
  431. source,
  432. position,
  433. `Invalid character within String: ${printCharCode(code)}.`,
  434. );
  435. }
  436. ++position;
  437. if (code === 92) {
  438. // \
  439. value += body.slice(chunkStart, position - 1);
  440. code = body.charCodeAt(position);
  441. switch (code) {
  442. case 34:
  443. value += '"';
  444. break;
  445. case 47:
  446. value += '/';
  447. break;
  448. case 92:
  449. value += '\\';
  450. break;
  451. case 98:
  452. value += '\b';
  453. break;
  454. case 102:
  455. value += '\f';
  456. break;
  457. case 110:
  458. value += '\n';
  459. break;
  460. case 114:
  461. value += '\r';
  462. break;
  463. case 116:
  464. value += '\t';
  465. break;
  466. case 117: {
  467. // uXXXX
  468. const charCode = uniCharCode(
  469. body.charCodeAt(position + 1),
  470. body.charCodeAt(position + 2),
  471. body.charCodeAt(position + 3),
  472. body.charCodeAt(position + 4),
  473. );
  474. if (charCode < 0) {
  475. const invalidSequence = body.slice(position + 1, position + 5);
  476. throw syntaxError(
  477. source,
  478. position,
  479. `Invalid character escape sequence: \\u${invalidSequence}.`,
  480. );
  481. }
  482. value += String.fromCharCode(charCode);
  483. position += 4;
  484. break;
  485. }
  486. default:
  487. throw syntaxError(
  488. source,
  489. position,
  490. `Invalid character escape sequence: \\${String.fromCharCode(
  491. code,
  492. )}.`,
  493. );
  494. }
  495. ++position;
  496. chunkStart = position;
  497. }
  498. }
  499. throw syntaxError(source, position, 'Unterminated string.');
  500. }
  501. /**
  502. * Reads a block string token from the source file.
  503. *
  504. * """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
  505. */
  506. function readBlockString(
  507. source: Source,
  508. start: number,
  509. line: number,
  510. col: number,
  511. prev: Token | null,
  512. lexer: Lexer,
  513. ): Token {
  514. const body = source.body;
  515. let position = start + 3;
  516. let chunkStart = position;
  517. let code = 0;
  518. let rawValue = '';
  519. while (position < body.length && !isNaN((code = body.charCodeAt(position)))) {
  520. // Closing Triple-Quote (""")
  521. if (
  522. code === 34 &&
  523. body.charCodeAt(position + 1) === 34 &&
  524. body.charCodeAt(position + 2) === 34
  525. ) {
  526. rawValue += body.slice(chunkStart, position);
  527. return new Token(
  528. TokenKind.BLOCK_STRING,
  529. start,
  530. position + 3,
  531. line,
  532. col,
  533. prev,
  534. dedentBlockStringValue(rawValue),
  535. );
  536. }
  537. // SourceCharacter
  538. if (
  539. code < 0x0020 &&
  540. code !== 0x0009 &&
  541. code !== 0x000a &&
  542. code !== 0x000d
  543. ) {
  544. throw syntaxError(
  545. source,
  546. position,
  547. `Invalid character within String: ${printCharCode(code)}.`,
  548. );
  549. }
  550. if (code === 10) {
  551. // new line
  552. ++position;
  553. ++lexer.line;
  554. lexer.lineStart = position;
  555. } else if (code === 13) {
  556. // carriage return
  557. if (body.charCodeAt(position + 1) === 10) {
  558. position += 2;
  559. } else {
  560. ++position;
  561. }
  562. ++lexer.line;
  563. lexer.lineStart = position;
  564. } else if (
  565. // Escape Triple-Quote (\""")
  566. code === 92 &&
  567. body.charCodeAt(position + 1) === 34 &&
  568. body.charCodeAt(position + 2) === 34 &&
  569. body.charCodeAt(position + 3) === 34
  570. ) {
  571. rawValue += body.slice(chunkStart, position) + '"""';
  572. position += 4;
  573. chunkStart = position;
  574. } else {
  575. ++position;
  576. }
  577. }
  578. throw syntaxError(source, position, 'Unterminated string.');
  579. }
  580. /**
  581. * Converts four hexadecimal chars to the integer that the
  582. * string represents. For example, uniCharCode('0','0','0','f')
  583. * will return 15, and uniCharCode('0','0','f','f') returns 255.
  584. *
  585. * Returns a negative number on error, if a char was invalid.
  586. *
  587. * This is implemented by noting that char2hex() returns -1 on error,
  588. * which means the result of ORing the char2hex() will also be negative.
  589. */
  590. function uniCharCode(a: number, b: number, c: number, d: number): number {
  591. return (
  592. (char2hex(a) << 12) | (char2hex(b) << 8) | (char2hex(c) << 4) | char2hex(d)
  593. );
  594. }
  595. /**
  596. * Converts a hex character to its integer value.
  597. * '0' becomes 0, '9' becomes 9
  598. * 'A' becomes 10, 'F' becomes 15
  599. * 'a' becomes 10, 'f' becomes 15
  600. *
  601. * Returns -1 on error.
  602. */
  603. function char2hex(a: number): number {
  604. return a >= 48 && a <= 57
  605. ? a - 48 // 0-9
  606. : a >= 65 && a <= 70
  607. ? a - 55 // A-F
  608. : a >= 97 && a <= 102
  609. ? a - 87 // a-f
  610. : -1;
  611. }
  612. /**
  613. * Reads an alphanumeric + underscore name from the source.
  614. *
  615. * [_A-Za-z][_0-9A-Za-z]*
  616. */
  617. function readName(
  618. source: Source,
  619. start: number,
  620. line: number,
  621. col: number,
  622. prev: Token | null,
  623. ): Token {
  624. const body = source.body;
  625. const bodyLength = body.length;
  626. let position = start + 1;
  627. let code = 0;
  628. while (
  629. position !== bodyLength &&
  630. !isNaN((code = body.charCodeAt(position))) &&
  631. (code === 95 || // _
  632. (code >= 48 && code <= 57) || // 0-9
  633. (code >= 65 && code <= 90) || // A-Z
  634. (code >= 97 && code <= 122)) // a-z
  635. ) {
  636. ++position;
  637. }
  638. return new Token(
  639. TokenKind.NAME,
  640. start,
  641. position,
  642. line,
  643. col,
  644. prev,
  645. body.slice(start, position),
  646. );
  647. }
  648. // _ A-Z a-z
  649. function isNameStart(code: number): boolean {
  650. return (
  651. code === 95 || (code >= 65 && code <= 90) || (code >= 97 && code <= 122)
  652. );
  653. }