lexer.js.flow 17 KB


  1. // @flow strict
  2. import defineToJSON from '../jsutils/defineToJSON';
  3. import { syntaxError } from '../error/syntaxError';
  4. import { type Token } from './ast';
  5. import { type Source } from './source';
  6. import { dedentBlockStringValue } from './blockString';
  7. import { type TokenKindEnum, TokenKind } from './tokenKind';
  8. /**
  9. * Given a Source object, this returns a Lexer for that source.
  10. * A Lexer is a stateful stream generator in that every time
  11. * it is advanced, it returns the next token in the Source. Assuming the
  12. * source lexes, the final Token emitted by the lexer will be of kind
  13. * EOF, after which the lexer will repeatedly return the same EOF token
  14. * whenever called.
  15. */
  16. export function createLexer<TOptions>(
  17. source: Source,
  18. options: TOptions,
  19. ): Lexer<TOptions> {
  20. const startOfFileToken = new Tok(TokenKind.SOF, 0, 0, 0, 0, null);
  21. const lexer: Lexer<TOptions> = {
  22. source,
  23. options,
  24. lastToken: startOfFileToken,
  25. token: startOfFileToken,
  26. line: 1,
  27. lineStart: 0,
  28. advance: advanceLexer,
  29. lookahead,
  30. };
  31. return lexer;
  32. }
  33. function advanceLexer() {
  34. this.lastToken = this.token;
  35. const token = (this.token = this.lookahead());
  36. return token;
  37. }
  38. function lookahead() {
  39. let token = this.token;
  40. if (token.kind !== TokenKind.EOF) {
  41. do {
  42. // Note: next is only mutable during parsing, so we cast to allow this.
  43. token = token.next || ((token: any).next = readToken(this, token));
  44. } while (token.kind === TokenKind.COMMENT);
  45. }
  46. return token;
  47. }
  48. /**
  49. * The return type of createLexer.
  50. */
  51. export type Lexer<TOptions> = {
  52. source: Source,
  53. options: TOptions,
  54. /**
  55. * The previously focused non-ignored token.
  56. */
  57. lastToken: Token,
  58. /**
  59. * The currently focused non-ignored token.
  60. */
  61. token: Token,
  62. /**
  63. * The (1-indexed) line containing the current token.
  64. */
  65. line: number,
  66. /**
  67. * The character offset at which the current line begins.
  68. */
  69. lineStart: number,
  70. /**
  71. * Advances the token stream to the next non-ignored token.
  72. */
  73. advance(): Token,
  74. /**
  75. * Looks ahead and returns the next non-ignored token, but does not change
  76. * the Lexer's state.
  77. */
  78. lookahead(): Token,
  79. ...
  80. };
  81. // @internal
  82. export function isPunctuatorToken(token: Token) {
  83. const kind = token.kind;
  84. return (
  85. kind === TokenKind.BANG ||
  86. kind === TokenKind.DOLLAR ||
  87. kind === TokenKind.AMP ||
  88. kind === TokenKind.PAREN_L ||
  89. kind === TokenKind.PAREN_R ||
  90. kind === TokenKind.SPREAD ||
  91. kind === TokenKind.COLON ||
  92. kind === TokenKind.EQUALS ||
  93. kind === TokenKind.AT ||
  94. kind === TokenKind.BRACKET_L ||
  95. kind === TokenKind.BRACKET_R ||
  96. kind === TokenKind.BRACE_L ||
  97. kind === TokenKind.PIPE ||
  98. kind === TokenKind.BRACE_R
  99. );
  100. }
  101. /**
  102. * Helper function for constructing the Token object.
  103. */
  104. function Tok(
  105. kind: TokenKindEnum,
  106. start: number,
  107. end: number,
  108. line: number,
  109. column: number,
  110. prev: Token | null,
  111. value?: string,
  112. ) {
  113. this.kind = kind;
  114. this.start = start;
  115. this.end = end;
  116. this.line = line;
  117. this.column = column;
  118. this.value = value;
  119. this.prev = prev;
  120. this.next = null;
  121. }
  // Print a simplified form when appearing in JSON/util.inspect: only the
  // kind, value, line and column of the token are exposed.
  defineToJSON(Tok, function() {
    const { kind, value, line, column } = this;
    return { kind, value, line, column };
  });
  /**
   * Renders a character code for use in an error message.
   *
   * Returns TokenKind.EOF for NaN/undefined, the JSON-quoted character for
   * codes below 0x007F, and a quoted, upper-case "\uXXXX" escape otherwise.
   */
  function printCharCode(code) {
    // NaN/undefined represents access beyond the end of the file.
    if (isNaN(code)) {
      return TokenKind.EOF;
    }
    // Trust JSON for ASCII.
    if (code < 0x007f) {
      return JSON.stringify(String.fromCharCode(code));
    }
    // Otherwise print the escaped form, left-padded to four hex digits.
    const hex = code.toString(16).toUpperCase();
    const padded = ('00' + hex).slice(-4);
    return `"\\u${padded}"`;
  }
  143. /**
  144. * Gets the next token from the source starting at the given position.
  145. *
  146. * This skips over whitespace until it finds the next lexable token, then lexes
  147. * punctuators immediately or calls the appropriate helper function for more
  148. * complicated tokens.
  149. */
  150. function readToken(lexer: Lexer<mixed>, prev: Token): Token {
  151. const source = lexer.source;
  152. const body = source.body;
  153. const bodyLength = body.length;
  154. const pos = positionAfterWhitespace(body, prev.end, lexer);
  155. const line = lexer.line;
  156. const col = 1 + pos - lexer.lineStart;
  157. if (pos >= bodyLength) {
  158. return new Tok(TokenKind.EOF, bodyLength, bodyLength, line, col, prev);
  159. }
  160. const code = body.charCodeAt(pos);
  161. // SourceCharacter
  162. switch (code) {
  163. // !
  164. case 33:
  165. return new Tok(TokenKind.BANG, pos, pos + 1, line, col, prev);
  166. // #
  167. case 35:
  168. return readComment(source, pos, line, col, prev);
  169. // $
  170. case 36:
  171. return new Tok(TokenKind.DOLLAR, pos, pos + 1, line, col, prev);
  172. // &
  173. case 38:
  174. return new Tok(TokenKind.AMP, pos, pos + 1, line, col, prev);
  175. // (
  176. case 40:
  177. return new Tok(TokenKind.PAREN_L, pos, pos + 1, line, col, prev);
  178. // )
  179. case 41:
  180. return new Tok(TokenKind.PAREN_R, pos, pos + 1, line, col, prev);
  181. // .
  182. case 46:
  183. if (body.charCodeAt(pos + 1) === 46 && body.charCodeAt(pos + 2) === 46) {
  184. return new Tok(TokenKind.SPREAD, pos, pos + 3, line, col, prev);
  185. }
  186. break;
  187. // :
  188. case 58:
  189. return new Tok(TokenKind.COLON, pos, pos + 1, line, col, prev);
  190. // =
  191. case 61:
  192. return new Tok(TokenKind.EQUALS, pos, pos + 1, line, col, prev);
  193. // @
  194. case 64:
  195. return new Tok(TokenKind.AT, pos, pos + 1, line, col, prev);
  196. // [
  197. case 91:
  198. return new Tok(TokenKind.BRACKET_L, pos, pos + 1, line, col, prev);
  199. // ]
  200. case 93:
  201. return new Tok(TokenKind.BRACKET_R, pos, pos + 1, line, col, prev);
  202. // {
  203. case 123:
  204. return new Tok(TokenKind.BRACE_L, pos, pos + 1, line, col, prev);
  205. // |
  206. case 124:
  207. return new Tok(TokenKind.PIPE, pos, pos + 1, line, col, prev);
  208. // }
  209. case 125:
  210. return new Tok(TokenKind.BRACE_R, pos, pos + 1, line, col, prev);
  211. // A-Z _ a-z
  212. case 65:
  213. case 66:
  214. case 67:
  215. case 68:
  216. case 69:
  217. case 70:
  218. case 71:
  219. case 72:
  220. case 73:
  221. case 74:
  222. case 75:
  223. case 76:
  224. case 77:
  225. case 78:
  226. case 79:
  227. case 80:
  228. case 81:
  229. case 82:
  230. case 83:
  231. case 84:
  232. case 85:
  233. case 86:
  234. case 87:
  235. case 88:
  236. case 89:
  237. case 90:
  238. case 95:
  239. case 97:
  240. case 98:
  241. case 99:
  242. case 100:
  243. case 101:
  244. case 102:
  245. case 103:
  246. case 104:
  247. case 105:
  248. case 106:
  249. case 107:
  250. case 108:
  251. case 109:
  252. case 110:
  253. case 111:
  254. case 112:
  255. case 113:
  256. case 114:
  257. case 115:
  258. case 116:
  259. case 117:
  260. case 118:
  261. case 119:
  262. case 120:
  263. case 121:
  264. case 122:
  265. return readName(source, pos, line, col, prev);
  266. // - 0-9
  267. case 45:
  268. case 48:
  269. case 49:
  270. case 50:
  271. case 51:
  272. case 52:
  273. case 53:
  274. case 54:
  275. case 55:
  276. case 56:
  277. case 57:
  278. return readNumber(source, pos, code, line, col, prev);
  279. // "
  280. case 34:
  281. if (body.charCodeAt(pos + 1) === 34 && body.charCodeAt(pos + 2) === 34) {
  282. return readBlockString(source, pos, line, col, prev, lexer);
  283. }
  284. return readString(source, pos, line, col, prev);
  285. }
  286. throw syntaxError(source, pos, unexpectedCharacterMessage(code));
  287. }
  288. /**
  289. * Report a message that an unexpected character was encountered.
  290. */
  291. function unexpectedCharacterMessage(code) {
  292. if (code < 0x0020 && code !== 0x0009 && code !== 0x000a && code !== 0x000d) {
  293. return `Cannot contain the invalid character ${printCharCode(code)}.`;
  294. }
  295. if (code === 39) {
  296. // '
  297. return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
  298. }
  299. return `Cannot parse the unexpected character ${printCharCode(code)}.`;
  300. }
  301. /**
  302. * Reads from body starting at startPosition until it finds a non-whitespace
  303. * character, then returns the position of that character for lexing.
  304. */
  305. function positionAfterWhitespace(
  306. body: string,
  307. startPosition: number,
  308. lexer: Lexer<mixed>,
  309. ): number {
  310. const bodyLength = body.length;
  311. let position = startPosition;
  312. while (position < bodyLength) {
  313. const code = body.charCodeAt(position);
  314. // tab | space | comma | BOM
  315. if (code === 9 || code === 32 || code === 44 || code === 0xfeff) {
  316. ++position;
  317. } else if (code === 10) {
  318. // new line
  319. ++position;
  320. ++lexer.line;
  321. lexer.lineStart = position;
  322. } else if (code === 13) {
  323. // carriage return
  324. if (body.charCodeAt(position + 1) === 10) {
  325. position += 2;
  326. } else {
  327. ++position;
  328. }
  329. ++lexer.line;
  330. lexer.lineStart = position;
  331. } else {
  332. break;
  333. }
  334. }
  335. return position;
  336. }
  337. /**
  338. * Reads a comment token from the source file.
  339. *
  340. * #[\u0009\u0020-\uFFFF]*
  341. */
  342. function readComment(source, start, line, col, prev): Token {
  343. const body = source.body;
  344. let code;
  345. let position = start;
  346. do {
  347. code = body.charCodeAt(++position);
  348. } while (
  349. !isNaN(code) &&
  350. // SourceCharacter but not LineTerminator
  351. (code > 0x001f || code === 0x0009)
  352. );
  353. return new Tok(
  354. TokenKind.COMMENT,
  355. start,
  356. position,
  357. line,
  358. col,
  359. prev,
  360. body.slice(start + 1, position),
  361. );
  362. }
  363. /**
  364. * Reads a number token from the source file, either a float
  365. * or an int depending on whether a decimal point appears.
  366. *
  367. * Int: -?(0|[1-9][0-9]*)
  368. * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
  369. */
  370. function readNumber(source, start, firstCode, line, col, prev): Token {
  371. const body = source.body;
  372. let code = firstCode;
  373. let position = start;
  374. let isFloat = false;
  375. if (code === 45) {
  376. // -
  377. code = body.charCodeAt(++position);
  378. }
  379. if (code === 48) {
  380. // 0
  381. code = body.charCodeAt(++position);
  382. if (code >= 48 && code <= 57) {
  383. throw syntaxError(
  384. source,
  385. position,
  386. `Invalid number, unexpected digit after 0: ${printCharCode(code)}.`,
  387. );
  388. }
  389. } else {
  390. position = readDigits(source, position, code);
  391. code = body.charCodeAt(position);
  392. }
  393. if (code === 46) {
  394. // .
  395. isFloat = true;
  396. code = body.charCodeAt(++position);
  397. position = readDigits(source, position, code);
  398. code = body.charCodeAt(position);
  399. }
  400. if (code === 69 || code === 101) {
  401. // E e
  402. isFloat = true;
  403. code = body.charCodeAt(++position);
  404. if (code === 43 || code === 45) {
  405. // + -
  406. code = body.charCodeAt(++position);
  407. }
  408. position = readDigits(source, position, code);
  409. code = body.charCodeAt(position);
  410. }
  411. // Numbers cannot be followed by . or e
  412. if (code === 46 || code === 69 || code === 101) {
  413. throw syntaxError(
  414. source,
  415. position,
  416. `Invalid number, expected digit but got: ${printCharCode(code)}.`,
  417. );
  418. }
  419. return new Tok(
  420. isFloat ? TokenKind.FLOAT : TokenKind.INT,
  421. start,
  422. position,
  423. line,
  424. col,
  425. prev,
  426. body.slice(start, position),
  427. );
  428. }
  429. /**
  430. * Returns the new position in the source after reading digits.
  431. */
  432. function readDigits(source, start, firstCode) {
  433. const body = source.body;
  434. let position = start;
  435. let code = firstCode;
  436. if (code >= 48 && code <= 57) {
  437. // 0 - 9
  438. do {
  439. code = body.charCodeAt(++position);
  440. } while (code >= 48 && code <= 57); // 0 - 9
  441. return position;
  442. }
  443. throw syntaxError(
  444. source,
  445. position,
  446. `Invalid number, expected digit but got: ${printCharCode(code)}.`,
  447. );
  448. }
  449. /**
  450. * Reads a string token from the source file.
  451. *
  452. * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
  453. */
  454. function readString(source, start, line, col, prev): Token {
  455. const body = source.body;
  456. let position = start + 1;
  457. let chunkStart = position;
  458. let code = 0;
  459. let value = '';
  460. while (
  461. position < body.length &&
  462. !isNaN((code = body.charCodeAt(position))) &&
  463. // not LineTerminator
  464. code !== 0x000a &&
  465. code !== 0x000d
  466. ) {
  467. // Closing Quote (")
  468. if (code === 34) {
  469. value += body.slice(chunkStart, position);
  470. return new Tok(
  471. TokenKind.STRING,
  472. start,
  473. position + 1,
  474. line,
  475. col,
  476. prev,
  477. value,
  478. );
  479. }
  480. // SourceCharacter
  481. if (code < 0x0020 && code !== 0x0009) {
  482. throw syntaxError(
  483. source,
  484. position,
  485. `Invalid character within String: ${printCharCode(code)}.`,
  486. );
  487. }
  488. ++position;
  489. if (code === 92) {
  490. // \
  491. value += body.slice(chunkStart, position - 1);
  492. code = body.charCodeAt(position);
  493. switch (code) {
  494. case 34:
  495. value += '"';
  496. break;
  497. case 47:
  498. value += '/';
  499. break;
  500. case 92:
  501. value += '\\';
  502. break;
  503. case 98:
  504. value += '\b';
  505. break;
  506. case 102:
  507. value += '\f';
  508. break;
  509. case 110:
  510. value += '\n';
  511. break;
  512. case 114:
  513. value += '\r';
  514. break;
  515. case 116:
  516. value += '\t';
  517. break;
  518. case 117: {
  519. // uXXXX
  520. const charCode = uniCharCode(
  521. body.charCodeAt(position + 1),
  522. body.charCodeAt(position + 2),
  523. body.charCodeAt(position + 3),
  524. body.charCodeAt(position + 4),
  525. );
  526. if (charCode < 0) {
  527. const invalidSequence = body.slice(position + 1, position + 5);
  528. throw syntaxError(
  529. source,
  530. position,
  531. `Invalid character escape sequence: \\u${invalidSequence}.`,
  532. );
  533. }
  534. value += String.fromCharCode(charCode);
  535. position += 4;
  536. break;
  537. }
  538. default:
  539. throw syntaxError(
  540. source,
  541. position,
  542. `Invalid character escape sequence: \\${String.fromCharCode(
  543. code,
  544. )}.`,
  545. );
  546. }
  547. ++position;
  548. chunkStart = position;
  549. }
  550. }
  551. throw syntaxError(source, position, 'Unterminated string.');
  552. }
  553. /**
  554. * Reads a block string token from the source file.
  555. *
  556. * """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
  557. */
  558. function readBlockString(source, start, line, col, prev, lexer): Token {
  559. const body = source.body;
  560. let position = start + 3;
  561. let chunkStart = position;
  562. let code = 0;
  563. let rawValue = '';
  564. while (position < body.length && !isNaN((code = body.charCodeAt(position)))) {
  565. // Closing Triple-Quote (""")
  566. if (
  567. code === 34 &&
  568. body.charCodeAt(position + 1) === 34 &&
  569. body.charCodeAt(position + 2) === 34
  570. ) {
  571. rawValue += body.slice(chunkStart, position);
  572. return new Tok(
  573. TokenKind.BLOCK_STRING,
  574. start,
  575. position + 3,
  576. line,
  577. col,
  578. prev,
  579. dedentBlockStringValue(rawValue),
  580. );
  581. }
  582. // SourceCharacter
  583. if (
  584. code < 0x0020 &&
  585. code !== 0x0009 &&
  586. code !== 0x000a &&
  587. code !== 0x000d
  588. ) {
  589. throw syntaxError(
  590. source,
  591. position,
  592. `Invalid character within String: ${printCharCode(code)}.`,
  593. );
  594. }
  595. if (code === 10) {
  596. // new line
  597. ++position;
  598. ++lexer.line;
  599. lexer.lineStart = position;
  600. } else if (code === 13) {
  601. // carriage return
  602. if (body.charCodeAt(position + 1) === 10) {
  603. position += 2;
  604. } else {
  605. ++position;
  606. }
  607. ++lexer.line;
  608. lexer.lineStart = position;
  609. } else if (
  610. // Escape Triple-Quote (\""")
  611. code === 92 &&
  612. body.charCodeAt(position + 1) === 34 &&
  613. body.charCodeAt(position + 2) === 34 &&
  614. body.charCodeAt(position + 3) === 34
  615. ) {
  616. rawValue += body.slice(chunkStart, position) + '"""';
  617. position += 4;
  618. chunkStart = position;
  619. } else {
  620. ++position;
  621. }
  622. }
  623. throw syntaxError(source, position, 'Unterminated string.');
  624. }
  625. /**
  626. * Converts four hexadecimal chars to the integer that the
  627. * string represents. For example, uniCharCode('0','0','0','f')
  628. * will return 15, and uniCharCode('0','0','f','f') returns 255.
  629. *
  630. * Returns a negative number on error, if a char was invalid.
  631. *
  632. * This is implemented by noting that char2hex() returns -1 on error,
  633. * which means the result of ORing the char2hex() will also be negative.
  634. */
  635. function uniCharCode(a, b, c, d) {
  636. return (
  637. (char2hex(a) << 12) | (char2hex(b) << 8) | (char2hex(c) << 4) | char2hex(d)
  638. );
  639. }
  /**
   * Maps the character code of a hexadecimal digit to its numeric value.
   * '0' becomes 0, '9' becomes 9
   * 'A' becomes 10, 'F' becomes 15
   * 'a' becomes 10, 'f' becomes 15
   *
   * Returns -1 for any other input.
   */
  function char2hex(a) {
    // 0-9
    if (a >= 48 && a <= 57) {
      return a - 48;
    }
    // A-F
    if (a >= 65 && a <= 70) {
      return a - 55;
    }
    // a-f
    if (a >= 97 && a <= 102) {
      return a - 87;
    }
    return -1;
  }
  657. /**
  658. * Reads an alphanumeric + underscore name from the source.
  659. *
  660. * [_A-Za-z][_0-9A-Za-z]*
  661. */
  662. function readName(source, start, line, col, prev): Token {
  663. const body = source.body;
  664. const bodyLength = body.length;
  665. let position = start + 1;
  666. let code = 0;
  667. while (
  668. position !== bodyLength &&
  669. !isNaN((code = body.charCodeAt(position))) &&
  670. (code === 95 || // _
  671. (code >= 48 && code <= 57) || // 0-9
  672. (code >= 65 && code <= 90) || // A-Z
  673. (code >= 97 && code <= 122)) // a-z
  674. ) {
  675. ++position;
  676. }
  677. return new Tok(
  678. TokenKind.NAME,
  679. start,
  680. position,
  681. line,
  682. col,
  683. prev,
  684. body.slice(start, position),
  685. );
  686. }