// lexer.mjs
import defineToJSON from '../jsutils/defineToJSON';
import { syntaxError } from '../error/syntaxError';
import { dedentBlockStringValue } from './blockString';
import { TokenKind } from './tokenKind';
  5. /**
  6. * Given a Source object, this returns a Lexer for that source.
  7. * A Lexer is a stateful stream generator in that every time
  8. * it is advanced, it returns the next token in the Source. Assuming the
  9. * source lexes, the final Token emitted by the lexer will be of kind
  10. * EOF, after which the lexer will repeatedly return the same EOF token
  11. * whenever called.
  12. */
  13. export function createLexer(source, options) {
  14. var startOfFileToken = new Tok(TokenKind.SOF, 0, 0, 0, 0, null);
  15. var lexer = {
  16. source: source,
  17. options: options,
  18. lastToken: startOfFileToken,
  19. token: startOfFileToken,
  20. line: 1,
  21. lineStart: 0,
  22. advance: advanceLexer,
  23. lookahead: lookahead
  24. };
  25. return lexer;
  26. }
  27. function advanceLexer() {
  28. this.lastToken = this.token;
  29. var token = this.token = this.lookahead();
  30. return token;
  31. }
  32. function lookahead() {
  33. var token = this.token;
  34. if (token.kind !== TokenKind.EOF) {
  35. do {
  36. // Note: next is only mutable during parsing, so we cast to allow this.
  37. token = token.next || (token.next = readToken(this, token));
  38. } while (token.kind === TokenKind.COMMENT);
  39. }
  40. return token;
  41. }
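// The return type of createLexer.
// (No declaration follows this comment in this file; createLexer returns the
// plain `lexer` object literal it builds.)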
  45. // @internal
  46. export function isPunctuatorToken(token) {
  47. var kind = token.kind;
  48. return kind === TokenKind.BANG || kind === TokenKind.DOLLAR || kind === TokenKind.AMP || kind === TokenKind.PAREN_L || kind === TokenKind.PAREN_R || kind === TokenKind.SPREAD || kind === TokenKind.COLON || kind === TokenKind.EQUALS || kind === TokenKind.AT || kind === TokenKind.BRACKET_L || kind === TokenKind.BRACKET_R || kind === TokenKind.BRACE_L || kind === TokenKind.PIPE || kind === TokenKind.BRACE_R;
  49. }
  50. /**
  51. * Helper function for constructing the Token object.
  52. */
  53. function Tok(kind, start, end, line, column, prev, value) {
  54. this.kind = kind;
  55. this.start = start;
  56. this.end = end;
  57. this.line = line;
  58. this.column = column;
  59. this.value = value;
  60. this.prev = prev;
  61. this.next = null;
  62. } // Print a simplified form when appearing in JSON/util.inspect.
  63. defineToJSON(Tok, function () {
  64. return {
  65. kind: this.kind,
  66. value: this.value,
  67. line: this.line,
  68. column: this.column
  69. };
  70. });
  71. function printCharCode(code) {
  72. return (// NaN/undefined represents access beyond the end of the file.
  73. isNaN(code) ? TokenKind.EOF : // Trust JSON for ASCII.
  74. code < 0x007f ? JSON.stringify(String.fromCharCode(code)) : // Otherwise print the escaped form.
  75. "\"\\u".concat(('00' + code.toString(16).toUpperCase()).slice(-4), "\"")
  76. );
  77. }
  78. /**
  79. * Gets the next token from the source starting at the given position.
  80. *
  81. * This skips over whitespace until it finds the next lexable token, then lexes
  82. * punctuators immediately or calls the appropriate helper function for more
  83. * complicated tokens.
  84. */
  85. function readToken(lexer, prev) {
  86. var source = lexer.source;
  87. var body = source.body;
  88. var bodyLength = body.length;
  89. var pos = positionAfterWhitespace(body, prev.end, lexer);
  90. var line = lexer.line;
  91. var col = 1 + pos - lexer.lineStart;
  92. if (pos >= bodyLength) {
  93. return new Tok(TokenKind.EOF, bodyLength, bodyLength, line, col, prev);
  94. }
  95. var code = body.charCodeAt(pos); // SourceCharacter
  96. switch (code) {
  97. // !
  98. case 33:
  99. return new Tok(TokenKind.BANG, pos, pos + 1, line, col, prev);
  100. // #
  101. case 35:
  102. return readComment(source, pos, line, col, prev);
  103. // $
  104. case 36:
  105. return new Tok(TokenKind.DOLLAR, pos, pos + 1, line, col, prev);
  106. // &
  107. case 38:
  108. return new Tok(TokenKind.AMP, pos, pos + 1, line, col, prev);
  109. // (
  110. case 40:
  111. return new Tok(TokenKind.PAREN_L, pos, pos + 1, line, col, prev);
  112. // )
  113. case 41:
  114. return new Tok(TokenKind.PAREN_R, pos, pos + 1, line, col, prev);
  115. // .
  116. case 46:
  117. if (body.charCodeAt(pos + 1) === 46 && body.charCodeAt(pos + 2) === 46) {
  118. return new Tok(TokenKind.SPREAD, pos, pos + 3, line, col, prev);
  119. }
  120. break;
  121. // :
  122. case 58:
  123. return new Tok(TokenKind.COLON, pos, pos + 1, line, col, prev);
  124. // =
  125. case 61:
  126. return new Tok(TokenKind.EQUALS, pos, pos + 1, line, col, prev);
  127. // @
  128. case 64:
  129. return new Tok(TokenKind.AT, pos, pos + 1, line, col, prev);
  130. // [
  131. case 91:
  132. return new Tok(TokenKind.BRACKET_L, pos, pos + 1, line, col, prev);
  133. // ]
  134. case 93:
  135. return new Tok(TokenKind.BRACKET_R, pos, pos + 1, line, col, prev);
  136. // {
  137. case 123:
  138. return new Tok(TokenKind.BRACE_L, pos, pos + 1, line, col, prev);
  139. // |
  140. case 124:
  141. return new Tok(TokenKind.PIPE, pos, pos + 1, line, col, prev);
  142. // }
  143. case 125:
  144. return new Tok(TokenKind.BRACE_R, pos, pos + 1, line, col, prev);
  145. // A-Z _ a-z
  146. case 65:
  147. case 66:
  148. case 67:
  149. case 68:
  150. case 69:
  151. case 70:
  152. case 71:
  153. case 72:
  154. case 73:
  155. case 74:
  156. case 75:
  157. case 76:
  158. case 77:
  159. case 78:
  160. case 79:
  161. case 80:
  162. case 81:
  163. case 82:
  164. case 83:
  165. case 84:
  166. case 85:
  167. case 86:
  168. case 87:
  169. case 88:
  170. case 89:
  171. case 90:
  172. case 95:
  173. case 97:
  174. case 98:
  175. case 99:
  176. case 100:
  177. case 101:
  178. case 102:
  179. case 103:
  180. case 104:
  181. case 105:
  182. case 106:
  183. case 107:
  184. case 108:
  185. case 109:
  186. case 110:
  187. case 111:
  188. case 112:
  189. case 113:
  190. case 114:
  191. case 115:
  192. case 116:
  193. case 117:
  194. case 118:
  195. case 119:
  196. case 120:
  197. case 121:
  198. case 122:
  199. return readName(source, pos, line, col, prev);
  200. // - 0-9
  201. case 45:
  202. case 48:
  203. case 49:
  204. case 50:
  205. case 51:
  206. case 52:
  207. case 53:
  208. case 54:
  209. case 55:
  210. case 56:
  211. case 57:
  212. return readNumber(source, pos, code, line, col, prev);
  213. // "
  214. case 34:
  215. if (body.charCodeAt(pos + 1) === 34 && body.charCodeAt(pos + 2) === 34) {
  216. return readBlockString(source, pos, line, col, prev, lexer);
  217. }
  218. return readString(source, pos, line, col, prev);
  219. }
  220. throw syntaxError(source, pos, unexpectedCharacterMessage(code));
  221. }
  222. /**
  223. * Report a message that an unexpected character was encountered.
  224. */
  225. function unexpectedCharacterMessage(code) {
  226. if (code < 0x0020 && code !== 0x0009 && code !== 0x000a && code !== 0x000d) {
  227. return "Cannot contain the invalid character ".concat(printCharCode(code), ".");
  228. }
  229. if (code === 39) {
  230. // '
  231. return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
  232. }
  233. return "Cannot parse the unexpected character ".concat(printCharCode(code), ".");
  234. }
  235. /**
  236. * Reads from body starting at startPosition until it finds a non-whitespace
  237. * character, then returns the position of that character for lexing.
  238. */
  239. function positionAfterWhitespace(body, startPosition, lexer) {
  240. var bodyLength = body.length;
  241. var position = startPosition;
  242. while (position < bodyLength) {
  243. var code = body.charCodeAt(position); // tab | space | comma | BOM
  244. if (code === 9 || code === 32 || code === 44 || code === 0xfeff) {
  245. ++position;
  246. } else if (code === 10) {
  247. // new line
  248. ++position;
  249. ++lexer.line;
  250. lexer.lineStart = position;
  251. } else if (code === 13) {
  252. // carriage return
  253. if (body.charCodeAt(position + 1) === 10) {
  254. position += 2;
  255. } else {
  256. ++position;
  257. }
  258. ++lexer.line;
  259. lexer.lineStart = position;
  260. } else {
  261. break;
  262. }
  263. }
  264. return position;
  265. }
  266. /**
  267. * Reads a comment token from the source file.
  268. *
  269. * #[\u0009\u0020-\uFFFF]*
  270. */
  271. function readComment(source, start, line, col, prev) {
  272. var body = source.body;
  273. var code;
  274. var position = start;
  275. do {
  276. code = body.charCodeAt(++position);
  277. } while (!isNaN(code) && ( // SourceCharacter but not LineTerminator
  278. code > 0x001f || code === 0x0009));
  279. return new Tok(TokenKind.COMMENT, start, position, line, col, prev, body.slice(start + 1, position));
  280. }
  281. /**
  282. * Reads a number token from the source file, either a float
  283. * or an int depending on whether a decimal point appears.
  284. *
  285. * Int: -?(0|[1-9][0-9]*)
  286. * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
  287. */
  288. function readNumber(source, start, firstCode, line, col, prev) {
  289. var body = source.body;
  290. var code = firstCode;
  291. var position = start;
  292. var isFloat = false;
  293. if (code === 45) {
  294. // -
  295. code = body.charCodeAt(++position);
  296. }
  297. if (code === 48) {
  298. // 0
  299. code = body.charCodeAt(++position);
  300. if (code >= 48 && code <= 57) {
  301. throw syntaxError(source, position, "Invalid number, unexpected digit after 0: ".concat(printCharCode(code), "."));
  302. }
  303. } else {
  304. position = readDigits(source, position, code);
  305. code = body.charCodeAt(position);
  306. }
  307. if (code === 46) {
  308. // .
  309. isFloat = true;
  310. code = body.charCodeAt(++position);
  311. position = readDigits(source, position, code);
  312. code = body.charCodeAt(position);
  313. }
  314. if (code === 69 || code === 101) {
  315. // E e
  316. isFloat = true;
  317. code = body.charCodeAt(++position);
  318. if (code === 43 || code === 45) {
  319. // + -
  320. code = body.charCodeAt(++position);
  321. }
  322. position = readDigits(source, position, code);
  323. code = body.charCodeAt(position);
  324. } // Numbers cannot be followed by . or e
  325. if (code === 46 || code === 69 || code === 101) {
  326. throw syntaxError(source, position, "Invalid number, expected digit but got: ".concat(printCharCode(code), "."));
  327. }
  328. return new Tok(isFloat ? TokenKind.FLOAT : TokenKind.INT, start, position, line, col, prev, body.slice(start, position));
  329. }
  330. /**
  331. * Returns the new position in the source after reading digits.
  332. */
  333. function readDigits(source, start, firstCode) {
  334. var body = source.body;
  335. var position = start;
  336. var code = firstCode;
  337. if (code >= 48 && code <= 57) {
  338. // 0 - 9
  339. do {
  340. code = body.charCodeAt(++position);
  341. } while (code >= 48 && code <= 57); // 0 - 9
  342. return position;
  343. }
  344. throw syntaxError(source, position, "Invalid number, expected digit but got: ".concat(printCharCode(code), "."));
  345. }
  346. /**
  347. * Reads a string token from the source file.
  348. *
  349. * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
  350. */
  351. function readString(source, start, line, col, prev) {
  352. var body = source.body;
  353. var position = start + 1;
  354. var chunkStart = position;
  355. var code = 0;
  356. var value = '';
  357. while (position < body.length && !isNaN(code = body.charCodeAt(position)) && // not LineTerminator
  358. code !== 0x000a && code !== 0x000d) {
  359. // Closing Quote (")
  360. if (code === 34) {
  361. value += body.slice(chunkStart, position);
  362. return new Tok(TokenKind.STRING, start, position + 1, line, col, prev, value);
  363. } // SourceCharacter
  364. if (code < 0x0020 && code !== 0x0009) {
  365. throw syntaxError(source, position, "Invalid character within String: ".concat(printCharCode(code), "."));
  366. }
  367. ++position;
  368. if (code === 92) {
  369. // \
  370. value += body.slice(chunkStart, position - 1);
  371. code = body.charCodeAt(position);
  372. switch (code) {
  373. case 34:
  374. value += '"';
  375. break;
  376. case 47:
  377. value += '/';
  378. break;
  379. case 92:
  380. value += '\\';
  381. break;
  382. case 98:
  383. value += '\b';
  384. break;
  385. case 102:
  386. value += '\f';
  387. break;
  388. case 110:
  389. value += '\n';
  390. break;
  391. case 114:
  392. value += '\r';
  393. break;
  394. case 116:
  395. value += '\t';
  396. break;
  397. case 117:
  398. {
  399. // uXXXX
  400. var charCode = uniCharCode(body.charCodeAt(position + 1), body.charCodeAt(position + 2), body.charCodeAt(position + 3), body.charCodeAt(position + 4));
  401. if (charCode < 0) {
  402. var invalidSequence = body.slice(position + 1, position + 5);
  403. throw syntaxError(source, position, "Invalid character escape sequence: \\u".concat(invalidSequence, "."));
  404. }
  405. value += String.fromCharCode(charCode);
  406. position += 4;
  407. break;
  408. }
  409. default:
  410. throw syntaxError(source, position, "Invalid character escape sequence: \\".concat(String.fromCharCode(code), "."));
  411. }
  412. ++position;
  413. chunkStart = position;
  414. }
  415. }
  416. throw syntaxError(source, position, 'Unterminated string.');
  417. }
  418. /**
  419. * Reads a block string token from the source file.
  420. *
  421. * """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
  422. */
  423. function readBlockString(source, start, line, col, prev, lexer) {
  424. var body = source.body;
  425. var position = start + 3;
  426. var chunkStart = position;
  427. var code = 0;
  428. var rawValue = '';
  429. while (position < body.length && !isNaN(code = body.charCodeAt(position))) {
  430. // Closing Triple-Quote (""")
  431. if (code === 34 && body.charCodeAt(position + 1) === 34 && body.charCodeAt(position + 2) === 34) {
  432. rawValue += body.slice(chunkStart, position);
  433. return new Tok(TokenKind.BLOCK_STRING, start, position + 3, line, col, prev, dedentBlockStringValue(rawValue));
  434. } // SourceCharacter
  435. if (code < 0x0020 && code !== 0x0009 && code !== 0x000a && code !== 0x000d) {
  436. throw syntaxError(source, position, "Invalid character within String: ".concat(printCharCode(code), "."));
  437. }
  438. if (code === 10) {
  439. // new line
  440. ++position;
  441. ++lexer.line;
  442. lexer.lineStart = position;
  443. } else if (code === 13) {
  444. // carriage return
  445. if (body.charCodeAt(position + 1) === 10) {
  446. position += 2;
  447. } else {
  448. ++position;
  449. }
  450. ++lexer.line;
  451. lexer.lineStart = position;
  452. } else if ( // Escape Triple-Quote (\""")
  453. code === 92 && body.charCodeAt(position + 1) === 34 && body.charCodeAt(position + 2) === 34 && body.charCodeAt(position + 3) === 34) {
  454. rawValue += body.slice(chunkStart, position) + '"""';
  455. position += 4;
  456. chunkStart = position;
  457. } else {
  458. ++position;
  459. }
  460. }
  461. throw syntaxError(source, position, 'Unterminated string.');
  462. }
  463. /**
  464. * Converts four hexadecimal chars to the integer that the
  465. * string represents. For example, uniCharCode('0','0','0','f')
  466. * will return 15, and uniCharCode('0','0','f','f') returns 255.
  467. *
  468. * Returns a negative number on error, if a char was invalid.
  469. *
  470. * This is implemented by noting that char2hex() returns -1 on error,
  471. * which means the result of ORing the char2hex() will also be negative.
  472. */
  473. function uniCharCode(a, b, c, d) {
  474. return char2hex(a) << 12 | char2hex(b) << 8 | char2hex(c) << 4 | char2hex(d);
  475. }
  476. /**
  477. * Converts a hex character to its integer value.
  478. * '0' becomes 0, '9' becomes 9
  479. * 'A' becomes 10, 'F' becomes 15
  480. * 'a' becomes 10, 'f' becomes 15
  481. *
  482. * Returns -1 on error.
  483. */
  484. function char2hex(a) {
  485. return a >= 48 && a <= 57 ? a - 48 // 0-9
  486. : a >= 65 && a <= 70 ? a - 55 // A-F
  487. : a >= 97 && a <= 102 ? a - 87 // a-f
  488. : -1;
  489. }
  490. /**
  491. * Reads an alphanumeric + underscore name from the source.
  492. *
  493. * [_A-Za-z][_0-9A-Za-z]*
  494. */
  495. function readName(source, start, line, col, prev) {
  496. var body = source.body;
  497. var bodyLength = body.length;
  498. var position = start + 1;
  499. var code = 0;
  500. while (position !== bodyLength && !isNaN(code = body.charCodeAt(position)) && (code === 95 || // _
  501. code >= 48 && code <= 57 || // 0-9
  502. code >= 65 && code <= 90 || // A-Z
  503. code >= 97 && code <= 122) // a-z
  504. ) {
  505. ++position;
  506. }
  507. return new Tok(TokenKind.NAME, start, position, line, col, prev, body.slice(start, position));
  508. }