lexer.js 16 KB


  1. "use strict";
  2. Object.defineProperty(exports, "__esModule", {
  3. value: true
  4. });
  5. exports.createLexer = createLexer;
  6. exports.isPunctuatorToken = isPunctuatorToken;
  7. var _defineToJSON = _interopRequireDefault(require("../jsutils/defineToJSON"));
  8. var _syntaxError = require("../error/syntaxError");
  9. var _blockString = require("./blockString");
  10. var _tokenKind = require("./tokenKind");
  /**
   * Interop helper for required modules: a value flagged with a truthy
   * `__esModule` property is returned unchanged, anything else (including
   * null/undefined) is wrapped as `{ default: value }`.
   */
  function _interopRequireDefault(obj) {
    if (obj && obj.__esModule) {
      return obj;
    }
    return { default: obj };
  }
  /**
   * Builds a Lexer for the given Source object.
   *
   * The Lexer is a stateful token stream: each call to `advance()` moves to
   * (and returns) the next token of the Source, while `lookahead()` returns
   * that next token without moving. Provided the source lexes, the last token
   * produced has kind EOF, and every later call keeps returning that same EOF
   * token.
   *
   * The lexer starts on a synthetic SOF token, with `line` at 1 and
   * `lineStart` at offset 0.
   */
  function createLexer(source, options) {
    // `lastToken` and `token` initially share one start-of-file token.
    var sofToken = new Tok(_tokenKind.TokenKind.SOF, 0, 0, 0, 0, null);
    return {
      source: source,
      options: options,
      lastToken: sofToken,
      token: sofToken,
      line: 1,
      lineStart: 0,
      advance: advanceLexer,
      lookahead: lookahead
    };
  }
  /**
   * Lexer `advance` method (invoked with the lexer as `this`).
   *
   * Remembers the current token in `lastToken`, then replaces the current
   * token with whatever `lookahead()` yields and returns that token.
   */
  function advanceLexer() {
    // Record the outgoing token before looking ahead.
    this.lastToken = this.token;
    var upcoming = this.lookahead();
    this.token = upcoming;
    return upcoming;
  }
  /**
   * Lexer `lookahead` method (invoked with the lexer as `this`).
   *
   * Returns the next non-comment token after the current one without changing
   * `this.token`. Tokens are read on demand with readToken() and cached on the
   * `next` link of their predecessor, so repeated calls do not re-lex. Once
   * the current token is EOF, that same token is returned.
   */
  function lookahead() {
    var cursor = this.token;
    if (cursor.kind === _tokenKind.TokenKind.EOF) {
      return cursor;
    }
    do {
      // Lex the successor only if it has not been linked yet.
      if (!cursor.next) {
        cursor.next = readToken(this, cursor);
      }
      cursor = cursor.next;
    } while (cursor.kind === _tokenKind.TokenKind.COMMENT);
    return cursor;
  }
  49. /**
  50. * The return type of createLexer.
  51. */
  // @internal

  /**
   * Tells whether the token's kind is one of the punctuator kinds:
   * ! $ & ( ) ... : = @ [ ] { | }
   */
  function isPunctuatorToken(token) {
    var Kind = _tokenKind.TokenKind;
    switch (token.kind) {
      case Kind.BANG:
      case Kind.DOLLAR:
      case Kind.AMP:
      case Kind.PAREN_L:
      case Kind.PAREN_R:
      case Kind.SPREAD:
      case Kind.COLON:
      case Kind.EQUALS:
      case Kind.AT:
      case Kind.BRACKET_L:
      case Kind.BRACKET_R:
      case Kind.BRACE_L:
      case Kind.PIPE:
      case Kind.BRACE_R:
        return true;
      default:
        return false;
    }
  }
  /**
   * Constructor used by the lexer to build Token objects.
   *
   * @param kind   token kind (one of the TokenKind values)
   * @param start  offset in the source body where the token begins
   * @param end    offset in the source body just past the token
   * @param line   line number the token was found on
   * @param column column number of the token's first character
   * @param prev   the token lexed before this one (null for the first token)
   * @param value  token text for kinds that carry one; stays undefined when
   *               the argument is omitted
   */
  function Tok(kind, start, end, line, column, prev, value) {
    // What the token is and where it sits in the source.
    this.kind = kind;
    this.start = start;
    this.end = end;
    this.line = line;
    this.column = column;
    this.value = value;
    // Tokens form a doubly linked list; `next` is filled in later by the lexer.
    this.prev = prev;
    this.next = null;
  } // Print a simplified form when appearing in JSON/util.inspect.
  (0, _defineToJSON.default)(Tok, function () {
    // Only the fields useful when reading a dump; start/end offsets and the
    // prev/next links are left out.
    var summary = {};
    summary.kind = this.kind;
    summary.value = this.value;
    summary.line = this.line;
    summary.column = this.column;
    return summary;
  });
  /**
   * Renders a character code for use in an error message.
   *
   * - NaN/undefined (a read past the end of the source) prints as the EOF kind.
   * - Codes below 0x7F are printed as the JSON string of the character.
   * - Everything else is printed as a quoted, upper-case "\uXXXX" escape.
   */
  function printCharCode(code) {
    if (isNaN(code)) {
      return _tokenKind.TokenKind.EOF;
    }
    if (code < 0x007f) {
      // Trust JSON for ASCII.
      return JSON.stringify(String.fromCharCode(code));
    }
    // Left-pad the hex digits so that at least four are printed.
    var hexDigits = code.toString(16).toUpperCase();
    return '"\\u' + ('00' + hexDigits).slice(-4) + '"';
  }
  /**
   * Lexes the token that follows `prev` in the lexer's source.
   *
   * Whitespace after `prev.end` is skipped first (which may bump the lexer's
   * line bookkeeping). Punctuators are produced right here; names, numbers,
   * strings, block strings and comments are delegated to their reader
   * helpers. An EOF token is returned once the end of the body is reached, and
   * a syntax error is thrown for any character that cannot start a token.
   */
  function readToken(lexer, prev) {
    var source = lexer.source;
    var body = source.body;
    var bodyLength = body.length;
    // Line/column are read only after the whitespace scan has updated them.
    var pos = positionAfterWhitespace(body, prev.end, lexer);
    var line = lexer.line;
    var col = 1 + pos - lexer.lineStart;
    var Kind = _tokenKind.TokenKind;

    if (pos >= bodyLength) {
      return new Tok(Kind.EOF, bodyLength, bodyLength, line, col, prev);
    }

    var code = body.charCodeAt(pos); // SourceCharacter

    // A-Z _ a-z
    if (code >= 65 && code <= 90 || code === 95 || code >= 97 && code <= 122) {
      return readName(source, pos, line, col, prev);
    }

    // - 0-9
    if (code === 45 || code >= 48 && code <= 57) {
      return readNumber(source, pos, code, line, col, prev);
    }

    // #
    if (code === 35) {
      return readComment(source, pos, line, col, prev);
    }

    // " (three in a row open a block string)
    if (code === 34) {
      if (body.charCodeAt(pos + 1) === 34 && body.charCodeAt(pos + 2) === 34) {
        return readBlockString(source, pos, line, col, prev, lexer);
      }
      return readString(source, pos, line, col, prev);
    }

    // ... (a lone "." or ".." is not a token and is rejected below)
    if (code === 46 && body.charCodeAt(pos + 1) === 46 && body.charCodeAt(pos + 2) === 46) {
      return new Tok(Kind.SPREAD, pos, pos + 3, line, col, prev);
    }

    // Single-character punctuators.
    var punctuatorKind;
    var isPunctuator = true;
    switch (code) {
      case 33: // !
        punctuatorKind = Kind.BANG;
        break;
      case 36: // $
        punctuatorKind = Kind.DOLLAR;
        break;
      case 38: // &
        punctuatorKind = Kind.AMP;
        break;
      case 40: // (
        punctuatorKind = Kind.PAREN_L;
        break;
      case 41: // )
        punctuatorKind = Kind.PAREN_R;
        break;
      case 58: // :
        punctuatorKind = Kind.COLON;
        break;
      case 61: // =
        punctuatorKind = Kind.EQUALS;
        break;
      case 64: // @
        punctuatorKind = Kind.AT;
        break;
      case 91: // [
        punctuatorKind = Kind.BRACKET_L;
        break;
      case 93: // ]
        punctuatorKind = Kind.BRACKET_R;
        break;
      case 123: // {
        punctuatorKind = Kind.BRACE_L;
        break;
      case 124: // |
        punctuatorKind = Kind.PIPE;
        break;
      case 125: // }
        punctuatorKind = Kind.BRACE_R;
        break;
      default:
        isPunctuator = false;
    }

    if (isPunctuator) {
      return new Tok(punctuatorKind, pos, pos + 1, line, col, prev);
    }

    throw (0, _syntaxError.syntaxError)(source, pos, unexpectedCharacterMessage(code));
  }
  /**
   * Builds the error message for a character that cannot start a token.
   *
   * A single quote gets a dedicated hint, control characters other than tab,
   * line feed and carriage return are reported as invalid, and everything
   * else is reported as unexpected.
   */
  function unexpectedCharacterMessage(code) {
    if (code === 39) {
      // '
      return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
    }
    var isPermittedControl = code === 0x0009 || code === 0x000a || code === 0x000d;
    var prefix = code < 0x0020 && !isPermittedControl ? 'Cannot contain the invalid character ' : 'Cannot parse the unexpected character ';
    return prefix + printCharCode(code) + '.';
  }
  /**
   * Scans `body` from `startPosition` past ignorable characters (tab, space,
   * comma, BOM and line terminators) and returns the offset of the first
   * character that is not one of them, or the scan position if the body ends
   * first.
   *
   * Each line terminator (\n, \r\n or a lone \r) increments `lexer.line` and
   * moves `lexer.lineStart` to the offset just after it.
   */
  function positionAfterWhitespace(body, startPosition, lexer) {
    var end = body.length;
    var cursor = startPosition;

    while (cursor < end) {
      switch (body.charCodeAt(cursor)) {
        case 9: // tab
        case 32: // space
        case 44: // comma
        case 0xfeff: // BOM
          ++cursor;
          continue;

        case 10: // new line
          ++cursor;
          break;

        case 13: // carriage return, optionally followed by a new line
          cursor += body.charCodeAt(cursor + 1) === 10 ? 2 : 1;
          break;

        default:
          return cursor;
      }

      // Only reached after consuming a line terminator.
      ++lexer.line;
      lexer.lineStart = cursor;
    }

    return cursor;
  }
  /**
   * Lexes a comment token that begins with the "#" at `start`.
   *
   * #[\u0009\u0020-\uFFFF]*
   *
   * The comment runs up to (not including) the first control character other
   * than tab, or to the end of the body. The token value is the comment text
   * without the leading "#".
   */
  function readComment(source, start, line, col, prev) {
    var body = source.body;
    var end = start + 1;

    for (;;) {
      var code = body.charCodeAt(end);
      // Stop at the end of the body or at anything that is not a
      // SourceCharacter-but-not-LineTerminator.
      if (isNaN(code) || code <= 0x001f && code !== 0x0009) {
        break;
      }
      ++end;
    }

    return new Tok(_tokenKind.TokenKind.COMMENT, start, end, line, col, prev, body.slice(start + 1, end));
  }
  /**
   * Lexes an Int or Float token starting at `start`; `firstCode` is the
   * character code found there. The token is a Float when it contains a
   * fractional part or an exponent, otherwise an Int.
   *
   * Int: -?(0|[1-9][0-9]*)
   * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
   *
   * Throws a syntax error for a digit directly after a leading 0, for a
   * missing digit, and for a number immediately followed by "." or "e"/"E".
   */
  function readNumber(source, start, firstCode, line, col, prev) {
    var body = source.body;
    var cursor = start;
    var code = firstCode;
    var hasFraction = false;
    var hasExponent = false;

    // Optional leading minus sign.
    if (code === 45) {
      cursor += 1;
      code = body.charCodeAt(cursor);
    }

    // Integer part: either a lone 0 or a run of digits.
    if (code !== 48) {
      cursor = readDigits(source, cursor, code);
      code = body.charCodeAt(cursor);
    } else {
      cursor += 1;
      code = body.charCodeAt(cursor);
      if (code >= 48 && code <= 57) {
        throw (0, _syntaxError.syntaxError)(source, cursor, 'Invalid number, unexpected digit after 0: ' + printCharCode(code) + '.');
      }
    }

    // Fractional part: "." followed by at least one digit.
    if (code === 46) {
      hasFraction = true;
      cursor += 1;
      cursor = readDigits(source, cursor, body.charCodeAt(cursor));
      code = body.charCodeAt(cursor);
    }

    // Exponent: "E"/"e", optional sign, at least one digit.
    if (code === 69 || code === 101) {
      hasExponent = true;
      cursor += 1;
      code = body.charCodeAt(cursor);
      if (code === 43 || code === 45) {
        cursor += 1;
        code = body.charCodeAt(cursor);
      }
      cursor = readDigits(source, cursor, code);
      code = body.charCodeAt(cursor);
    }

    // Numbers cannot be followed by . or e
    if (code === 46 || code === 69 || code === 101) {
      throw (0, _syntaxError.syntaxError)(source, cursor, 'Invalid number, expected digit but got: ' + printCharCode(code) + '.');
    }

    var kind = hasFraction || hasExponent ? _tokenKind.TokenKind.FLOAT : _tokenKind.TokenKind.INT;
    return new Tok(kind, start, cursor, line, col, prev, body.slice(start, cursor));
  }
  /**
   * Consumes a run of one or more digits beginning at `start`, whose character
   * code is `firstCode`, and returns the offset just past the last digit.
   *
   * Throws a syntax error at `start` when `firstCode` is not a digit.
   */
  function readDigits(source, start, firstCode) {
    var body = source.body;

    if (!(firstCode >= 48 && firstCode <= 57)) {
      throw (0, _syntaxError.syntaxError)(source, start, 'Invalid number, expected digit but got: ' + printCharCode(firstCode) + '.');
    }

    // The first digit is already known; keep going while more digits follow.
    var end = start + 1;
    var code = body.charCodeAt(end);
    while (code >= 48 && code <= 57) {
      end += 1;
      code = body.charCodeAt(end);
    }
    return end;
  }
  /**
   * Lexes a single-line string token whose opening quote is at `start`.
   *
   * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
   *
   * The token value is the string content with escape sequences decoded.
   * Throws a syntax error for a control character other than tab, for an
   * unknown or malformed escape sequence, and for a string that reaches a
   * line terminator or the end of the body before its closing quote.
   */
  function readString(source, start, line, col, prev) {
    // Single-character escapes and, index for index, what they decode to.
    var ESCAPE_LETTERS = '"/\\bfnrt';
    var ESCAPE_VALUES = '"/\\\b\f\n\r\t';
    var body = source.body;
    var position = start + 1;
    var chunkStart = position; // start of the not-yet-copied literal run
    var value = '';
    var code;

    while (position < body.length) {
      code = body.charCodeAt(position);

      // A line terminator (or running off the body) ends the scan unterminated.
      if (isNaN(code) || code === 0x000a || code === 0x000d) {
        break;
      }

      // Closing Quote (")
      if (code === 34) {
        value += body.slice(chunkStart, position);
        return new Tok(_tokenKind.TokenKind.STRING, start, position + 1, line, col, prev, value);
      }

      // SourceCharacter
      if (code < 0x0020 && code !== 0x0009) {
        throw (0, _syntaxError.syntaxError)(source, position, 'Invalid character within String: ' + printCharCode(code) + '.');
      }

      position += 1;
      if (code !== 92) {
        continue;
      }

      // Backslash: flush the literal run before it, then decode the escape.
      value += body.slice(chunkStart, position - 1);
      code = body.charCodeAt(position);

      if (code === 117) {
        // uXXXX
        var charCode = uniCharCode(body.charCodeAt(position + 1), body.charCodeAt(position + 2), body.charCodeAt(position + 3), body.charCodeAt(position + 4));
        if (charCode < 0) {
          var invalidSequence = body.slice(position + 1, position + 5);
          throw (0, _syntaxError.syntaxError)(source, position, 'Invalid character escape sequence: \\u' + invalidSequence + '.');
        }
        value += String.fromCharCode(charCode);
        position += 4;
      } else {
        var escapeIndex = ESCAPE_LETTERS.indexOf(String.fromCharCode(code));
        if (escapeIndex < 0) {
          throw (0, _syntaxError.syntaxError)(source, position, 'Invalid character escape sequence: \\' + String.fromCharCode(code) + '.');
        }
        value += ESCAPE_VALUES.charAt(escapeIndex);
      }

      position += 1;
      chunkStart = position;
    }

    throw (0, _syntaxError.syntaxError)(source, position, 'Unterminated string.');
  }
  /**
   * Lexes a block string token whose opening triple quote is at `start`.
   *
   * """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
   *
   * The raw content (with every \""" turned into """) is passed through
   * dedentBlockStringValue() to produce the token value. Line terminators
   * inside the block string update `lexer.line` and `lexer.lineStart`.
   * Throws a syntax error for a control character other than tab, line feed
   * and carriage return, and when the closing triple quote is missing.
   */
  function readBlockString(source, start, line, col, prev, lexer) {
    var TRIPLE_QUOTE = '"""';
    var body = source.body;
    var position = start + 3;
    var chunkStart = position; // start of the not-yet-copied raw run
    var rawValue = '';
    var code;

    while (position < body.length) {
      code = body.charCodeAt(position);
      if (isNaN(code)) {
        break;
      }

      // Closing Triple-Quote (""")
      if (code === 34 && body.slice(position, position + 3) === TRIPLE_QUOTE) {
        rawValue += body.slice(chunkStart, position);
        return new Tok(_tokenKind.TokenKind.BLOCK_STRING, start, position + 3, line, col, prev, (0, _blockString.dedentBlockStringValue)(rawValue));
      }

      // SourceCharacter
      if (code < 0x0020 && code !== 0x0009 && code !== 0x000a && code !== 0x000d) {
        throw (0, _syntaxError.syntaxError)(source, position, 'Invalid character within String: ' + printCharCode(code) + '.');
      }

      switch (code) {
        case 10:
          // new line
          position += 1;
          ++lexer.line;
          lexer.lineStart = position;
          break;

        case 13:
          // carriage return, optionally followed by a new line
          position += body.charCodeAt(position + 1) === 10 ? 2 : 1;
          ++lexer.line;
          lexer.lineStart = position;
          break;

        case 92:
          if (body.slice(position + 1, position + 4) === TRIPLE_QUOTE) {
            // Escape Triple-Quote (\"""): keep the quotes, drop the backslash.
            rawValue += body.slice(chunkStart, position) + TRIPLE_QUOTE;
            position += 4;
            chunkStart = position;
          } else {
            position += 1;
          }
          break;

        default:
          position += 1;
      }
    }

    throw (0, _syntaxError.syntaxError)(source, position, 'Unterminated string.');
  }
  /**
   * Combines the character codes of four hexadecimal digits (most significant
   * first) into the integer they spell. For example, the codes of "000f"
   * give 15 and the codes of "00ff" give 255.
   *
   * Returns a negative number if any of the four is not a hex digit: char2hex()
   * yields -1 for such a character, and ORing -1 (shifted or not) into the
   * result leaves the sign bit set.
   */
  function uniCharCode(a, b, c, d) {
    var nibble3 = char2hex(a) << 12;
    var nibble2 = char2hex(b) << 8;
    var nibble1 = char2hex(c) << 4;
    var nibble0 = char2hex(d);
    return nibble3 | nibble2 | nibble1 | nibble0;
  }
  /**
   * Maps the character code of a hexadecimal digit to its numeric value:
   * '0'-'9' give 0-9, while 'A'-'F' and 'a'-'f' both give 10-15.
   *
   * Returns -1 for any other input.
   */
  function char2hex(a) {
    if (a >= 48 && a <= 57) {
      return a - 48; // 0-9
    }
    if (a >= 65 && a <= 70) {
      return a - 55; // A-F
    }
    if (a >= 97 && a <= 102) {
      return a - 87; // a-f
    }
    return -1;
  }
  /**
   * Lexes a name token starting at `start`. The caller has already checked
   * the first character, so scanning begins at the second one and continues
   * over letters, digits and underscores.
   *
   * [_A-Za-z][_0-9A-Za-z]*
   */
  function readName(source, start, line, col, prev) {
    var body = source.body;
    var bodyLength = body.length;
    var end = start + 1;

    for (; end !== bodyLength; ++end) {
      var code = body.charCodeAt(end);
      // NaN fails every comparison, so an out-of-range read also stops the scan.
      var isNameCharacter = code === 95 || // _
      code >= 48 && code <= 57 || // 0-9
      code >= 65 && code <= 90 || // A-Z
      code >= 97 && code <= 122; // a-z
      if (!isNameCharacter) {
        break;
      }
    }

    return new Tok(_tokenKind.TokenKind.NAME, start, end, line, col, prev, body.slice(start, end));
  }
  515. }