rewrite-pattern.js 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. var generate = require('regjsgen').generate;
  2. var parse = require('regjsparser').parse;
  3. var regenerate = require('regenerate');
  4. var iuMappings = require('./data/iu-mappings.json');
  5. var ESCAPE_SETS = require('./data/character-class-escape-sets.js');
  /**
   * Looks up the precomputed set for a character class escape.
   *
   * The lookup table is chosen from the module-level `unicode` and
   * `ignoreCase` flags: `REGULAR` when `unicode` is off, otherwise
   * `UNICODE_IGNORE_CASE` or `UNICODE` depending on `ignoreCase`.
   *
   * @param {string} character Key into the selected `ESCAPE_SETS` table.
   * @returns {*} Whatever the selected table stores under `character`
   *     (`undefined` when the key is absent).
   */
  function getCharacterClassEscapeSet(character) {
    if (!unicode) {
      return ESCAPE_SETS.REGULAR[character];
    }
    var table = ignoreCase ?
      ESCAPE_SETS.UNICODE_IGNORE_CASE :
      ESCAPE_SETS.UNICODE;
    return table[character];
  }
  // Grab `hasOwnProperty` once, via a throwaway object literal, so the check
  // below does not depend on the inspected object providing its own method.
  var object = {};
  var hasOwnProperty = object.hasOwnProperty;

  /**
   * Reports whether `property` is an own property of `object`.
   *
   * @param {Object} object Object to inspect.
   * @param {string|number} property Property key to look for.
   * @returns {boolean} `true` for own properties, `false` otherwise.
   */
  function has(object, property) {
    var isOwn = hasOwnProperty.call(object, property);
    return isOwn;
  }
  // All code points from U+0000 through U+10FFFF. Used as the universe when a
  // negated character class is complemented while the `u` flag is set.
  var UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);

  // Without the `u` flag the universe ends at U+FFFF.
  // https://mths.be/es6#sec-pattern-semantics
  var BMP_SET = regenerate().addRange(0x0, 0xFFFF);

  // Code points matched by `/./u` (https://mths.be/es6#sec-atom): everything
  // except the `LineTerminator`s (https://mths.be/es6#sec-line-terminators).
  var DOT_SET_UNICODE = UNICODE_SET.clone();
  DOT_SET_UNICODE.remove(
    0x000A, // Line Feed <LF>
    0x000D, // Carriage Return <CR>
    0x2028, // Line Separator <LS>
    0x2029 // Paragraph Separator <PS>
  );

  // Code points matched by `/./`: the same set, restricted to BMP code points.
  var DOT_SET = DOT_SET_UNICODE.clone();
  DOT_SET.intersection(BMP_SET);
  /**
   * Adds the case-folded counterpart (as reported by `caseFold`) of every code
   * point in `min..max` to this set. The code points of the range itself are
   * not added here.
   *
   * @param {number} min First code point of the range.
   * @param {number} max Last code point of the range (inclusive).
   * @returns {Object} This set, so calls can be chained.
   */
  regenerate.prototype.iuAddRange = function(min, max) {
    var codePoint = min;
    // A do-while is used on purpose: `min` is always examined once before the
    // upper bound is checked.
    do {
      var mapped = caseFold(codePoint);
      if (mapped) {
        this.add(mapped);
      }
      codePoint++;
    } while (codePoint <= max);
    return this;
  };
  /**
   * Copies every enumerable property of `source` onto `target`, overwriting
   * properties that already exist there.
   *
   * @param {Object} target Object that receives the properties (mutated).
   * @param {Object} source Object whose enumerable properties are copied.
   * @returns {undefined}
   */
  function assign(target, source) {
    // A plain `for...in` is intentional; no `hasOwnProperty` filter is needed
    // here.
    for (var name in source) {
      var value = source[name];
      target[name] = value;
    }
  }
  /**
   * Replaces the contents of the AST node `item` with the parse tree of
   * `pattern`.
   *
   * `pattern` is parsed with an empty flags string. If the resulting root node
   * is not a `characterClass`, `group`, or `value`, it is first wrapped in a
   * non-capturing group. A falsy `pattern` leaves `item` untouched.
   *
   * @param {Object} item AST node to overwrite in place.
   * @param {string} pattern Replacement pattern source.
   * @returns {undefined}
   */
  function update(item, pattern) {
    // TODO: Test if memoizing `pattern` here is worth the effort.
    if (pattern) {
      var parsed = parse(pattern, '');
      var rootType = parsed.type;
      var needsNoWrapping = rootType === 'characterClass' ||
        rootType === 'group' ||
        rootType === 'value';
      // Anything else is wrapped in a non-capturing group.
      assign(item, needsNoWrapping ? parsed : wrap(parsed, pattern));
    }
  }
  /**
   * Builds a non-capturing group node whose single child is `tree`.
   *
   * @param {Object} tree AST node to place inside the group.
   * @param {string} pattern Source text of `tree`; used for the group's `raw`.
   * @returns {Object} A new `group` node with `behavior` set to `'ignore'`.
   */
  function wrap(tree, pattern) {
    var group = {};
    group.type = 'group';
    group.behavior = 'ignore';
    group.body = [tree];
    group.raw = '(?:' + pattern + ')';
    return group;
  }
  /**
   * Looks up the `iuMappings` entry for a code point.
   *
   * @param {number} codePoint Code point to look up.
   * @returns {*} The entry stored in `iuMappings` under `codePoint`, or
   *     `false` when `iuMappings` has no own entry for it.
   */
  function caseFold(codePoint) {
    if (!has(iuMappings, codePoint)) {
      return false;
    }
    return iuMappings[codePoint];
  }
  // Module-level flag state. Both values are reassigned by the exported
  // function on every call and read by the helpers in this file.
  // `unicode` mirrors the presence of the `u` flag...
  var unicode = false;
  // ...and `ignoreCase` the presence of the `i` flag.
  var ignoreCase = false;
  /**
   * Rewrites a `characterClass` AST node in place.
   *
   * Every member of the class is collected into a single regenerate set
   * (together with its `caseFold` counterpart when both the `i` and `u` flags
   * are active). A negated class is turned into the complement of that set
   * relative to `UNICODE_SET` (with `u`) or `BMP_SET` (without). The node is
   * then overwritten, via `update`, with the parse tree of the set's string
   * form.
   *
   * @param {Object} characterClassItem The `characterClass` node to rewrite.
   * @returns {Object} The same node that was passed in.
   * @throws {Error} If a member has an unknown `type`.
   */
  function processCharacterClass(characterClassItem) {
    var set = regenerate();
    // Case folding only applies when both flags are set; this cannot change
    // while the members are being visited.
    var shouldFold = ignoreCase && unicode;
    // `forEach` is used purely for its side effects on `set`; it has no
    // meaningful return value, so nothing is stored.
    characterClassItem.body.forEach(function(item) {
      switch (item.type) {
        case 'value':
          set.add(item.codePoint);
          if (shouldFold) {
            var folded = caseFold(item.codePoint);
            if (folded) {
              set.add(folded);
            }
          }
          break;
        case 'characterClassRange':
          var min = item.min.codePoint;
          var max = item.max.codePoint;
          set.addRange(min, max);
          if (shouldFold) {
            set.iuAddRange(min, max);
          }
          break;
        case 'characterClassEscape':
          set.add(getCharacterClassEscapeSet(item.value));
          break;
        // The `default` clause is only here as a safeguard; it should never be
        // reached. Code coverage tools should ignore it.
        /* istanbul ignore next */
        default:
          throw Error('Unknown term type: ' + item.type);
      }
    });
    if (characterClassItem.negative) {
      // Complement against the universe that matches the active flag.
      set = (unicode ? UNICODE_SET : BMP_SET).clone().remove(set);
    }
    update(characterClassItem, set.toString());
    return characterClassItem;
  }
  /**
   * Rewrites a single AST node according to the module-level `unicode` and
   * `ignoreCase` flags, recursing into container nodes.
   *
   * - `dot`, `characterClassEscape` and `value` nodes are overwritten, via
   *   `update`, with the parse tree of an equivalent set.
   * - `characterClass` nodes are delegated to `processCharacterClass`.
   * - `alternative`, `disjunction`, `group` and `quantifier` nodes have each
   *   entry of their `body` array processed in turn.
   * - `anchor`, `empty` and `reference` nodes are left as they are.
   *
   * @param {Object} item AST node to process (mutated in place).
   * @returns {Object} The processed node.
   * @throws {Error} If `item.type` is not one of the types listed above.
   */
  function processTerm(item) {
    switch (item.type) {
      case 'dot':
        update(
          item,
          (unicode ? DOT_SET_UNICODE : DOT_SET).toString()
        );
        break;
      case 'characterClass':
        item = processCharacterClass(item);
        break;
      case 'characterClassEscape':
        update(
          item,
          getCharacterClassEscapeSet(item.value).toString()
        );
        break;
      case 'alternative':
      case 'disjunction':
      case 'group':
      case 'quantifier':
        item.body = item.body.map(processTerm);
        break;
      case 'value':
        var codePoint = item.codePoint;
        var set = regenerate(codePoint);
        if (ignoreCase && unicode) {
          var folded = caseFold(codePoint);
          if (folded) {
            set.add(folded);
          }
        }
        update(item, set.toString());
        break;
      // `group` is deliberately not listed here: it is a container and is
      // handled together with the other containers above.
      case 'anchor':
      case 'empty':
      case 'reference':
        // Nothing to do here.
        break;
      // The `default` clause is only here as a safeguard; it should never be
      // reached. Code coverage tools should ignore it.
      /* istanbul ignore next */
      default:
        throw Error('Unknown term type: ' + item.type);
    }
    return item;
  }
  175. module.exports = function(pattern, flags) {
  176. var tree = parse(pattern, flags);
  177. ignoreCase = flags ? flags.indexOf('i') > -1 : false;
  178. unicode = flags ? flags.indexOf('u') > -1 : false;
  179. assign(tree, processTerm(tree));
  180. return generate(tree);
  181. };