123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193 |
- var generate = require('regjsgen').generate;
- var parse = require('regjsparser').parse;
- var regenerate = require('regenerate');
- var iuMappings = require('./data/iu-mappings.json');
- var ESCAPE_SETS = require('./data/character-class-escape-sets.js');
/**
 * Looks up the precomputed set for a character class escape.
 *
 * Which table of `ESCAPE_SETS` is consulted depends on the module-level
 * `unicode` and `ignoreCase` flags:
 *   - `unicode` off                → `ESCAPE_SETS.REGULAR`
 *   - `unicode` on, `ignoreCase` on → `ESCAPE_SETS.UNICODE_IGNORE_CASE`
 *   - `unicode` on, `ignoreCase` off → `ESCAPE_SETS.UNICODE`
 *
 * @param {string} character Key identifying the escape (callers in this file
 *     pass the `value` of a `characterClassEscape` node).
 * @returns {*} Whatever the selected table stores under `character`
 *     (`undefined` if the key is absent).
 */
function getCharacterClassEscapeSet(character) {
  var table;
  if (!unicode) {
    table = ESCAPE_SETS.REGULAR;
  } else if (ignoreCase) {
    table = ESCAPE_SETS.UNICODE_IGNORE_CASE;
  } else {
    table = ESCAPE_SETS.UNICODE;
  }
  return table[character];
}
// Keep a direct reference to the native `hasOwnProperty` so the check below
// still works for objects that shadow or lack that method.
var object = {};
var hasOwnProperty = Object.prototype.hasOwnProperty;

/**
 * Tells whether `property` is an own (non-inherited) property of `object`.
 *
 * @param {Object} object The object to inspect.
 * @param {string|number} property The property key to look for.
 * @returns {boolean} `true` if `object` has `property` as an own property.
 */
function has(object, property) {
  var isOwn = hasOwnProperty.call(object, property);
  return isOwn;
}
// Without the `u` flag, the range of matchable code points stops at 0xFFFF.
// https://mths.be/es6#sec-pattern-semantics
var BMP_SET = regenerate().addRange(0x0, 0xFFFF);

// A Regenerate set containing all code points (0x0 through 0x10FFFF). It is
// the universe that negative character classes (if any) are subtracted from.
var UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);

// The code points that `/./u` is supposed to match
// (https://mths.be/es6#sec-atom): every Unicode code point except the four
// `LineTerminator`s (https://mths.be/es6#sec-line-terminators).
var DOT_SET_UNICODE = UNICODE_SET.clone().remove(
  0x000A, // Line Feed <LF>
  0x000D, // Carriage Return <CR>
  0x2028, // Line Separator <LS>
  0x2029 // Paragraph Separator <PS>
);

// The code points that `/./` (no `u` flag) is supposed to match: the same as
// above, restricted to BMP code points.
var DOT_SET = DOT_SET_UNICODE.clone().intersection(BMP_SET);
// For every code point from `min` up to `max`, look up its case-folded
// counterpart(s) via `caseFold` and add them to this set. The code points of
// the range itself are NOT added here; callers add the range separately.
// The first code point (`min`) is always examined, even if `min > max`.
// Returns the set itself so calls can be chained.
regenerate.prototype.iuAddRange = function(min, max) {
  var set = this;
  var codePoint = min;
  for (;;) {
    var mapped = caseFold(codePoint);
    if (mapped) {
      set.add(mapped);
    }
    // Advance, and stop as soon as the upper bound has been passed.
    if (!(++codePoint <= max)) {
      break;
    }
  }
  return set;
};
/**
 * Copies every enumerable property of `source` onto `target`, overwriting
 * properties that already exist. `target` is mutated in place; nothing is
 * returned.
 *
 * @param {Object} target Object that receives the properties.
 * @param {Object} source Object whose properties are copied.
 */
function assign(target, source) {
  var name;
  // Note: no `hasOwnProperty` filter is applied on purpose, so inherited
  // enumerable properties are copied as well.
  for (name in source) {
    target[name] = source[name];
  }
}
/**
 * Replaces the contents of the AST node `item` with the parse tree of
 * `pattern`, mutating `item` in place.
 *
 * `pattern` is parsed with an empty flags string. If the resulting root node
 * is a `characterClass`, `group` or `value` it is used as is; any other node
 * type is first wrapped in a non-capturing group. The result is then copied
 * onto `item` property by property. A falsy `pattern` leaves `item` untouched.
 *
 * @param {Object} item The AST node to overwrite.
 * @param {string} pattern Source of the replacement pattern.
 */
function update(item, pattern) {
  // TODO: Test if memoizing `pattern` here is worth the effort.
  if (pattern) {
    var tree = parse(pattern, '');
    var type = tree.type;
    var isSelfContained = type === 'characterClass' ||
      type === 'group' ||
      type === 'value';
    // Self-contained nodes need no wrapping; everything else goes into a
    // non-capturing group.
    assign(item, isSelfContained ? tree : wrap(tree, pattern));
  }
}
/**
 * Builds a non-capturing group node around `tree`.
 *
 * @param {Object} tree The AST node to wrap; it becomes the only entry of the
 *     group's `body`.
 * @param {string} pattern Source text of `tree`, used to build the group's
 *     `raw` text.
 * @returns {Object} A new `group` node with `behavior` set to `'ignore'`.
 */
function wrap(tree, pattern) {
  var group = {};
  group['type'] = 'group';
  group['behavior'] = 'ignore';
  group['body'] = [tree];
  group['raw'] = '(?:' + pattern + ')';
  return group;
}
/**
 * Looks up the case-folding entry for a code point in `iuMappings`.
 *
 * @param {number} codePoint The code point to look up.
 * @returns {*} The entry stored in `iuMappings` for `codePoint`, or `false`
 *     when the table has no own entry for it.
 */
function caseFold(codePoint) {
  if (!has(iuMappings, codePoint)) {
    return false;
  }
  return iuMappings[codePoint];
}
// Flag state for the pattern currently being rewritten. Both start out
// `false`; the exported function overwrites them from the flags string on
// every call, and the helpers in this file read them.
var ignoreCase = false;
var unicode = false;

/**
 * Rewrites a character class node in place.
 *
 * All members of the class are collected into a single Regenerate set. When
 * both the `ignoreCase` and `unicode` flags are set, the case-folded
 * counterparts of single values and ranges are added too. If the class is
 * negative, the collected set is subtracted from the universe of code points
 * (all of Unicode with the `u` flag, otherwise only the BMP). The node is then
 * overwritten with the parse tree of the set's string form.
 *
 * @param {Object} characterClassItem A `characterClass` AST node; mutated.
 * @returns {Object} The same `characterClassItem` object.
 * @throws {Error} If a member of the class has an unknown `type`.
 */
function processCharacterClass(characterClassItem) {
  var set = regenerate();
  // The flags cannot change while the members are being visited.
  var foldCase = ignoreCase && unicode;
  characterClassItem.body.forEach(function(item) {
    switch (item.type) {
      case 'value':
        set.add(item.codePoint);
        if (foldCase) {
          var folded = caseFold(item.codePoint);
          if (folded) {
            set.add(folded);
          }
        }
        break;
      case 'characterClassRange':
        var min = item.min.codePoint;
        var max = item.max.codePoint;
        set.addRange(min, max);
        if (foldCase) {
          set.iuAddRange(min, max);
        }
        break;
      case 'characterClassEscape':
        set.add(getCharacterClassEscapeSet(item.value));
        break;
      // The `default` clause is only here as a safeguard; it should never be
      // reached. Code coverage tools should ignore it.
      /* istanbul ignore next */
      default:
        throw new Error('Unknown term type: ' + item.type);
    }
  });
  if (characterClassItem.negative) {
    // Negation: everything in the applicable universe except what was
    // collected above.
    set = (unicode ? UNICODE_SET : BMP_SET).clone().remove(set);
  }
  update(characterClassItem, set.toString());
  return characterClassItem;
}
/**
 * Rewrites a single AST node (and, recursively, its children) in place.
 *
 *   - `dot`: replaced by the explicit set of code points `.` matches, which
 *     depends on the `unicode` flag.
 *   - `characterClass`: delegated to `processCharacterClass`.
 *   - `characterClassEscape`: replaced by the set returned from
 *     `getCharacterClassEscapeSet`.
 *   - `alternative`, `disjunction`, `group`, `quantifier`: every entry of
 *     `body` is processed recursively.
 *   - `value`: replaced by a set holding the code point plus, when both the
 *     `ignoreCase` and `unicode` flags are set, its case-folded counterpart.
 *   - `anchor`, `empty`, `reference`: left untouched.
 *
 * @param {Object} item The AST node to process; mutated.
 * @returns {Object} The processed node (the same object that was passed in,
 *     or whatever `processCharacterClass` returns for character classes).
 * @throws {Error} If `item.type` is not one of the types listed above.
 */
function processTerm(item) {
  switch (item.type) {
    case 'dot':
      update(
        item,
        (unicode ? DOT_SET_UNICODE : DOT_SET).toString()
      );
      break;
    case 'characterClass':
      item = processCharacterClass(item);
      break;
    case 'characterClassEscape':
      update(
        item,
        getCharacterClassEscapeSet(item.value).toString()
      );
      break;
    case 'alternative':
    case 'disjunction':
    case 'group':
    case 'quantifier':
      // Container nodes: recurse into every child.
      item.body = item.body.map(processTerm);
      break;
    case 'value':
      var codePoint = item.codePoint;
      var set = regenerate(codePoint);
      if (ignoreCase && unicode) {
        var folded = caseFold(codePoint);
        if (folded) {
          set.add(folded);
        }
      }
      update(item, set.toString());
      break;
    case 'anchor':
    case 'empty':
    case 'reference':
      // Nothing to do here. (`group` is handled with the container nodes
      // above; a second `case 'group'` label here would be unreachable.)
      break;
    // The `default` clause is only here as a safeguard; it should never be
    // reached. Code coverage tools should ignore it.
    /* istanbul ignore next */
    default:
      throw new Error('Unknown term type: ' + item.type);
  }
  return item;
}
- module.exports = function(pattern, flags) {
- var tree = parse(pattern, flags);
- ignoreCase = flags ? flags.indexOf('i') > -1 : false;
- unicode = flags ? flags.indexOf('u') > -1 : false;
- assign(tree, processTerm(tree));
- return generate(tree);
- };
|