123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267 |
- 'use strict';
- /*
- Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation
- by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool
- */
- function memcmp(buf1, pos1, buf2, pos2, num) {
- for (let i = 0; i < num; ++i) {
- if (buf1[pos1 + i] !== buf2[pos2 + i])
- return false;
- }
- return true;
- }
- class SBMH {
- constructor(needle, cb) {
- if (typeof cb !== 'function')
- throw new Error('Missing match callback');
- if (typeof needle === 'string')
- needle = Buffer.from(needle);
- else if (!Buffer.isBuffer(needle))
- throw new Error(`Expected Buffer for needle, got ${typeof needle}`);
- const needleLen = needle.length;
- this.maxMatches = Infinity;
- this.matches = 0;
- this._cb = cb;
- this._lookbehindSize = 0;
- this._needle = needle;
- this._bufPos = 0;
- this._lookbehind = Buffer.allocUnsafe(needleLen);
- // Initialize occurrence table.
- this._occ = [
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
- needleLen, needleLen, needleLen, needleLen
- ];
- // Populate occurrence table with analysis of the needle, ignoring the last
- // letter.
- if (needleLen > 1) {
- for (let i = 0; i < needleLen - 1; ++i)
- this._occ[needle[i]] = needleLen - 1 - i;
- }
- }
- reset() {
- this.matches = 0;
- this._lookbehindSize = 0;
- this._bufPos = 0;
- }
- push(chunk, pos) {
- let result;
- if (!Buffer.isBuffer(chunk))
- chunk = Buffer.from(chunk, 'latin1');
- const chunkLen = chunk.length;
- this._bufPos = pos || 0;
- while (result !== chunkLen && this.matches < this.maxMatches)
- result = feed(this, chunk);
- return result;
- }
- destroy() {
- const lbSize = this._lookbehindSize;
- if (lbSize)
- this._cb(false, this._lookbehind, 0, lbSize, false);
- this.reset();
- }
- }
- function feed(self, data) {
- const len = data.length;
- const needle = self._needle;
- const needleLen = needle.length;
- // Positive: points to a position in `data`
- // pos == 3 points to data[3]
- // Negative: points to a position in the lookbehind buffer
- // pos == -2 points to lookbehind[lookbehindSize - 2]
- let pos = -self._lookbehindSize;
- const lastNeedleCharPos = needleLen - 1;
- const lastNeedleChar = needle[lastNeedleCharPos];
- const end = len - needleLen;
- const occ = self._occ;
- const lookbehind = self._lookbehind;
- if (pos < 0) {
- // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool
- // search with character lookup code that considers both the
- // lookbehind buffer and the current round's haystack data.
- //
- // Loop until
- // there is a match.
- // or until
- // we've moved past the position that requires the
- // lookbehind buffer. In this case we switch to the
- // optimized loop.
- // or until
- // the character to look at lies outside the haystack.
- while (pos < 0 && pos <= end) {
- const nextPos = pos + lastNeedleCharPos;
- const ch = (nextPos < 0
- ? lookbehind[self._lookbehindSize + nextPos]
- : data[nextPos]);
- if (ch === lastNeedleChar
- && matchNeedle(self, data, pos, lastNeedleCharPos)) {
- self._lookbehindSize = 0;
- ++self.matches;
- if (pos > -self._lookbehindSize)
- self._cb(true, lookbehind, 0, self._lookbehindSize + pos, false);
- else
- self._cb(true, undefined, 0, 0, true);
- return (self._bufPos = pos + needleLen);
- }
- pos += occ[ch];
- }
- // No match.
- // There's too few data for Boyer-Moore-Horspool to run,
- // so let's use a different algorithm to skip as much as
- // we can.
- // Forward pos until
- // the trailing part of lookbehind + data
- // looks like the beginning of the needle
- // or until
- // pos == 0
- while (pos < 0 && !matchNeedle(self, data, pos, len - pos))
- ++pos;
- if (pos < 0) {
- // Cut off part of the lookbehind buffer that has
- // been processed and append the entire haystack
- // into it.
- const bytesToCutOff = self._lookbehindSize + pos;
- if (bytesToCutOff > 0) {
- // The cut off data is guaranteed not to contain the needle.
- self._cb(false, lookbehind, 0, bytesToCutOff, false);
- }
- self._lookbehindSize -= bytesToCutOff;
- lookbehind.copy(lookbehind, 0, bytesToCutOff, self._lookbehindSize);
- lookbehind.set(data, self._lookbehindSize);
- self._lookbehindSize += len;
- self._bufPos = len;
- return len;
- }
- // Discard lookbehind buffer.
- self._cb(false, lookbehind, 0, self._lookbehindSize, false);
- self._lookbehindSize = 0;
- }
- pos += self._bufPos;
- const firstNeedleChar = needle[0];
- // Lookbehind buffer is now empty. Perform Boyer-Moore-Horspool
- // search with optimized character lookup code that only considers
- // the current round's haystack data.
- while (pos <= end) {
- const ch = data[pos + lastNeedleCharPos];
- if (ch === lastNeedleChar
- && data[pos] === firstNeedleChar
- && memcmp(needle, 0, data, pos, lastNeedleCharPos)) {
- ++self.matches;
- if (pos > 0)
- self._cb(true, data, self._bufPos, pos, true);
- else
- self._cb(true, undefined, 0, 0, true);
- return (self._bufPos = pos + needleLen);
- }
- pos += occ[ch];
- }
- // There was no match. If there's trailing haystack data that we cannot
- // match yet using the Boyer-Moore-Horspool algorithm (because the trailing
- // data is less than the needle size) then match using a modified
- // algorithm that starts matching from the beginning instead of the end.
- // Whatever trailing data is left after running this algorithm is added to
- // the lookbehind buffer.
- while (pos < len) {
- if (data[pos] !== firstNeedleChar
- || !memcmp(data, pos, needle, 0, len - pos)) {
- ++pos;
- continue;
- }
- data.copy(lookbehind, 0, pos, len);
- self._lookbehindSize = len - pos;
- break;
- }
- // Everything until `pos` is guaranteed not to contain needle data.
- if (pos > 0)
- self._cb(false, data, self._bufPos, pos < len ? pos : len, true);
- self._bufPos = len;
- return len;
- }
- function matchNeedle(self, data, pos, len) {
- const lb = self._lookbehind;
- const lbSize = self._lookbehindSize;
- const needle = self._needle;
- for (let i = 0; i < len; ++i, ++pos) {
- const ch = (pos < 0 ? lb[lbSize + pos] : data[pos]);
- if (ch !== needle[i])
- return false;
- }
- return true;
- }
- module.exports = SBMH;
|