sbmh.js 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. 'use strict';
  2. /*
  3. Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation
  4. by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool
  5. */
  6. function memcmp(buf1, pos1, buf2, pos2, num) {
  7. for (let i = 0; i < num; ++i) {
  8. if (buf1[pos1 + i] !== buf2[pos2 + i])
  9. return false;
  10. }
  11. return true;
  12. }
  13. class SBMH {
  14. constructor(needle, cb) {
  15. if (typeof cb !== 'function')
  16. throw new Error('Missing match callback');
  17. if (typeof needle === 'string')
  18. needle = Buffer.from(needle);
  19. else if (!Buffer.isBuffer(needle))
  20. throw new Error(`Expected Buffer for needle, got ${typeof needle}`);
  21. const needleLen = needle.length;
  22. this.maxMatches = Infinity;
  23. this.matches = 0;
  24. this._cb = cb;
  25. this._lookbehindSize = 0;
  26. this._needle = needle;
  27. this._bufPos = 0;
  28. this._lookbehind = Buffer.allocUnsafe(needleLen);
  29. // Initialize occurrence table.
  30. this._occ = [
  31. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  32. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  33. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  34. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  35. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  36. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  37. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  38. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  39. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  40. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  41. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  42. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  43. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  44. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  45. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  46. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  47. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  48. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  49. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  50. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  51. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  52. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  53. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  54. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  55. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  56. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  57. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  58. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  59. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  60. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  61. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  62. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  63. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  64. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  65. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  66. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  67. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  68. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  69. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  70. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  71. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  72. needleLen, needleLen, needleLen, needleLen, needleLen, needleLen,
  73. needleLen, needleLen, needleLen, needleLen
  74. ];
  75. // Populate occurrence table with analysis of the needle, ignoring the last
  76. // letter.
  77. if (needleLen > 1) {
  78. for (let i = 0; i < needleLen - 1; ++i)
  79. this._occ[needle[i]] = needleLen - 1 - i;
  80. }
  81. }
  82. reset() {
  83. this.matches = 0;
  84. this._lookbehindSize = 0;
  85. this._bufPos = 0;
  86. }
  87. push(chunk, pos) {
  88. let result;
  89. if (!Buffer.isBuffer(chunk))
  90. chunk = Buffer.from(chunk, 'latin1');
  91. const chunkLen = chunk.length;
  92. this._bufPos = pos || 0;
  93. while (result !== chunkLen && this.matches < this.maxMatches)
  94. result = feed(this, chunk);
  95. return result;
  96. }
  97. destroy() {
  98. const lbSize = this._lookbehindSize;
  99. if (lbSize)
  100. this._cb(false, this._lookbehind, 0, lbSize, false);
  101. this.reset();
  102. }
  103. }
  104. function feed(self, data) {
  105. const len = data.length;
  106. const needle = self._needle;
  107. const needleLen = needle.length;
  108. // Positive: points to a position in `data`
  109. // pos == 3 points to data[3]
  110. // Negative: points to a position in the lookbehind buffer
  111. // pos == -2 points to lookbehind[lookbehindSize - 2]
  112. let pos = -self._lookbehindSize;
  113. const lastNeedleCharPos = needleLen - 1;
  114. const lastNeedleChar = needle[lastNeedleCharPos];
  115. const end = len - needleLen;
  116. const occ = self._occ;
  117. const lookbehind = self._lookbehind;
  118. if (pos < 0) {
  119. // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool
  120. // search with character lookup code that considers both the
  121. // lookbehind buffer and the current round's haystack data.
  122. //
  123. // Loop until
  124. // there is a match.
  125. // or until
  126. // we've moved past the position that requires the
  127. // lookbehind buffer. In this case we switch to the
  128. // optimized loop.
  129. // or until
  130. // the character to look at lies outside the haystack.
  131. while (pos < 0 && pos <= end) {
  132. const nextPos = pos + lastNeedleCharPos;
  133. const ch = (nextPos < 0
  134. ? lookbehind[self._lookbehindSize + nextPos]
  135. : data[nextPos]);
  136. if (ch === lastNeedleChar
  137. && matchNeedle(self, data, pos, lastNeedleCharPos)) {
  138. self._lookbehindSize = 0;
  139. ++self.matches;
  140. if (pos > -self._lookbehindSize)
  141. self._cb(true, lookbehind, 0, self._lookbehindSize + pos, false);
  142. else
  143. self._cb(true, undefined, 0, 0, true);
  144. return (self._bufPos = pos + needleLen);
  145. }
  146. pos += occ[ch];
  147. }
  148. // No match.
  149. // There's too few data for Boyer-Moore-Horspool to run,
  150. // so let's use a different algorithm to skip as much as
  151. // we can.
  152. // Forward pos until
  153. // the trailing part of lookbehind + data
  154. // looks like the beginning of the needle
  155. // or until
  156. // pos == 0
  157. while (pos < 0 && !matchNeedle(self, data, pos, len - pos))
  158. ++pos;
  159. if (pos < 0) {
  160. // Cut off part of the lookbehind buffer that has
  161. // been processed and append the entire haystack
  162. // into it.
  163. const bytesToCutOff = self._lookbehindSize + pos;
  164. if (bytesToCutOff > 0) {
  165. // The cut off data is guaranteed not to contain the needle.
  166. self._cb(false, lookbehind, 0, bytesToCutOff, false);
  167. }
  168. self._lookbehindSize -= bytesToCutOff;
  169. lookbehind.copy(lookbehind, 0, bytesToCutOff, self._lookbehindSize);
  170. lookbehind.set(data, self._lookbehindSize);
  171. self._lookbehindSize += len;
  172. self._bufPos = len;
  173. return len;
  174. }
  175. // Discard lookbehind buffer.
  176. self._cb(false, lookbehind, 0, self._lookbehindSize, false);
  177. self._lookbehindSize = 0;
  178. }
  179. pos += self._bufPos;
  180. const firstNeedleChar = needle[0];
  181. // Lookbehind buffer is now empty. Perform Boyer-Moore-Horspool
  182. // search with optimized character lookup code that only considers
  183. // the current round's haystack data.
  184. while (pos <= end) {
  185. const ch = data[pos + lastNeedleCharPos];
  186. if (ch === lastNeedleChar
  187. && data[pos] === firstNeedleChar
  188. && memcmp(needle, 0, data, pos, lastNeedleCharPos)) {
  189. ++self.matches;
  190. if (pos > 0)
  191. self._cb(true, data, self._bufPos, pos, true);
  192. else
  193. self._cb(true, undefined, 0, 0, true);
  194. return (self._bufPos = pos + needleLen);
  195. }
  196. pos += occ[ch];
  197. }
  198. // There was no match. If there's trailing haystack data that we cannot
  199. // match yet using the Boyer-Moore-Horspool algorithm (because the trailing
  200. // data is less than the needle size) then match using a modified
  201. // algorithm that starts matching from the beginning instead of the end.
  202. // Whatever trailing data is left after running this algorithm is added to
  203. // the lookbehind buffer.
  204. while (pos < len) {
  205. if (data[pos] !== firstNeedleChar
  206. || !memcmp(data, pos, needle, 0, len - pos)) {
  207. ++pos;
  208. continue;
  209. }
  210. data.copy(lookbehind, 0, pos, len);
  211. self._lookbehindSize = len - pos;
  212. break;
  213. }
  214. // Everything until `pos` is guaranteed not to contain needle data.
  215. if (pos > 0)
  216. self._cb(false, data, self._bufPos, pos < len ? pos : len, true);
  217. self._bufPos = len;
  218. return len;
  219. }
  220. function matchNeedle(self, data, pos, len) {
  221. const lb = self._lookbehind;
  222. const lbSize = self._lookbehindSize;
  223. const needle = self._needle;
  224. for (let i = 0; i < len; ++i, ++pos) {
  225. const ch = (pos < 0 ? lb[lbSize + pos] : data[pos]);
  226. if (ch !== needle[i])
  227. return false;
  228. }
  229. return true;
  230. }
  231. module.exports = SBMH;