summaryrefslogtreecommitdiff
path: root/deps/v8/src/strings/string-search.h
diff options
context:
space:
mode:
Diffstat (limited to 'deps/v8/src/strings/string-search.h')
-rw-r--r--deps/v8/src/strings/string-search.h548
1 files changed, 548 insertions, 0 deletions
diff --git a/deps/v8/src/strings/string-search.h b/deps/v8/src/strings/string-search.h
new file mode 100644
index 0000000000..1d5800ebcf
--- /dev/null
+++ b/deps/v8/src/strings/string-search.h
@@ -0,0 +1,548 @@
+// Copyright 2011 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_STRINGS_STRING_SEARCH_H_
+#define V8_STRINGS_STRING_SEARCH_H_
+
+#include "src/execution/isolate.h"
+#include "src/utils/vector.h"
+
+namespace v8 {
+namespace internal {
+
+//---------------------------------------------------------------------
+// String Search object.
+//---------------------------------------------------------------------
+
+// Class holding constants and methods that apply to all string search variants,
+// independently of subject and pattern char size.
+class StringSearchBase {
+ protected:
+ // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
+ // limit, we can fix the size of tables. For a needle longer than this limit,
+ // search will not be optimal, since we only build tables for a suffix
+ // of the string, but it is a safe approximation.
+ static const int kBMMaxShift = Isolate::kBMMaxShift;
+
+ // Reduce alphabet to this size.
+ // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
+ // proportional to the input alphabet. We reduce the alphabet size by
+ // equating input characters modulo a smaller alphabet size. This gives
+ // a potentially less efficient searching, but is a safe approximation.
+ // For needles using only characters in the same Unicode 256-code point page,
+ // there is no search speed degradation.
+ static const int kLatin1AlphabetSize = 256;
+ static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
+
+ // Bad-char shift table stored in the state. It's length is the alphabet size.
+ // For patterns below this length, the skip length of Boyer-Moore is too short
+ // to compensate for the algorithmic overhead compared to simple brute force.
+ static const int kBMMinPatternLength = 7;
+
+ static inline bool IsOneByteString(Vector<const uint8_t> string) {
+ return true;
+ }
+
+ static inline bool IsOneByteString(Vector<const uc16> string) {
+ return String::IsOneByte(string.begin(), string.length());
+ }
+
+ friend class Isolate;
+};
+
+template <typename PatternChar, typename SubjectChar>
+class StringSearch : private StringSearchBase {
+ public:
+ StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
+ : isolate_(isolate),
+ pattern_(pattern),
+ start_(Max(0, pattern.length() - kBMMaxShift)) {
+ if (sizeof(PatternChar) > sizeof(SubjectChar)) {
+ if (!IsOneByteString(pattern_)) {
+ strategy_ = &FailSearch;
+ return;
+ }
+ }
+ int pattern_length = pattern_.length();
+ if (pattern_length < kBMMinPatternLength) {
+ if (pattern_length == 1) {
+ strategy_ = &SingleCharSearch;
+ return;
+ }
+ strategy_ = &LinearSearch;
+ return;
+ }
+ strategy_ = &InitialSearch;
+ }
+
+ int Search(Vector<const SubjectChar> subject, int index) {
+ return strategy_(this, subject, index);
+ }
+
+ static inline int AlphabetSize() {
+ if (sizeof(PatternChar) == 1) {
+ // Latin1 needle.
+ return kLatin1AlphabetSize;
+ } else {
+ DCHECK_EQ(sizeof(PatternChar), 2);
+ // UC16 needle.
+ return kUC16AlphabetSize;
+ }
+ }
+
+ private:
+ using SearchFunction = int (*)(StringSearch<PatternChar, SubjectChar>*,
+ Vector<const SubjectChar>, int);
+
+ static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
+ Vector<const SubjectChar>, int) {
+ return -1;
+ }
+
+ static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject,
+ int start_index);
+
+ static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject, int start_index);
+
+ static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject, int start_index);
+
+ static int BoyerMooreHorspoolSearch(
+ StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject, int start_index);
+
+ static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject,
+ int start_index);
+
+ void PopulateBoyerMooreHorspoolTable();
+
+ void PopulateBoyerMooreTable();
+
+ static inline bool exceedsOneByte(uint8_t c) { return false; }
+
+ static inline bool exceedsOneByte(uint16_t c) {
+ return c > String::kMaxOneByteCharCodeU;
+ }
+
+ static inline int CharOccurrence(int* bad_char_occurrence,
+ SubjectChar char_code) {
+ if (sizeof(SubjectChar) == 1) {
+ return bad_char_occurrence[static_cast<int>(char_code)];
+ }
+ if (sizeof(PatternChar) == 1) {
+ if (exceedsOneByte(char_code)) {
+ return -1;
+ }
+ return bad_char_occurrence[static_cast<unsigned int>(char_code)];
+ }
+ // Both pattern and subject are UC16. Reduce character to equivalence class.
+ int equiv_class = char_code % kUC16AlphabetSize;
+ return bad_char_occurrence[equiv_class];
+ }
+
+ // The following tables are shared by all searches.
+ // TODO(lrn): Introduce a way for a pattern to keep its tables
+ // between searches (e.g., for an Atom RegExp).
+
+ // Store for the BoyerMoore(Horspool) bad char shift table.
+ // Return a table covering the last kBMMaxShift+1 positions of
+ // pattern.
+ int* bad_char_table() { return isolate_->bad_char_shift_table(); }
+
+ // Store for the BoyerMoore good suffix shift table.
+ int* good_suffix_shift_table() {
+ // Return biased pointer that maps the range [start_..pattern_.length()
+ // to the kGoodSuffixShiftTable array.
+ return isolate_->good_suffix_shift_table() - start_;
+ }
+
+ // Table used temporarily while building the BoyerMoore good suffix
+ // shift table.
+ int* suffix_table() {
+ // Return biased pointer that maps the range [start_..pattern_.length()
+ // to the kSuffixTable array.
+ return isolate_->suffix_table() - start_;
+ }
+
+ Isolate* isolate_;
+ // The pattern to search for.
+ Vector<const PatternChar> pattern_;
+ // Pointer to implementation of the search.
+ SearchFunction strategy_;
+ // Cache value of Max(0, pattern_length() - kBMMaxShift)
+ int start_;
+};
+
+template <typename T, typename U>
+inline T AlignDown(T value, U alignment) {
+ return reinterpret_cast<T>(
+ (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
+}
+
+inline uint8_t GetHighestValueByte(uc16 character) {
+ return Max(static_cast<uint8_t>(character & 0xFF),
+ static_cast<uint8_t>(character >> 8));
+}
+
+inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
+
+template <typename PatternChar, typename SubjectChar>
+inline int FindFirstCharacter(Vector<const PatternChar> pattern,
+ Vector<const SubjectChar> subject, int index) {
+ const PatternChar pattern_first_char = pattern[0];
+ const int max_n = (subject.length() - pattern.length() + 1);
+
+ const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
+ const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
+ int pos = index;
+ do {
+ DCHECK_GE(max_n - pos, 0);
+ const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
+ memchr(subject.begin() + pos, search_byte,
+ (max_n - pos) * sizeof(SubjectChar)));
+ if (char_pos == nullptr) return -1;
+ char_pos = AlignDown(char_pos, sizeof(SubjectChar));
+ pos = static_cast<int>(char_pos - subject.begin());
+ if (subject[pos] == search_char) return pos;
+ } while (++pos < max_n);
+
+ return -1;
+}
+
+//---------------------------------------------------------------------
+// Single Character Pattern Search Strategy
+//---------------------------------------------------------------------
+
+template <typename PatternChar, typename SubjectChar>
+int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
+ StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject, int index) {
+ DCHECK_EQ(1, search->pattern_.length());
+ PatternChar pattern_first_char = search->pattern_[0];
+ if (sizeof(PatternChar) > sizeof(SubjectChar)) {
+ if (exceedsOneByte(pattern_first_char)) {
+ return -1;
+ }
+ }
+ return FindFirstCharacter(search->pattern_, subject, index);
+}
+
+//---------------------------------------------------------------------
+// Linear Search Strategy
+//---------------------------------------------------------------------
+
+template <typename PatternChar, typename SubjectChar>
+inline bool CharCompare(const PatternChar* pattern, const SubjectChar* subject,
+ int length) {
+ DCHECK_GT(length, 0);
+ int pos = 0;
+ do {
+ if (pattern[pos] != subject[pos]) {
+ return false;
+ }
+ pos++;
+ } while (pos < length);
+ return true;
+}
+
+// Simple linear search for short patterns. Never bails out.
+template <typename PatternChar, typename SubjectChar>
+int StringSearch<PatternChar, SubjectChar>::LinearSearch(
+ StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject, int index) {
+ Vector<const PatternChar> pattern = search->pattern_;
+ DCHECK_GT(pattern.length(), 1);
+ int pattern_length = pattern.length();
+ int i = index;
+ int n = subject.length() - pattern_length;
+ while (i <= n) {
+ i = FindFirstCharacter(pattern, subject, i);
+ if (i == -1) return -1;
+ DCHECK_LE(i, n);
+ i++;
+ // Loop extracted to separate function to allow using return to do
+ // a deeper break.
+ if (CharCompare(pattern.begin() + 1, subject.begin() + i,
+ pattern_length - 1)) {
+ return i - 1;
+ }
+ }
+ return -1;
+}
+
+//---------------------------------------------------------------------
+// Boyer-Moore string search
+//---------------------------------------------------------------------
+
+template <typename PatternChar, typename SubjectChar>
+int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
+ StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject, int start_index) {
+ Vector<const PatternChar> pattern = search->pattern_;
+ int subject_length = subject.length();
+ int pattern_length = pattern.length();
+ // Only preprocess at most kBMMaxShift last characters of pattern.
+ int start = search->start_;
+
+ int* bad_char_occurence = search->bad_char_table();
+ int* good_suffix_shift = search->good_suffix_shift_table();
+
+ PatternChar last_char = pattern[pattern_length - 1];
+ int index = start_index;
+ // Continue search from i.
+ while (index <= subject_length - pattern_length) {
+ int j = pattern_length - 1;
+ int c;
+ while (last_char != (c = subject[index + j])) {
+ int shift = j - CharOccurrence(bad_char_occurence, c);
+ index += shift;
+ if (index > subject_length - pattern_length) {
+ return -1;
+ }
+ }
+ while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
+ if (j < 0) {
+ return index;
+ } else if (j < start) {
+ // we have matched more than our tables allow us to be smart about.
+ // Fall back on BMH shift.
+ index += pattern_length - 1 -
+ CharOccurrence(bad_char_occurence,
+ static_cast<SubjectChar>(last_char));
+ } else {
+ int gs_shift = good_suffix_shift[j + 1];
+ int bc_occ = CharOccurrence(bad_char_occurence, c);
+ int shift = j - bc_occ;
+ if (gs_shift > shift) {
+ shift = gs_shift;
+ }
+ index += shift;
+ }
+ }
+
+ return -1;
+}
+
+template <typename PatternChar, typename SubjectChar>
+void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
+ int pattern_length = pattern_.length();
+ const PatternChar* pattern = pattern_.begin();
+ // Only look at the last kBMMaxShift characters of pattern (from start_
+ // to pattern_length).
+ int start = start_;
+ int length = pattern_length - start;
+
+ // Biased tables so that we can use pattern indices as table indices,
+ // even if we only cover the part of the pattern from offset start.
+ int* shift_table = good_suffix_shift_table();
+ int* suffix_table = this->suffix_table();
+
+ // Initialize table.
+ for (int i = start; i < pattern_length; i++) {
+ shift_table[i] = length;
+ }
+ shift_table[pattern_length] = 1;
+ suffix_table[pattern_length] = pattern_length + 1;
+
+ if (pattern_length <= start) {
+ return;
+ }
+
+ // Find suffixes.
+ PatternChar last_char = pattern[pattern_length - 1];
+ int suffix = pattern_length + 1;
+ {
+ int i = pattern_length;
+ while (i > start) {
+ PatternChar c = pattern[i - 1];
+ while (suffix <= pattern_length && c != pattern[suffix - 1]) {
+ if (shift_table[suffix] == length) {
+ shift_table[suffix] = suffix - i;
+ }
+ suffix = suffix_table[suffix];
+ }
+ suffix_table[--i] = --suffix;
+ if (suffix == pattern_length) {
+ // No suffix to extend, so we check against last_char only.
+ while ((i > start) && (pattern[i - 1] != last_char)) {
+ if (shift_table[pattern_length] == length) {
+ shift_table[pattern_length] = pattern_length - i;
+ }
+ suffix_table[--i] = pattern_length;
+ }
+ if (i > start) {
+ suffix_table[--i] = --suffix;
+ }
+ }
+ }
+ }
+ // Build shift table using suffixes.
+ if (suffix < pattern_length) {
+ for (int i = start; i <= pattern_length; i++) {
+ if (shift_table[i] == length) {
+ shift_table[i] = suffix - start;
+ }
+ if (i == suffix) {
+ suffix = suffix_table[suffix];
+ }
+ }
+ }
+}
+
+//---------------------------------------------------------------------
+// Boyer-Moore-Horspool string search.
+//---------------------------------------------------------------------
+
+template <typename PatternChar, typename SubjectChar>
+int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
+ StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject, int start_index) {
+ Vector<const PatternChar> pattern = search->pattern_;
+ int subject_length = subject.length();
+ int pattern_length = pattern.length();
+ int* char_occurrences = search->bad_char_table();
+ int badness = -pattern_length;
+
+ // How bad we are doing without a good-suffix table.
+ PatternChar last_char = pattern[pattern_length - 1];
+ int last_char_shift =
+ pattern_length - 1 -
+ CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
+ // Perform search
+ int index = start_index; // No matches found prior to this index.
+ while (index <= subject_length - pattern_length) {
+ int j = pattern_length - 1;
+ int subject_char;
+ while (last_char != (subject_char = subject[index + j])) {
+ int bc_occ = CharOccurrence(char_occurrences, subject_char);
+ int shift = j - bc_occ;
+ index += shift;
+ badness += 1 - shift; // at most zero, so badness cannot increase.
+ if (index > subject_length - pattern_length) {
+ return -1;
+ }
+ }
+ j--;
+ while (j >= 0 && pattern[j] == (subject[index + j])) j--;
+ if (j < 0) {
+ return index;
+ } else {
+ index += last_char_shift;
+ // Badness increases by the number of characters we have
+ // checked, and decreases by the number of characters we
+ // can skip by shifting. It's a measure of how we are doing
+ // compared to reading each character exactly once.
+ badness += (pattern_length - j) - last_char_shift;
+ if (badness > 0) {
+ search->PopulateBoyerMooreTable();
+ search->strategy_ = &BoyerMooreSearch;
+ return BoyerMooreSearch(search, subject, index);
+ }
+ }
+ }
+ return -1;
+}
+
+template <typename PatternChar, typename SubjectChar>
+void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
+ int pattern_length = pattern_.length();
+
+ int* bad_char_occurrence = bad_char_table();
+
+ // Only preprocess at most kBMMaxShift last characters of pattern.
+ int start = start_;
+ // Run forwards to populate bad_char_table, so that *last* instance
+ // of character equivalence class is the one registered.
+ // Notice: Doesn't include the last character.
+ int table_size = AlphabetSize();
+ if (start == 0) { // All patterns less than kBMMaxShift in length.
+ memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
+ } else {
+ for (int i = 0; i < table_size; i++) {
+ bad_char_occurrence[i] = start - 1;
+ }
+ }
+ for (int i = start; i < pattern_length - 1; i++) {
+ PatternChar c = pattern_[i];
+ int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
+ bad_char_occurrence[bucket] = i;
+ }
+}
+
+//---------------------------------------------------------------------
+// Linear string search with bailout to BMH.
+//---------------------------------------------------------------------
+
+// Simple linear search for short patterns, which bails out if the string
+// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
+template <typename PatternChar, typename SubjectChar>
+int StringSearch<PatternChar, SubjectChar>::InitialSearch(
+ StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject, int index) {
+ Vector<const PatternChar> pattern = search->pattern_;
+ int pattern_length = pattern.length();
+ // Badness is a count of how much work we have done. When we have
+ // done enough work we decide it's probably worth switching to a better
+ // algorithm.
+ int badness = -10 - (pattern_length << 2);
+
+ // We know our pattern is at least 2 characters, we cache the first so
+ // the common case of the first character not matching is faster.
+ for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
+ badness++;
+ if (badness <= 0) {
+ i = FindFirstCharacter(pattern, subject, i);
+ if (i == -1) return -1;
+ DCHECK_LE(i, n);
+ int j = 1;
+ do {
+ if (pattern[j] != subject[i + j]) {
+ break;
+ }
+ j++;
+ } while (j < pattern_length);
+ if (j == pattern_length) {
+ return i;
+ }
+ badness += j;
+ } else {
+ search->PopulateBoyerMooreHorspoolTable();
+ search->strategy_ = &BoyerMooreHorspoolSearch;
+ return BoyerMooreHorspoolSearch(search, subject, i);
+ }
+ }
+ return -1;
+}
+
+// Perform a a single stand-alone search.
+// If searching multiple times for the same pattern, a search
+// object should be constructed once and the Search function then called
+// for each search.
+template <typename SubjectChar, typename PatternChar>
+int SearchString(Isolate* isolate, Vector<const SubjectChar> subject,
+ Vector<const PatternChar> pattern, int start_index) {
+ StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
+ return search.Search(subject, start_index);
+}
+
+// A wrapper function around SearchString that wraps raw pointers to the subject
+// and pattern as vectors before calling SearchString. Used from the
+// StringIndexOf builtin.
+template <typename SubjectChar, typename PatternChar>
+intptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr,
+ int subject_length, const PatternChar* pattern_ptr,
+ int pattern_length, int start_index) {
+ DisallowHeapAllocation no_gc;
+ Vector<const SubjectChar> subject(subject_ptr, subject_length);
+ Vector<const PatternChar> pattern(pattern_ptr, pattern_length);
+ return SearchString(isolate, subject, pattern, start_index);
+}
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_STRINGS_STRING_SEARCH_H_