diff options
Diffstat (limited to 'deps/v8/src/strings/string-search.h')
-rw-r--r-- | deps/v8/src/strings/string-search.h | 548 |
1 files changed, 548 insertions, 0 deletions
diff --git a/deps/v8/src/strings/string-search.h b/deps/v8/src/strings/string-search.h new file mode 100644 index 0000000000..1d5800ebcf --- /dev/null +++ b/deps/v8/src/strings/string-search.h @@ -0,0 +1,548 @@ +// Copyright 2011 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_STRINGS_STRING_SEARCH_H_ +#define V8_STRINGS_STRING_SEARCH_H_ + +#include "src/execution/isolate.h" +#include "src/utils/vector.h" + +namespace v8 { +namespace internal { + +//--------------------------------------------------------------------- +// String Search object. +//--------------------------------------------------------------------- + +// Class holding constants and methods that apply to all string search variants, +// independently of subject and pattern char size. +class StringSearchBase { + protected: + // Cap on the maximal shift in the Boyer-Moore implementation. By setting a + // limit, we can fix the size of tables. For a needle longer than this limit, + // search will not be optimal, since we only build tables for a suffix + // of the string, but it is a safe approximation. + static const int kBMMaxShift = Isolate::kBMMaxShift; + + // Reduce alphabet to this size. + // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size + // proportional to the input alphabet. We reduce the alphabet size by + // equating input characters modulo a smaller alphabet size. This gives + // a potentially less efficient searching, but is a safe approximation. + // For needles using only characters in the same Unicode 256-code point page, + // there is no search speed degradation. + static const int kLatin1AlphabetSize = 256; + static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize; + + // Bad-char shift table stored in the state. It's length is the alphabet size. + // For patterns below this length, the skip length of Boyer-Moore is too short + // to compensate for the algorithmic overhead compared to simple brute force. + static const int kBMMinPatternLength = 7; + + static inline bool IsOneByteString(Vector<const uint8_t> string) { + return true; + } + + static inline bool IsOneByteString(Vector<const uc16> string) { + return String::IsOneByte(string.begin(), string.length()); + } + + friend class Isolate; +}; + +template <typename PatternChar, typename SubjectChar> +class StringSearch : private StringSearchBase { + public: + StringSearch(Isolate* isolate, Vector<const PatternChar> pattern) + : isolate_(isolate), + pattern_(pattern), + start_(Max(0, pattern.length() - kBMMaxShift)) { + if (sizeof(PatternChar) > sizeof(SubjectChar)) { + if (!IsOneByteString(pattern_)) { + strategy_ = &FailSearch; + return; + } + } + int pattern_length = pattern_.length(); + if (pattern_length < kBMMinPatternLength) { + if (pattern_length == 1) { + strategy_ = &SingleCharSearch; + return; + } + strategy_ = &LinearSearch; + return; + } + strategy_ = &InitialSearch; + } + + int Search(Vector<const SubjectChar> subject, int index) { + return strategy_(this, subject, index); + } + + static inline int AlphabetSize() { + if (sizeof(PatternChar) == 1) { + // Latin1 needle. + return kLatin1AlphabetSize; + } else { + DCHECK_EQ(sizeof(PatternChar), 2); + // UC16 needle. + return kUC16AlphabetSize; + } + } + + private: + using SearchFunction = int (*)(StringSearch<PatternChar, SubjectChar>*, + Vector<const SubjectChar>, int); + + static int FailSearch(StringSearch<PatternChar, SubjectChar>*, + Vector<const SubjectChar>, int) { + return -1; + } + + static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, + Vector<const SubjectChar> subject, + int start_index); + + static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, + Vector<const SubjectChar> subject, int start_index); + + static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, + Vector<const SubjectChar> subject, int start_index); + + static int BoyerMooreHorspoolSearch( + StringSearch<PatternChar, SubjectChar>* search, + Vector<const SubjectChar> subject, int start_index); + + static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, + Vector<const SubjectChar> subject, + int start_index); + + void PopulateBoyerMooreHorspoolTable(); + + void PopulateBoyerMooreTable(); + + static inline bool exceedsOneByte(uint8_t c) { return false; } + + static inline bool exceedsOneByte(uint16_t c) { + return c > String::kMaxOneByteCharCodeU; + } + + static inline int CharOccurrence(int* bad_char_occurrence, + SubjectChar char_code) { + if (sizeof(SubjectChar) == 1) { + return bad_char_occurrence[static_cast<int>(char_code)]; + } + if (sizeof(PatternChar) == 1) { + if (exceedsOneByte(char_code)) { + return -1; + } + return bad_char_occurrence[static_cast<unsigned int>(char_code)]; + } + // Both pattern and subject are UC16. Reduce character to equivalence class. + int equiv_class = char_code % kUC16AlphabetSize; + return bad_char_occurrence[equiv_class]; + } + + // The following tables are shared by all searches. + // TODO(lrn): Introduce a way for a pattern to keep its tables + // between searches (e.g., for an Atom RegExp). + + // Store for the BoyerMoore(Horspool) bad char shift table. + // Return a table covering the last kBMMaxShift+1 positions of + // pattern. + int* bad_char_table() { return isolate_->bad_char_shift_table(); } + + // Store for the BoyerMoore good suffix shift table. + int* good_suffix_shift_table() { + // Return biased pointer that maps the range [start_..pattern_.length() + // to the kGoodSuffixShiftTable array. + return isolate_->good_suffix_shift_table() - start_; + } + + // Table used temporarily while building the BoyerMoore good suffix + // shift table. + int* suffix_table() { + // Return biased pointer that maps the range [start_..pattern_.length() + // to the kSuffixTable array. + return isolate_->suffix_table() - start_; + } + + Isolate* isolate_; + // The pattern to search for. + Vector<const PatternChar> pattern_; + // Pointer to implementation of the search. + SearchFunction strategy_; + // Cache value of Max(0, pattern_length() - kBMMaxShift) + int start_; +}; + +template <typename T, typename U> +inline T AlignDown(T value, U alignment) { + return reinterpret_cast<T>( + (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); +} + +inline uint8_t GetHighestValueByte(uc16 character) { + return Max(static_cast<uint8_t>(character & 0xFF), + static_cast<uint8_t>(character >> 8)); +} + +inline uint8_t GetHighestValueByte(uint8_t character) { return character; } + +template <typename PatternChar, typename SubjectChar> +inline int FindFirstCharacter(Vector<const PatternChar> pattern, + Vector<const SubjectChar> subject, int index) { + const PatternChar pattern_first_char = pattern[0]; + const int max_n = (subject.length() - pattern.length() + 1); + + const uint8_t search_byte = GetHighestValueByte(pattern_first_char); + const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); + int pos = index; + do { + DCHECK_GE(max_n - pos, 0); + const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>( + memchr(subject.begin() + pos, search_byte, + (max_n - pos) * sizeof(SubjectChar))); + if (char_pos == nullptr) return -1; + char_pos = AlignDown(char_pos, sizeof(SubjectChar)); + pos = static_cast<int>(char_pos - subject.begin()); + if (subject[pos] == search_char) return pos; + } while (++pos < max_n); + + return -1; +} + +//--------------------------------------------------------------------- +// Single Character Pattern Search Strategy +//--------------------------------------------------------------------- + +template <typename PatternChar, typename SubjectChar> +int StringSearch<PatternChar, SubjectChar>::SingleCharSearch( + StringSearch<PatternChar, SubjectChar>* search, + Vector<const SubjectChar> subject, int index) { + DCHECK_EQ(1, search->pattern_.length()); + PatternChar pattern_first_char = search->pattern_[0]; + if (sizeof(PatternChar) > sizeof(SubjectChar)) { + if (exceedsOneByte(pattern_first_char)) { + return -1; + } + } + return FindFirstCharacter(search->pattern_, subject, index); +} + +//--------------------------------------------------------------------- +// Linear Search Strategy +//--------------------------------------------------------------------- + +template <typename PatternChar, typename SubjectChar> +inline bool CharCompare(const PatternChar* pattern, const SubjectChar* subject, + int length) { + DCHECK_GT(length, 0); + int pos = 0; + do { + if (pattern[pos] != subject[pos]) { + return false; + } + pos++; + } while (pos < length); + return true; +} + +// Simple linear search for short patterns. Never bails out. +template <typename PatternChar, typename SubjectChar> +int StringSearch<PatternChar, SubjectChar>::LinearSearch( + StringSearch<PatternChar, SubjectChar>* search, + Vector<const SubjectChar> subject, int index) { + Vector<const PatternChar> pattern = search->pattern_; + DCHECK_GT(pattern.length(), 1); + int pattern_length = pattern.length(); + int i = index; + int n = subject.length() - pattern_length; + while (i <= n) { + i = FindFirstCharacter(pattern, subject, i); + if (i == -1) return -1; + DCHECK_LE(i, n); + i++; + // Loop extracted to separate function to allow using return to do + // a deeper break. + if (CharCompare(pattern.begin() + 1, subject.begin() + i, + pattern_length - 1)) { + return i - 1; + } + } + return -1; +} + +//--------------------------------------------------------------------- +// Boyer-Moore string search +//--------------------------------------------------------------------- + +template <typename PatternChar, typename SubjectChar> +int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( + StringSearch<PatternChar, SubjectChar>* search, + Vector<const SubjectChar> subject, int start_index) { + Vector<const PatternChar> pattern = search->pattern_; + int subject_length = subject.length(); + int pattern_length = pattern.length(); + // Only preprocess at most kBMMaxShift last characters of pattern. + int start = search->start_; + + int* bad_char_occurence = search->bad_char_table(); + int* good_suffix_shift = search->good_suffix_shift_table(); + + PatternChar last_char = pattern[pattern_length - 1]; + int index = start_index; + // Continue search from i. + while (index <= subject_length - pattern_length) { + int j = pattern_length - 1; + int c; + while (last_char != (c = subject[index + j])) { + int shift = j - CharOccurrence(bad_char_occurence, c); + index += shift; + if (index > subject_length - pattern_length) { + return -1; + } + } + while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; + if (j < 0) { + return index; + } else if (j < start) { + // we have matched more than our tables allow us to be smart about. + // Fall back on BMH shift. + index += pattern_length - 1 - + CharOccurrence(bad_char_occurence, + static_cast<SubjectChar>(last_char)); + } else { + int gs_shift = good_suffix_shift[j + 1]; + int bc_occ = CharOccurrence(bad_char_occurence, c); + int shift = j - bc_occ; + if (gs_shift > shift) { + shift = gs_shift; + } + index += shift; + } + } + + return -1; +} + +template <typename PatternChar, typename SubjectChar> +void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { + int pattern_length = pattern_.length(); + const PatternChar* pattern = pattern_.begin(); + // Only look at the last kBMMaxShift characters of pattern (from start_ + // to pattern_length). + int start = start_; + int length = pattern_length - start; + + // Biased tables so that we can use pattern indices as table indices, + // even if we only cover the part of the pattern from offset start. + int* shift_table = good_suffix_shift_table(); + int* suffix_table = this->suffix_table(); + + // Initialize table. + for (int i = start; i < pattern_length; i++) { + shift_table[i] = length; + } + shift_table[pattern_length] = 1; + suffix_table[pattern_length] = pattern_length + 1; + + if (pattern_length <= start) { + return; + } + + // Find suffixes. + PatternChar last_char = pattern[pattern_length - 1]; + int suffix = pattern_length + 1; + { + int i = pattern_length; + while (i > start) { + PatternChar c = pattern[i - 1]; + while (suffix <= pattern_length && c != pattern[suffix - 1]) { + if (shift_table[suffix] == length) { + shift_table[suffix] = suffix - i; + } + suffix = suffix_table[suffix]; + } + suffix_table[--i] = --suffix; + if (suffix == pattern_length) { + // No suffix to extend, so we check against last_char only. + while ((i > start) && (pattern[i - 1] != last_char)) { + if (shift_table[pattern_length] == length) { + shift_table[pattern_length] = pattern_length - i; + } + suffix_table[--i] = pattern_length; + } + if (i > start) { + suffix_table[--i] = --suffix; + } + } + } + } + // Build shift table using suffixes. + if (suffix < pattern_length) { + for (int i = start; i <= pattern_length; i++) { + if (shift_table[i] == length) { + shift_table[i] = suffix - start; + } + if (i == suffix) { + suffix = suffix_table[suffix]; + } + } + } +} + +//--------------------------------------------------------------------- +// Boyer-Moore-Horspool string search. +//--------------------------------------------------------------------- + +template <typename PatternChar, typename SubjectChar> +int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( + StringSearch<PatternChar, SubjectChar>* search, + Vector<const SubjectChar> subject, int start_index) { + Vector<const PatternChar> pattern = search->pattern_; + int subject_length = subject.length(); + int pattern_length = pattern.length(); + int* char_occurrences = search->bad_char_table(); + int badness = -pattern_length; + + // How bad we are doing without a good-suffix table. + PatternChar last_char = pattern[pattern_length - 1]; + int last_char_shift = + pattern_length - 1 - + CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char)); + // Perform search + int index = start_index; // No matches found prior to this index. + while (index <= subject_length - pattern_length) { + int j = pattern_length - 1; + int subject_char; + while (last_char != (subject_char = subject[index + j])) { + int bc_occ = CharOccurrence(char_occurrences, subject_char); + int shift = j - bc_occ; + index += shift; + badness += 1 - shift; // at most zero, so badness cannot increase. + if (index > subject_length - pattern_length) { + return -1; + } + } + j--; + while (j >= 0 && pattern[j] == (subject[index + j])) j--; + if (j < 0) { + return index; + } else { + index += last_char_shift; + // Badness increases by the number of characters we have + // checked, and decreases by the number of characters we + // can skip by shifting. It's a measure of how we are doing + // compared to reading each character exactly once. + badness += (pattern_length - j) - last_char_shift; + if (badness > 0) { + search->PopulateBoyerMooreTable(); + search->strategy_ = &BoyerMooreSearch; + return BoyerMooreSearch(search, subject, index); + } + } + } + return -1; +} + +template <typename PatternChar, typename SubjectChar> +void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { + int pattern_length = pattern_.length(); + + int* bad_char_occurrence = bad_char_table(); + + // Only preprocess at most kBMMaxShift last characters of pattern. + int start = start_; + // Run forwards to populate bad_char_table, so that *last* instance + // of character equivalence class is the one registered. + // Notice: Doesn't include the last character. + int table_size = AlphabetSize(); + if (start == 0) { // All patterns less than kBMMaxShift in length. + memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); + } else { + for (int i = 0; i < table_size; i++) { + bad_char_occurrence[i] = start - 1; + } + } + for (int i = start; i < pattern_length - 1; i++) { + PatternChar c = pattern_[i]; + int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); + bad_char_occurrence[bucket] = i; + } +} + +//--------------------------------------------------------------------- +// Linear string search with bailout to BMH. +//--------------------------------------------------------------------- + +// Simple linear search for short patterns, which bails out if the string +// isn't found very early in the subject. Upgrades to BoyerMooreHorspool. +template <typename PatternChar, typename SubjectChar> +int StringSearch<PatternChar, SubjectChar>::InitialSearch( + StringSearch<PatternChar, SubjectChar>* search, + Vector<const SubjectChar> subject, int index) { + Vector<const PatternChar> pattern = search->pattern_; + int pattern_length = pattern.length(); + // Badness is a count of how much work we have done. When we have + // done enough work we decide it's probably worth switching to a better + // algorithm. + int badness = -10 - (pattern_length << 2); + + // We know our pattern is at least 2 characters, we cache the first so + // the common case of the first character not matching is faster. + for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { + badness++; + if (badness <= 0) { + i = FindFirstCharacter(pattern, subject, i); + if (i == -1) return -1; + DCHECK_LE(i, n); + int j = 1; + do { + if (pattern[j] != subject[i + j]) { + break; + } + j++; + } while (j < pattern_length); + if (j == pattern_length) { + return i; + } + badness += j; + } else { + search->PopulateBoyerMooreHorspoolTable(); + search->strategy_ = &BoyerMooreHorspoolSearch; + return BoyerMooreHorspoolSearch(search, subject, i); + } + } + return -1; +} + +// Perform a a single stand-alone search. +// If searching multiple times for the same pattern, a search +// object should be constructed once and the Search function then called +// for each search. +template <typename SubjectChar, typename PatternChar> +int SearchString(Isolate* isolate, Vector<const SubjectChar> subject, + Vector<const PatternChar> pattern, int start_index) { + StringSearch<PatternChar, SubjectChar> search(isolate, pattern); + return search.Search(subject, start_index); +} + +// A wrapper function around SearchString that wraps raw pointers to the subject +// and pattern as vectors before calling SearchString. Used from the +// StringIndexOf builtin. +template <typename SubjectChar, typename PatternChar> +intptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr, + int subject_length, const PatternChar* pattern_ptr, + int pattern_length, int start_index) { + DisallowHeapAllocation no_gc; + Vector<const SubjectChar> subject(subject_ptr, subject_length); + Vector<const PatternChar> pattern(pattern_ptr, pattern_length); + return SearchString(isolate, subject, pattern, start_index); +} + +} // namespace internal +} // namespace v8 + +#endif // V8_STRINGS_STRING_SEARCH_H_ |