diff options
Diffstat (limited to 'deps/icu-small/source/common/unicode/edits.h')
-rw-r--r-- | deps/icu-small/source/common/unicode/edits.h | 150 |
1 files changed, 129 insertions, 21 deletions
diff --git a/deps/icu-small/source/common/unicode/edits.h b/deps/icu-small/source/common/unicode/edits.h index 5a72574c14..f767a8d3b4 100644 --- a/deps/icu-small/source/common/unicode/edits.h +++ b/deps/icu-small/source/common/unicode/edits.h @@ -17,10 +17,57 @@ U_NAMESPACE_BEGIN +class UnicodeString; + /** - * Records lengths of string edits but not replacement text. - * Supports replacements, insertions, deletions in linear progression. - * Does not support moving/reordering of text. + * Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions + * in linear progression. Does not support moving/reordering of text. + * + * There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to + * instances of this class using {@link #addReplace(int, int)} (for change edits) and + * {@link #addUnchanged(int)} (for no-change edits). Change edits are retained with full granularity, + * whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one + * mapping between code points in the source and destination strings. + * + * After all edits have been added, instances of this class should be considered immutable, and an + * {@link Edits::Iterator} can be used for queries. + * + * There are four flavors of Edits::Iterator: + * + * <ul> + * <li>{@link #getFineIterator()} retains full granularity of change edits. + * <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling + * next() on the iterator, skips over no-change edits (unchanged regions). + * <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change + * edits are automatically merged during the construction phase.) + * <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when + * calling next() on the iterator, skips over no-change edits (unchanged regions). 
+ * </ul> + * + * For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the + * following fine edits: + * <ul> + * <li>abc ⇨ abc (no-change) + * <li>ß ⇨ ss (change) + * <li>D ⇨ d (change) + * <li>e ⇨ e (no-change) + * <li>F ⇨ f (change) + * </ul> + * and the following coarse edits (note how adjacent change edits get merged together): + * <ul> + * <li>abc ⇨ abc (no-change) + * <li>ßD ⇨ ssd (change) + * <li>e ⇨ e (no-change) + * <li>F ⇨ f (change) + * </ul> + * + * The "fine changes" and "coarse changes" iterators will step through only the change edits when their + * {@link Edits::Iterator#next()} methods are called. They are identical to the non-change iterators when + * their {@link Edits::Iterator#findSourceIndex(int)} or {@link Edits::Iterator#findDestinationIndex(int)} + * methods are used to walk through the string. + * + * For examples of how to use this class, see the test <code>TestCaseMapEditsIteratorDocs</code> in + * UCharacterCaseTest.java. * * An Edits object tracks a separate UErrorCode, but ICU string transformation functions * (e.g., case mapping functions) merge any such errors into their API's UErrorCode. @@ -91,13 +138,13 @@ public: void reset() U_NOEXCEPT; /** - * Adds a record for an unchanged segment of text. + * Adds a no-change edit: a record for an unchanged segment of text. * Normally called from inside ICU string transformation functions, not user code. * @stable ICU 59 */ void addUnchanged(int32_t unchangedLength); /** - * Adds a record for a text replacement/insertion/deletion. + * Adds a change edit: a record for a text replacement/insertion/deletion. * Normally called from inside ICU string transformation functions, not user code. * @stable ICU 59 */ @@ -136,6 +183,18 @@ public: /** * Access to the list of edits. + * + * At any moment in time, an instance of this class points to a single edit: a "window" into a span + * of the source string and the corresponding span of the destination string. 
The source string span + * starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string + * span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars. + * + * The iterator can be moved between edits using the {@link #next()}, {@link #findSourceIndex(int)}, + * and {@link #findDestinationIndex(int)} methods. Calling any of these methods mutates the iterator + * to make it point to the corresponding edit. + * + * For more information, see the documentation for {@link Edits}. + * * @see getCoarseIterator * @see getFineIterator * @stable ICU 59 @@ -162,7 +221,7 @@ public: Iterator &operator=(const Iterator &other) = default; /** - * Advances to the next edit. + * Advances the iterator to the next edit. * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, * or else the function returns immediately. Check for U_FAILURE() * on output or use with function chaining. (See User Guide for details.) @@ -172,9 +231,9 @@ public: UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); } /** - * Finds the edit that contains the source index. - * The source index may be found in a non-change - * even if normal iteration would skip non-changes. + * Moves the iterator to the edit that contains the source index. + * The source index may be found in a no-change edit + * even if normal iteration would skip no-change edits. * Normal iteration can continue from a found edit. * * The iterator state before this search logically does not matter. @@ -196,9 +255,9 @@ public: #ifndef U_HIDE_DRAFT_API /** - * Finds the edit that contains the destination index. - * The destination index may be found in a non-change - * even if normal iteration would skip non-changes. + * Moves the iterator to the edit that contains the destination index. + * The destination index may be found in a no-change edit + * even if normal iteration would skip no-change edits. 
* Normal iteration can continue from a found edit. * * The iterator state before this search logically does not matter. @@ -219,7 +278,7 @@ public: } /** - * Returns the destination index corresponding to the given source index. + * Computes the destination index corresponding to the given source index. * If the source index is inside a change edit (not at its start), * then the destination index at the end of that edit is returned, * since there is no information about index mapping inside a change edit. @@ -243,7 +302,7 @@ public: int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode); /** - * Returns the source index corresponding to the given destination index. + * Computes the source index corresponding to the given destination index. * If the destination index is inside a change edit (not at its start), * then the source index at the end of that edit is returned, * since there is no information about index mapping inside a change edit. @@ -268,17 +327,27 @@ public: #endif // U_HIDE_DRAFT_API /** + * Returns whether the edit currently represented by the iterator is a change edit. + * * @return TRUE if this edit replaces oldLength() units with newLength() different ones. * FALSE if oldLength units remain unchanged. * @stable ICU 59 */ UBool hasChange() const { return changed; } + /** + * The length of the current span in the source string, which starts at {@link #sourceIndex}. + * * @return the number of units in the original string which are replaced or remain unchanged. * @stable ICU 59 */ int32_t oldLength() const { return oldLength_; } + /** + * The length of the current span in the destination string, which starts at + * {@link #destinationIndex}, or in the replacement string, which starts at + * {@link #replacementIndex}. + * * @return the number of units in the modified string, if hasChange() is TRUE. * Same as oldLength if hasChange() is FALSE. 
* @stable ICU 59 @@ -286,22 +355,52 @@ public: int32_t newLength() const { return newLength_; } /** + * The start index of the current span in the source string; the span has length + * {@link #oldLength}. + * * @return the current index into the source string * @stable ICU 59 */ int32_t sourceIndex() const { return srcIndex; } + /** + * The start index of the current span in the replacement string; the span has length + * {@link #newLength}. Well-defined only if the current edit is a change edit. + * <p> + * The <em>replacement string</em> is the concatenation of all substrings of the destination + * string corresponding to change edits. + * <p> + * This method is intended to be used together with operations that write only replacement + * characters (e.g., {@link CaseMap#omitUnchangedText()}). The source string can then be modified + * in-place. + * * @return the current index into the replacement-characters-only string, * not counting unchanged spans * @stable ICU 59 */ - int32_t replacementIndex() const { return replIndex; } + int32_t replacementIndex() const { + // TODO: Throw an exception if we aren't in a change edit? + return replIndex; + } + /** + * The start index of the current span in the destination string; the span has length + * {@link #newLength}. + * * @return the current index into the full destination string * @stable ICU 59 */ int32_t destinationIndex() const { return destIndex; } +#ifndef U_HIDE_INTERNAL_API + /** + * A string representation of the current edit represented by the iterator for debugging. You + * should not depend on the contents of the return string. + * @internal + */ + UnicodeString& toString(UnicodeString& appendTo) const; +#endif // U_HIDE_INTERNAL_API + private: friend class Edits; @@ -330,8 +429,10 @@ public: }; /** - * Returns an Iterator for coarse-grained changes for simple string updates. - * Skips non-changes. + * Returns an Iterator for coarse-grained change edits + * (adjacent change edits are treated as one). 
+ * Can be used to perform simple string updates. + * Skips no-change edits. * @return an Iterator that merges adjacent changes. * @stable ICU 59 */ @@ -340,7 +441,10 @@ public: } /** - * Returns an Iterator for coarse-grained changes and non-changes for simple string updates. + * Returns an Iterator for coarse-grained change and no-change edits + * (adjacent change edits are treated as one). + * Can be used to perform simple string updates. + * Does not skip no-change edits. * @return an Iterator that merges adjacent changes. * @stable ICU 59 */ @@ -349,8 +453,10 @@ public: } /** - * Returns an Iterator for fine-grained changes for modifying styled text. - * Skips non-changes. + * Returns an Iterator for fine-grained change edits + * (full granularity of change edits is retained). + * Can be used for modifying styled text. + * Skips no-change edits. * @return an Iterator that separates adjacent changes. * @stable ICU 59 */ @@ -359,7 +465,9 @@ public: } /** - * Returns an Iterator for fine-grained changes and non-changes for modifying styled text. + * Returns an Iterator for fine-grained change and no-change edits + * (full granularity of change edits is retained). + * Can be used for modifying styled text. * @return an Iterator that separates adjacent changes. * @stable ICU 59 */ |