| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 2002-2004, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * file name: uiter.h |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2002jan18 |
| * created by: Markus W. Scherer |
| */ |
| |
| #ifndef __UITER_H__ |
| #define __UITER_H__ |
| |
| /** |
| * \file |
| * \brief C API: Unicode Character Iteration |
| * |
| * @see UCharIterator |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #ifdef XP_CPLUSPLUS |
| U_NAMESPACE_BEGIN |
| |
| class CharacterIterator; /* forward declaration; used by uiter_setCharacterIterator() */ |
| class Replaceable; /* forward declaration; used by uiter_setReplaceable() */ |
| |
| U_NAMESPACE_END |
| #endif |
| |
| U_CDECL_BEGIN |
| |
| struct UCharIterator; |
| typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator, so the type can be named without the struct keyword. @stable ICU 2.1 */ |
| |
| /** |
| * Origin constants for UCharIterator.getIndex() and UCharIterator.move(). |
| * |
| * Each constant selects one reference index of the iteration: |
| * - UITER_START: the start index |
| * - UITER_CURRENT: the current index |
| * - UITER_LIMIT: the limit index |
| * - UITER_ZERO: the index 0 |
| * - UITER_LENGTH: the length |
| * |
| * getIndex() returns the selected index; move() moves relative to it. |
| * |
| * @see UCharIteratorGetIndex |
| * @see UCharIteratorMove |
| * @see UCharIterator |
| * @stable ICU 2.1 |
| */ |
| typedef enum UCharIteratorOrigin { |
| UITER_START, UITER_CURRENT, UITER_LIMIT, UITER_ZERO, UITER_LENGTH |
| } UCharIteratorOrigin; |
| |
| /** Constants for UCharIterator. @stable ICU 2.6 */ |
| enum { |
| /** |
| * Constant value that may be returned by UCharIteratorMove |
| * indicating that the final UTF-16 index is not known, but that the move succeeded. |
| * (An error condition is reported by move() with U_SENTINEL instead.) |
| * This can occur when moving relative to limit or length, or |
| * when moving relative to the current index after a setState() |
| * when the current UTF-16 index is not known. |
| * |
| * It would be very inefficient to have to count from the beginning of the text |
| * just to get the current/limit/length index after moving relative to it. |
| * The actual index can be determined with getIndex(UITER_CURRENT) |
| * which will count the UChars if necessary. |
| * |
| * @see UCharIteratorMove |
| * @stable ICU 2.6 |
| */ |
| UITER_UNKNOWN_INDEX=-2 |
| }; |
| |
| |
| /** |
| * Constant for UCharIterator getState() indicating an error or |
| * an unknown state. |
| * Returned by uiter_getState()/UCharIteratorGetState |
| * when an error occurs. |
| * uiter_getState() also returns it when the iterator's getState |
| * function pointer is NULL. |
| * Also, some UCharIterator implementations may not be able to return |
| * a valid state for each position. This will be clearly documented |
| * for each such iterator (none of the public ones here). |
| * |
| * @see UCharIteratorGetState |
| * @see uiter_getState |
| * @stable ICU 2.6 |
| */ |
| #define UITER_NO_STATE ((uint32_t)0xffffffff) |
| |
| /** |
| * Function type declaration for UCharIterator.getIndex(). |
| * |
| * Gets the index selected by origin: 0, the current position, |
| * the start or limit of the iteration range, or the length. |
| * |
| * This function may perform slowly for UITER_CURRENT after setState() was called, |
| * or for UITER_LENGTH, because an iterator implementation may have to count |
| * UChars if the underlying storage is not UTF-16. |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @param origin get the 0, start, limit, length, or current index |
| * @return the requested index, or U_SENTINEL in an error condition |
| * |
| * @see UCharIteratorOrigin |
| * @see UCharIterator |
| * @stable ICU 2.1 |
| */ |
| typedef int32_t U_CALLCONV |
| UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin); |
| |
| /** |
| * Function type declaration for UCharIterator.move(). |
| * |
| * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index). |
| * |
| * Moves the current position relative to the index selected by origin: |
| * 0, the start or limit of the iteration range, the length, |
| * or the current position itself. |
| * The movement is expressed in numbers of code units forward |
| * or backward by specifying a positive or negative delta. |
| * Out of bounds movement will be pinned to the start or limit. |
| * |
| * This function may perform slowly for moving relative to UITER_LENGTH |
| * because an iterator implementation may have to count the rest of the |
| * UChars if the native storage is not UTF-16. |
| * |
| * When moving relative to the limit or length, or |
| * relative to the current position after setState() was called, |
| * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient |
| * determination of the actual UTF-16 index. |
| * The actual index can be determined with getIndex(UITER_CURRENT) |
| * which will count the UChars if necessary. |
| * See UITER_UNKNOWN_INDEX for details. |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @param delta number of code units to move from origin; |
| * can be positive (forward), zero, or negative (backward) |
| * @param origin move relative to the 0, start, limit, length, or current index |
| * @return the new index, or U_SENTINEL on an error condition, |
| * or UITER_UNKNOWN_INDEX when the index is not known. |
| * |
| * @see UCharIteratorOrigin |
| * @see UCharIterator |
| * @see UITER_UNKNOWN_INDEX |
| * @stable ICU 2.1 |
| */ |
| typedef int32_t U_CALLCONV |
| UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin); |
| |
| /** |
| * Function type declaration for UCharIterator.hasNext(). |
| * |
| * Check if current() and next() can still |
| * return another code unit. |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @return boolean value for whether current() and next() can still return another code unit |
| * |
| * @see UCharIterator |
| * @see UCharIteratorCurrent |
| * @see UCharIteratorNext |
| * @stable ICU 2.1 |
| */ |
| typedef UBool U_CALLCONV |
| UCharIteratorHasNext(UCharIterator *iter); |
| |
| /** |
| * Function type declaration for UCharIterator.hasPrevious(). |
| * |
| * Check if previous() can still return another code unit. |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @return boolean value for whether previous() can still return another code unit |
| * |
| * @see UCharIterator |
| * @see UCharIteratorPrevious |
| * @stable ICU 2.1 |
| */ |
| typedef UBool U_CALLCONV |
| UCharIteratorHasPrevious(UCharIterator *iter); |
| |
| /** |
| * Function type declaration for UCharIterator.current(). |
| * |
| * Return the code unit at the current position, |
| * or U_SENTINEL if there is none (index is at the limit). |
| * The current index is not changed. |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @return the current code unit, or U_SENTINEL if the index is at the limit |
| * |
| * @see UCharIterator |
| * @stable ICU 2.1 |
| */ |
| typedef UChar32 U_CALLCONV |
| UCharIteratorCurrent(UCharIterator *iter); |
| |
| /** |
| * Function type declaration for UCharIterator.next(). |
| * |
| * Return the code unit at the current index and increment |
| * the index (post-increment, like s[i++]), |
| * or return U_SENTINEL if there is none (index is at the limit). |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @return the current code unit (and post-increment the current index), |
| * or U_SENTINEL if the index is at the limit |
| * |
| * @see UCharIterator |
| * @stable ICU 2.1 |
| */ |
| typedef UChar32 U_CALLCONV |
| UCharIteratorNext(UCharIterator *iter); |
| |
| /** |
| * Function type declaration for UCharIterator.previous(). |
| * |
| * Decrement the index and return the code unit from there |
| * (pre-decrement, like s[--i]), |
| * or return U_SENTINEL if there is none (index is at the start). |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @return the previous code unit (after pre-decrementing the current index), |
| * or U_SENTINEL if the index is at the start |
| * |
| * @see UCharIterator |
| * @stable ICU 2.1 |
| */ |
| typedef UChar32 U_CALLCONV |
| UCharIteratorPrevious(UCharIterator *iter); |
| |
| /** |
| * Function type declaration for UCharIterator.reservedFn(). |
| * Reserved for future use. |
| * The UCharIterator.reservedFn pointer of this type is currently NULL, |
| * so the parameter and return value have no defined meaning yet. |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @param something some integer argument |
| * @return some integer |
| * |
| * @see UCharIterator |
| * @stable ICU 2.1 |
| */ |
| typedef int32_t U_CALLCONV |
| UCharIteratorReserved(UCharIterator *iter, int32_t something); |
| |
| /** |
| * Function type declaration for UCharIterator.getState(). |
| * |
| * Get the "state" of the iterator in the form of a single 32-bit word. |
| * It is recommended that the state value be calculated to be as small as |
| * is feasible. For strings with limited lengths, fewer than 32 bits may |
| * be sufficient. |
| * |
| * This is used together with setState()/UCharIteratorSetState |
| * to save and restore the iterator position more efficiently than with |
| * getIndex()/move(). |
| * |
| * The iterator state is defined as a uint32_t value because it is designed |
| * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state |
| * of the character iterator. |
| * |
| * With some UCharIterator implementations (e.g., UTF-8), |
| * getting and setting the UTF-16 index with existing functions |
| * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but |
| * relatively slow because the iterator has to "walk" from a known index |
| * to the requested one. |
| * This takes more time the farther it needs to go. |
| * |
| * An opaque state value allows an iterator implementation to provide |
| * an internal index (UTF-8: the source byte array index) for |
| * fast, constant-time restoration. |
| * |
| * After calling setState(), a getIndex(UITER_CURRENT) may be slow because |
| * the UTF-16 index may not be restored as well, but the iterator can deliver |
| * the correct text contents and move relative to the current position |
| * without performance degradation. |
| * |
| * Some UCharIterator implementations may not be able to return |
| * a valid state for each position, in which case they return UITER_NO_STATE instead. |
| * This will be clearly documented for each such iterator (none of the public ones here). |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @return the state word, or UITER_NO_STATE if no valid state is available |
| * |
| * @see UCharIterator |
| * @see UCharIteratorSetState |
| * @see UITER_NO_STATE |
| * @stable ICU 2.6 |
| */ |
| typedef uint32_t U_CALLCONV |
| UCharIteratorGetState(const UCharIterator *iter); |
| |
| /** |
| * Function type declaration for UCharIterator.setState(). |
| * |
| * Restore the "state" of the iterator using a state word from a getState() call. |
| * The iterator object need not be the same one as for which getState() was called, |
| * but it must be of the same type (set up using the same uiter_setXYZ function) |
| * and it must iterate over the same string |
| * (binary identical regardless of memory address). |
| * For more about the state word see UCharIteratorGetState. |
| * |
| * After calling setState(), a getIndex(UITER_CURRENT) may be slow because |
| * the UTF-16 index may not be restored as well, but the iterator can deliver |
| * the correct text contents and move relative to the current position |
| * without performance degradation. |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @param state the state word from a getState() call |
| * on a same-type, same-string iterator |
| * @param pErrorCode Must be a valid pointer to an error code value, |
| * which must not indicate a failure before the function call. |
| * |
| * @see UCharIterator |
| * @see UCharIteratorGetState |
| * @see uiter_setState |
| * @stable ICU 2.6 |
| */ |
| typedef void U_CALLCONV |
| UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode); |
| |
| |
| /** |
| * C API for code unit iteration. |
| * This can be used as a C wrapper around |
| * CharacterIterator, Replaceable, or implemented using simple strings, etc. |
| * |
| * There are two roles for using UCharIterator: |
| * |
| * A "provider" sets the necessary function pointers and controls the "protected" |
| * fields of the UCharIterator structure. A "provider" passes a UCharIterator |
| * into C APIs that need a UCharIterator as an abstract, flexible string interface. |
| * |
| * Implementations of such C APIs are "callers" of UCharIterator functions; |
| * they only use the "public" function pointers and never access the "protected" |
| * fields directly. |
| * |
| * The current() and next() functions only check the current index against the |
| * limit, and previous() only checks the current index against the start, |
| * to see if the iterator already reached the end of the iteration range. |
| * |
| * The assumption - in all iterators - is that the index is moved via the API, |
| * which means it won't go out of bounds, or the index is modified by |
| * user code that knows enough about the iterator implementation to set valid |
| * index values. |
| * |
| * UCharIterator functions return code unit values 0..0xffff, |
| * or U_SENTINEL if the iteration bounds are reached. |
| * |
| * @stable ICU 2.1 |
| */ |
| struct UCharIterator { |
| /** |
| * (protected) Pointer to string or wrapped object or similar. |
| * Not used by caller. |
| * @stable ICU 2.1 |
| */ |
| const void *context; |
| |
| /** |
| * (protected) Length of string or similar. |
| * Not used by caller. |
| * @stable ICU 2.1 |
| */ |
| int32_t length; |
| |
| /** |
| * (protected) Start index or similar. |
| * Not used by caller. |
| * @stable ICU 2.1 |
| */ |
| int32_t start; |
| |
| /** |
| * (protected) Current index or similar. |
| * Not used by caller. |
| * @stable ICU 2.1 |
| */ |
| int32_t index; |
| |
| /** |
| * (protected) Limit index or similar. |
| * Not used by caller. |
| * @stable ICU 2.1 |
| */ |
| int32_t limit; |
| |
| /** |
| * (protected) Used by UTF-8 iterators and possibly others. |
| * Not used by caller. |
| * For the UTF-8 iterator, a non-zero value holds a supplementary code point |
| * while the UTF-16 index is between its two surrogates. |
| * |
| * @see uiter_setUTF8 |
| * @stable ICU 2.1 |
| */ |
| int32_t reservedField; |
| |
| /** |
| * (public) Returns the current position or the |
| * start or limit index of the iteration range. |
| * |
| * @see UCharIteratorGetIndex |
| * @stable ICU 2.1 |
| */ |
| UCharIteratorGetIndex *getIndex; |
| |
| /** |
| * (public) Moves the current position relative to the start or limit of the |
| * iteration range, or relative to the current position itself. |
| * The movement is expressed in numbers of code units forward |
| * or backward by specifying a positive or negative delta. |
| * |
| * @see UCharIteratorMove |
| * @stable ICU 2.1 |
| */ |
| UCharIteratorMove *move; |
| |
| /** |
| * (public) Check if current() and next() can still |
| * return another code unit. |
| * |
| * @see UCharIteratorHasNext |
| * @stable ICU 2.1 |
| */ |
| UCharIteratorHasNext *hasNext; |
| |
| /** |
| * (public) Check if previous() can still return another code unit. |
| * |
| * @see UCharIteratorHasPrevious |
| * @stable ICU 2.1 |
| */ |
| UCharIteratorHasPrevious *hasPrevious; |
| |
| /** |
| * (public) Return the code unit at the current position, |
| * or U_SENTINEL if there is none (index is at the limit). |
| * |
| * @see UCharIteratorCurrent |
| * @stable ICU 2.1 |
| */ |
| UCharIteratorCurrent *current; |
| |
| /** |
| * (public) Return the code unit at the current index and increment |
| * the index (post-increment, like s[i++]), |
| * or return U_SENTINEL if there is none (index is at the limit). |
| * |
| * @see UCharIteratorNext |
| * @stable ICU 2.1 |
| */ |
| UCharIteratorNext *next; |
| |
| /** |
| * (public) Decrement the index and return the code unit from there |
| * (pre-decrement, like s[--i]), |
| * or return U_SENTINEL if there is none (index is at the start). |
| * |
| * @see UCharIteratorPrevious |
| * @stable ICU 2.1 |
| */ |
| UCharIteratorPrevious *previous; |
| |
| /** |
| * (public) Reserved for future use. Currently NULL. |
| * |
| * @see UCharIteratorReserved |
| * @stable ICU 2.1 |
| */ |
| UCharIteratorReserved *reservedFn; |
| |
| /** |
| * (public) Return the state of the iterator, to be restored later with setState(). |
| * This function pointer is NULL if the iterator does not implement it. |
| * |
| * @see UCharIteratorGetState |
| * @stable ICU 2.6 |
| */ |
| UCharIteratorGetState *getState; |
| |
| /** |
| * (public) Restore the iterator state from the state word from a call |
| * to getState(). |
| * This function pointer is NULL if the iterator does not implement it. |
| * |
| * @see UCharIteratorSetState |
| * @stable ICU 2.6 |
| */ |
| UCharIteratorSetState *setState; |
| }; |
| |
| /** |
| * Helper function for UCharIterator to get the code point |
| * at the current index. |
| * |
| * Return the code point that includes the code unit at the current position, |
| * or U_SENTINEL if there is none (index is at the limit). |
| * If the current code unit is a lead or trail surrogate, |
| * then the following or preceding surrogate is used to form |
| * the code point value. |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @return the current code point, or U_SENTINEL if the index is at the limit |
| * |
| * @see UCharIterator |
| * @see U16_GET |
| * @see UnicodeString::char32At() |
| * @stable ICU 2.1 |
| */ |
| U_STABLE UChar32 U_EXPORT2 |
| uiter_current32(UCharIterator *iter); |
| |
| /** |
| * Helper function for UCharIterator to get the next code point. |
| * |
| * Return the code point at the current index and increment |
| * the index (post-increment, like s[i++]), |
| * or return U_SENTINEL if there is none (index is at the limit). |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @return the current code point (and post-increment the current index), |
| * or U_SENTINEL if the index is at the limit |
| * |
| * @see UCharIterator |
| * @see U16_NEXT |
| * @stable ICU 2.1 |
| */ |
| U_STABLE UChar32 U_EXPORT2 |
| uiter_next32(UCharIterator *iter); |
| |
| /** |
| * Helper function for UCharIterator to get the previous code point. |
| * |
| * Decrement the index and return the code point from there |
| * (pre-decrement, like s[--i]), |
| * or return U_SENTINEL if there is none (index is at the start). |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @return the previous code point (after pre-decrementing the current index), |
| * or U_SENTINEL if the index is at the start |
| * |
| * @see UCharIterator |
| * @see U16_PREV |
| * @stable ICU 2.1 |
| */ |
| U_STABLE UChar32 U_EXPORT2 |
| uiter_previous32(UCharIterator *iter); |
| |
| /** |
| * Get the "state" of the iterator in the form of a single 32-bit word. |
| * This is a convenience function that calls iter->getState(iter) |
| * if iter->getState is not NULL; |
| * if it is NULL or any other error occurs, then UITER_NO_STATE is returned. |
| * |
| * Some UCharIterator implementations may not be able to return |
| * a valid state for each position, in which case they return UITER_NO_STATE instead. |
| * This will be clearly documented for each such iterator (none of the public ones here). |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @return the state word, or UITER_NO_STATE if iter->getState is NULL |
| * or an error occurs |
| * |
| * @see UCharIterator |
| * @see UCharIteratorGetState |
| * @see UITER_NO_STATE |
| * @see uiter_setState |
| * @stable ICU 2.6 |
| */ |
| U_STABLE uint32_t U_EXPORT2 |
| uiter_getState(const UCharIterator *iter); |
| |
| /** |
| * Restore the "state" of the iterator using a state word from a getState() call. |
| * This is a convenience function that calls iter->setState(iter, state, pErrorCode) |
| * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set. |
| * |
| * @param iter the UCharIterator structure ("this pointer") |
| * @param state the state word from a getState() call |
| * on a same-type, same-string iterator |
| * @param pErrorCode Must be a valid pointer to an error code value, |
| * which must not indicate a failure before the function call. |
| * Set to U_UNSUPPORTED_ERROR if iter->setState is NULL. |
| * |
| * @see UCharIterator |
| * @see UCharIteratorSetState |
| * @see uiter_getState |
| * @stable ICU 2.6 |
| */ |
| U_STABLE void U_EXPORT2 |
| uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode); |
| |
| /** |
| * Set up a UCharIterator to iterate over a string. |
| * |
| * Sets the UCharIterator function pointers for iteration over the string s |
| * with iteration boundaries start=index=0 and length=limit=string length. |
| * The "provider" may set the start, index, and limit values at any time |
| * within the range 0..length. |
| * The length field will be ignored. |
| * |
| * The string pointer s is set into UCharIterator.context without copying |
| * or reallocating the string contents, so the iterator refers to the |
| * caller's string. |
| * |
| * getState() simply returns the current index. |
| * move() will always return the final index. |
| * |
| * @param iter UCharIterator structure to be set for iteration |
| * @param s String to iterate over |
| * @param length Length of s, or -1 if NUL-terminated |
| * |
| * @see UCharIterator |
| * @stable ICU 2.1 |
| */ |
| U_STABLE void U_EXPORT2 |
| uiter_setString(UCharIterator *iter, const UChar *s, int32_t length); |
| |
| /** |
| * Set up a UCharIterator to iterate over a UTF-16BE string |
| * (byte vector with a big-endian pair of bytes per UChar). |
| * |
| * Everything works just like with a normal UChar iterator (uiter_setString), |
| * except that UChars are assembled from byte pairs, |
| * and that the length argument here indicates an even number of bytes. |
| * |
| * getState() simply returns the current index. |
| * move() will always return the final index. |
| * |
| * @param iter UCharIterator structure to be set for iteration |
| * @param s UTF-16BE string (byte vector) to iterate over |
| * @param length Length of s as an even number of bytes, or -1 if NUL-terminated |
| * (NUL means pair of 0 bytes at even index from s) |
| * |
| * @see UCharIterator |
| * @see uiter_setString |
| * @stable ICU 2.6 |
| */ |
| U_STABLE void U_EXPORT2 |
| uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length); |
| |
| /** |
| * Set up a UCharIterator to iterate over a UTF-8 string. |
| * |
| * Sets the UCharIterator function pointers for iteration over the UTF-8 string s |
| * with UTF-8 iteration boundaries 0 and length. |
| * The implementation counts the UTF-16 index on the fly and |
| * lazily evaluates the UTF-16 length of the text. |
| * |
| * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length. |
| * When the reservedField is not 0, then it contains a supplementary code point |
| * and the UTF-16 index is between the two corresponding surrogates. |
| * At that point, the UTF-8 index is behind that code point. |
| * |
| * The UTF-8 string pointer s is set into UCharIterator.context without copying |
| * or reallocating the string contents. |
| * |
| * getState() returns a state value consisting of |
| * - the current UTF-8 source byte index (bits 31..1) |
| * - a flag (bit 0) that indicates whether the UChar position is in the middle |
| * of a surrogate pair |
| * (from a 4-byte UTF-8 sequence for the corresponding supplementary code point) |
| * |
| * getState() cannot also encode the UTF-16 index in the state value. |
| * move(relative to limit or length), or |
| * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX. |
| * |
| * @param iter UCharIterator structure to be set for iteration |
| * @param s UTF-8 string to iterate over |
| * @param length Length of s in bytes, or -1 if NUL-terminated |
| * |
| * @see UCharIterator |
| * @see UITER_UNKNOWN_INDEX |
| * @stable ICU 2.6 |
| */ |
| U_STABLE void U_EXPORT2 |
| uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length); |
| |
| #ifdef XP_CPLUSPLUS |
| |
| /** |
| * Set up a UCharIterator to wrap around a C++ CharacterIterator. |
| * This function is only declared when XP_CPLUSPLUS is defined. |
| * |
| * Sets the UCharIterator function pointers for iteration using the |
| * CharacterIterator charIter. |
| * |
| * The CharacterIterator pointer charIter is set into UCharIterator.context |
| * without copying or cloning the CharacterIterator object. |
| * The other "protected" UCharIterator fields are set to 0 and will be ignored. |
| * The iteration index and boundaries are controlled by the CharacterIterator. |
| * |
| * getState() simply returns the current index. |
| * move() will always return the final index. |
| * |
| * @param iter UCharIterator structure to be set for iteration |
| * @param charIter CharacterIterator to wrap |
| * |
| * @see UCharIterator |
| * @stable ICU 2.1 |
| */ |
| U_STABLE void U_EXPORT2 |
| uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter); |
| |
| /** |
| * Set up a UCharIterator to iterate over a C++ Replaceable. |
| * This function is only declared when XP_CPLUSPLUS is defined. |
| * |
| * Sets the UCharIterator function pointers for iteration over the |
| * Replaceable rep with iteration boundaries start=index=0 and |
| * length=limit=rep->length(). |
| * The "provider" may set the start, index, and limit values at any time |
| * within the range 0..length=rep->length(). |
| * The length field will be ignored. |
| * |
| * The Replaceable pointer rep is set into UCharIterator.context without copying |
| * or cloning/reallocating the Replaceable object. |
| * |
| * getState() simply returns the current index. |
| * move() will always return the final index. |
| * |
| * @param iter UCharIterator structure to be set for iteration |
| * @param rep Replaceable to iterate over |
| * |
| * @see UCharIterator |
| * @stable ICU 2.1 |
| */ |
| U_STABLE void U_EXPORT2 |
| uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep); |
| |
| #endif |
| |
| U_CDECL_END |
| |
| #endif |