| // Copyright 2006 Google Inc. |
| // Authors: Sanjay Ghemawat, Jeff Dean, Chandra Chereddi, Lincoln Smith |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| // Implementation of the Bentley/McIlroy algorithm for finding differences. |
| // Bentley, McIlroy. DCC 1999. Data Compression Using Long Common Strings. |
| // http://citeseer.ist.psu.edu/555557.html |
| |
| #ifndef OPEN_VCDIFF_BLOCKHASH_H_ |
| #define OPEN_VCDIFF_BLOCKHASH_H_ |
| |
| #include <config.h> |
| #include <stddef.h> // size_t |
| #include <stdint.h> // uint32_t |
| #include <vector> |
| |
| namespace open_vcdiff { |
| |
| // A generic hash table which will be used to keep track of byte runs |
| // of size kBlockSize in both the incrementally processed target data |
| // and the preprocessed source dictionary. |
| // |
| // A custom hash table implementation is used instead of the standard |
| // hash_map template because we know that there will be exactly one |
| // entry in the BlockHash corresponding to each kBlockSize bytes |
| // in the source data, which makes certain optimizations possible: |
| // * The memory for the hash table and for all hash entries can be allocated |
| // in one step rather than incrementally for each insert operation. |
| // * A single integer can be used to represent both |
| // the index of the next hash entry in the chain |
| // and the position of the entry within the source data |
| // (== kBlockSize * block_number). This greatly reduces the size |
| // of a hash entry. |
| // |
| class BlockHash { |
| public: |
| // Block size as per Bentley/McIlroy; must be a power of two. |
| // |
| // Using (for example) kBlockSize = 4 guarantees that no match smaller |
| // than size 4 will be identified, that some matches having sizes |
| // 4, 5, or 6 may be identified, and that all matches |
| // having size 7 or greater will be identified (because any string of |
| // 7 bytes must contain a complete aligned block of 4 bytes.) |
| // |
| // Increasing kBlockSize by a factor of two will halve the amount of |
| // memory needed for the next block table, and will halve the setup time |
| // for a new BlockHash. However, it also doubles the minimum |
| // match length that is guaranteed to be found in FindBestMatch(), |
| // so that function will be less effective in finding matches. |
| // |
| // Computational effort in FindBestMatch (which is the inner loop of |
| // the encoding algorithm) will be proportional to the number of |
| // matches found, and a low value of kBlockSize will waste time |
| // tracking down small matches. On the other hand, if this value |
| // is set too high, no matches will be found at all. |
| // |
| // It is suggested that different values of kBlockSize be tried against |
| // a representative data set to find the best tradeoff between |
| // memory/CPU and the effectiveness of FindBestMatch(). |
| // |
| // If you change kBlockSize to a smaller value, please increase |
| // kMaxMatchesToCheck accordingly. |
| static const int kBlockSize = 16; |
| |
| // This class is used to store the best match found by FindBestMatch() |
| // and return it to the caller. |
| class Match { |
| public: |
| Match() : size_(0), source_offset_(-1), target_offset_(-1) { } |
| |
| void ReplaceIfBetterMatch(size_t candidate_size, |
| int candidate_source_offset, |
| int candidate_target_offset) { |
| if (candidate_size > size_) { |
| size_ = candidate_size; |
| source_offset_ = candidate_source_offset; |
| target_offset_ = candidate_target_offset; |
| } |
| } |
| |
| size_t size() const { return size_; } |
| int source_offset() const { return source_offset_; } |
| int target_offset() const { return target_offset_; } |
| |
| private: |
| // The size of the best (longest) match passed to ReplaceIfBetterMatch(). |
| size_t size_; |
| |
| // The source offset of the match, including the starting_offset_ |
| // of the BlockHash for which the match was found. |
| int source_offset_; |
| |
| // The target offset of the match. An offset of 0 corresponds to the |
| // data at target_start, which is an argument of FindBestMatch(). |
| int target_offset_; |
| |
| // Making these private avoids implicit copy constructor |
| // & assignment operator |
| Match(const Match&); // NOLINT |
| void operator=(const Match&); |
| }; |
| |
| // A BlockHash is created using a buffer of source data. The hash table |
| // will contain one entry for each kBlockSize-byte block in the |
| // source data. |
| // |
| // See the comments for starting_offset_, below, for a description of |
| // the starting_offset argument. For a hash of source (dictionary) data, |
| // starting_offset_ will be zero; for a hash of previously encoded |
| // target data, starting_offset_ will be equal to the dictionary size. |
| // |
| BlockHash(const char* source_data, size_t source_size, int starting_offset); |
| |
| ~BlockHash(); |
| |
| // Initializes the object before use. |
| // This method must be called after constructing a BlockHash object, |
| // and before any other method may be called. This is because |
| // Init() dynamically allocates hash_table_ and next_block_table_. |
| // Returns true if initialization succeeded, or false if an error occurred, |
| // in which case no other method except the destructor may then be used |
| // on the object. |
| // |
| // If populate_hash_table is true, then AddAllBlocks() will be called |
| // to populate the hash table. If populate_hash_table is false, then |
| // classes that inherit from BlockHash are expected to call AddBlock() |
| // to incrementally populate individual blocks of data. |
| // |
| bool Init(bool populate_hash_table); |
| |
| // In the context of the open-vcdiff encoder, BlockHash is used for two |
| // purposes: to hash the source (dictionary) data, and to hash |
| // the previously encoded target data. The main differences between |
| // a dictionary BlockHash and a target BlockHash are as follows: |
| // |
| // 1. The best_match->source_offset() returned from FindBestMatch() |
| // for a target BlockHash is computed in the following manner: |
| // the starting offset of the first byte in the target data |
| // is equal to the dictionary size. FindBestMatch() will add |
| // starting_offset_ to any best_match->source_offset() value it returns, |
| // in order to produce the correct offset value for a target BlockHash. |
| // 2. For a dictionary BlockHash, the entire data set is hashed at once |
| // when Init() is called with the parameter populate_hash_table = true. |
| // For a target BlockHash, because the previously encoded target data |
| // includes only the data seen up to the current encoding position, |
| // the data blocks are hashed incrementally as the encoding position |
| // advances, using AddOneIndexHash() and AddAllBlocksThroughIndex(). |
| // |
| // The following two factory functions can be used to create BlockHash |
| // objects for each of these two purposes. Each factory function calls |
| // the object constructor and also calls Init(). If an error occurs, |
| // NULL is returned; otherwise a valid BlockHash object is returned. |
| // Since a dictionary BlockHash is not expected to be modified after |
| // initialization, a const object is returned. |
| // The caller is responsible for deleting the returned object |
| // (using the C++ delete operator) once it is no longer needed. |
| static const BlockHash* CreateDictionaryHash(const char* dictionary_data, |
| size_t dictionary_size); |
| static BlockHash* CreateTargetHash(const char* target_data, |
| size_t target_size, |
| size_t dictionary_size); |
| |
| // This function will be called to add blocks incrementally to the target hash |
| // as the encoding position advances through the target data. It will be |
| // called for every kBlockSize-byte block in the target data, regardless |
| // of whether the block is aligned evenly on a block boundary. The |
| // BlockHash will only store hash entries for the evenly-aligned blocks. |
| // |
| void AddOneIndexHash(int index, uint32_t hash_value) { |
| if (index == NextIndexToAdd()) { |
| AddBlock(hash_value); |
| } |
| } |
| |
| // Calls AddBlock() for each kBlockSize-byte block in the range |
| // (last_block_added_ * kBlockSize, end_index), exclusive of the endpoints. |
| // If end_index <= the last index added (last_block_added_ * kBlockSize), |
| // this function does nothing. |
| // |
| // A partial block beginning anywhere up to (end_index - 1) is also added, |
| // unless it extends outside the end of the source data. Like AddAllBlocks(), |
| // this function computes the hash value for each of the blocks in question |
| // from scratch, so it is not a good option if the hash values have already |
| // been computed for some other purpose. |
| // |
| // Example: assume kBlockSize = 4, last_block_added_ = 1, and there are |
| // 14 bytes of source data. |
| // If AddAllBlocksThroughIndex(9) is invoked, then it will call AddBlock() |
| // only for block number 2 (at index 8). |
| // If, after that, AddAllBlocksThroughIndex(14) is invoked, it will not call |
| // AddBlock() at all, because block 3 (beginning at index 12) would |
| // fall outside the range of source data. |
| // |
| // VCDiffEngine::Encode (in vcdiffengine.cc) uses this function to |
| // add a whole range of data to a target hash when a COPY instruction |
| // is generated. |
| void AddAllBlocksThroughIndex(int end_index); |
| |
| // FindBestMatch takes a position within the unencoded target data |
| // (target_candidate_start) and the hash value of the kBlockSize bytes |
| // beginning at that position (hash_value). It attempts to find a matching |
| // set of bytes within the source (== dictionary) data, expanding |
| // the match both below and above the target block. It cannot expand |
| // the match outside the bounds of the source data, or below |
| // target_start within the target data, or past |
| // the end limit of (target_start + target_length). |
| // |
| // target_candidate_start is the start of the candidate block within the |
| // target data for which a match will be sought, while |
| // target_start (which is <= target_candidate_start) |
| // is the start of the target data that has yet to be encoded. |
| // |
| // If a match is found whose size is greater than the size |
| // of best_match, this function populates *best_match with the |
| // size, source_offset, and target_offset of the match found. |
| // best_match->source_offset() will contain the index of the start of the |
| // matching source data, plus starting_offset_ |
| // (see description of starting_offset_ for details); |
| // best_match->target_offset() will contain the offset of the match |
| // beginning with target_start = offset 0, such that |
| // 0 <= best_match->target_offset() |
| // <= (target_candidate_start - target_start); |
| // and best_match->size() will contain the size of the match. |
| // If no such match is found, this function leaves *best_match unmodified. |
| // |
| // On calling FindBestMatch(), best_match must |
| // point to a valid Match object, and cannot be NULL. |
| // The same Match object can be passed |
| // when calling FindBestMatch() on a different BlockHash object |
| // for the same candidate data block, in order to find |
| // the best match possible across both objects. For example: |
| // |
| // open_vcdiff::BlockHash::Match best_match; |
| // uint32_t hash_value = |
| // RollingHash<BlockHash::kBlockSize>::Hash(target_candidate_start); |
| // bh1.FindBestMatch(hash_value, |
| // target_candidate_start, |
| // target_start, |
| // target_length, |
| // &best_match); |
| // bh2.FindBestMatch(hash_value, |
| // target_candidate_start, |
| // target_start, |
| // target_length, |
| // &best_match); |
| // if (best_size >= 0) { |
| // // a match was found; its size, source offset, and target offset |
| // // can be found in best_match |
| // } |
| // |
| // hash_value is passed as a separate parameter from target_candidate_start, |
| // (rather than calculated within FindBestMatch) in order to take |
| // advantage of the rolling hash, which quickly calculates the hash value |
| // of the block starting at target_candidate_start based on |
| // the known hash value of the block starting at (target_candidate_start - 1). |
| // See vcdiffengine.cc for more details. |
| // |
| // Example: |
| // kBlockSize: 4 |
| // target text: "ANDREW LLOYD WEBBER" |
| // 1^ 5^2^ 3^ |
| // dictionary: "INSURANCE : LLOYDS OF LONDON" |
| // 4^ |
| // hashed dictionary blocks: |
| // "INSU", "RANC", "E : ", "LLOY", "DS O", "F LON" |
| // |
| // 1: target_start (beginning of unencoded data) |
| // 2: target_candidate_start (for the block "LLOY") |
| // 3: target_length (points one byte beyond the last byte of data.) |
| // 4: best_match->source_offset() (after calling FindBestMatch) |
| // 5: best_match->target_offset() (after calling FindBestMatch) |
| // |
| // Under these conditions, FindBestMatch will find a matching |
| // hashed dictionary block for "LLOY", and will extend the beginning of |
| // this match backwards by one byte, and the end of the match forwards |
| // by one byte, finding that the best match is " LLOYD" |
| // with best_match->source_offset() = 10 |
| // (offset of " LLOYD" in the source string), |
| // best_match->target_offset() = 6 |
| // (offset of " LLOYD" in the target string), |
| // and best_match->size() = 6. |
| // |
| void FindBestMatch(uint32_t hash_value, |
| const char* target_candidate_start, |
| const char* target_start, |
| size_t target_size, |
| Match* best_match) const; |
| |
| protected: |
| // FindBestMatch() will not process more than this number |
| // of matching hash entries. |
| // |
| // It is necessary to have a limit on the maximum number of matches |
| // that will be checked in order to avoid the worst-case performance |
| // possible if, for example, all the blocks in the dictionary have |
| // the same hash value. See the unit test SearchStringFindsTooManyMatches |
| // for an example of such a case. The encoder uses a loop in |
| // VCDiffEngine::Encode over each target byte, containing a loop in |
| // BlockHash::FindBestMatch over the number of matches (up to a maximum |
| // of the number of source blocks), containing two loops that extend |
| // the match forwards and backwards up to the number of source bytes. |
| // Total complexity in the worst case is |
| // O([target size] * source_size_ * source_size_) |
| // Placing a limit on the possible number of matches checked changes this to |
| // O([target size] * source_size_ * kMaxMatchesToCheck) |
| // |
| // In empirical testing on real HTML text, using a block size of 4, |
| // the number of true matches per call to FindBestMatch() did not exceed 78; |
| // with a block size of 32, the number of matches did not exceed 3. |
| // |
| // The expected number of true matches scales super-linearly |
| // with the inverse of kBlockSize, but here a linear scale is used |
| // for block sizes smaller than 32. |
| static const int kMaxMatchesToCheck = (kBlockSize >= 32) ? 32 : |
| (32 * (32 / kBlockSize)); |
| |
| // Do not skip more than this number of non-matching hash collisions |
| // to find the next matching entry in the hash chain. |
| static const int kMaxProbes = 16; |
| |
| // Internal routine which calculates a hash table size based on kBlockSize and |
| // the dictionary_size. Will return a power of two if successful, or 0 if an |
| // internal error occurs. Some calculations (such as GetHashTableIndex()) |
| // depend on the table size being a power of two. |
| static size_t CalcTableSize(const size_t dictionary_size); |
| |
| size_t GetNumberOfBlocks() const { |
| return source_size_ / kBlockSize; |
| } |
| |
| // Use the lowest-order bits of the hash value |
| // as the index into the hash table. |
| uint32_t GetHashTableIndex(uint32_t hash_value) const { |
| return hash_value & hash_table_mask_; |
| } |
| |
| // The index within source_data_ of the next block |
| // for which AddBlock() should be called. |
| int NextIndexToAdd() const { |
| return (last_block_added_ + 1) * kBlockSize; |
| } |
| |
| static inline bool TooManyMatches(int* match_counter); |
| |
| const char* source_data() { return source_data_; } |
| size_t source_size() { return source_size_; } |
| |
| // Adds an entry to the hash table for one block of source data of length |
| // kBlockSize, starting at source_data_[block_number * kBlockSize], |
| // where block_number is always (last_block_added_ + 1). That is, |
| // AddBlock() must be called once for each block in source_data_ |
| // in increasing order. |
| void AddBlock(uint32_t hash_value); |
| |
| // Calls AddBlock() for each complete kBlockSize-byte block between |
| // source_data_ and (source_data_ + source_size_). It is equivalent |
| // to calling AddAllBlocksThroughIndex(source_data + source_size). |
| // This function is called when Init(true) is invoked. |
| void AddAllBlocks(); |
| |
| // Returns true if the contents of the kBlockSize-byte block |
| // beginning at block1 are identical to the contents of |
| // the block beginning at block2; false otherwise. |
| static bool BlockContentsMatch(const char* block1, const char* block2); |
| |
| // Compares each machine word of the two (possibly unaligned) blocks, rather |
| // than each byte, thus reducing the number of test-and-branch instructions |
| // executed. Returns a boolean (do the blocks match?) rather than |
| // the signed byte difference returned by memcmp. |
| // |
| // BlockContentsMatch will use either this function or memcmp to do its work, |
| // depending on which is faster for a particular architecture. |
| // |
| // For gcc on x86-based architectures, this function has been shown to run |
| // about twice as fast as the library function memcmp(), and between five and |
| // nine times faster than the assembly instructions (repz and cmpsb) that gcc |
| // uses by default for builtin memcmp. On other architectures, or using |
| // other compilers, this function has not shown to be faster than memcmp. |
| static bool BlockCompareWords(const char* block1, const char* block2); |
| |
| // Finds the first block number within the hashed data |
| // that represents a match for the given hash value. |
| // Returns -1 if no match was found. |
| // |
| // Init() must have been called and returned true before using |
| // FirstMatchingBlock or NextMatchingBlock. No check is performed |
| // for this condition; the code will crash if this condition is violated. |
| // |
| // The hash table is initially populated with -1 (not found) values, |
| // so if this function is called before the hash table has been populated |
| // using AddAllBlocks() or AddBlock(), it will simply return -1 |
| // for any value of hash_value. |
| int FirstMatchingBlock(uint32_t hash_value, const char* block_ptr) const; |
| |
| // Given a block number returned by FirstMatchingBlock() |
| // or by a previous call to NextMatchingBlock(), returns |
| // the next block number that matches the same hash value. |
| // Returns -1 if no match was found. |
| int NextMatchingBlock(int block_number, const char* block_ptr) const; |
| |
| // Inline version of FirstMatchingBlock. This saves the cost of a function |
| // call when this routine is called from within the module. The external |
| // (non-inlined) version is called only by unit tests. |
| inline int FirstMatchingBlockInline(uint32_t hash_value, |
| const char* block_ptr) const; |
| |
| // Walk through the hash entry chain, skipping over any false matches |
| // (for which the lowest bits of the fingerprints match, |
| // but the actual block data does not.) Returns the block number of |
| // the first true match found, or -1 if no true match was found. |
| // If block_number is a matching block, the function will return block_number |
| // without skipping to the next block. |
| int SkipNonMatchingBlocks(int block_number, const char* block_ptr) const; |
| |
| // Returns the number of bytes to the left of source_match_start |
| // that match the corresponding bytes to the left of target_match_start. |
| // Will not examine more than max_bytes bytes, which is to say that |
| // the return value will be in the range [0, max_bytes] inclusive. |
| static int MatchingBytesToLeft(const char* source_match_start, |
| const char* target_match_start, |
| int max_bytes); |
| |
| // Returns the number of bytes starting at source_match_end |
| // that match the corresponding bytes starting at target_match_end. |
| // Will not examine more than max_bytes bytes, which is to say that |
| // the return value will be in the range [0, max_bytes] inclusive. |
| static int MatchingBytesToRight(const char* source_match_end, |
| const char* target_match_end, |
| int max_bytes); |
| |
| // The protected functions BlockContentsMatch, FirstMatchingBlock, |
| // NextMatchingBlock, MatchingBytesToLeft, and MatchingBytesToRight |
| // should be made accessible to unit tests. |
| friend class BlockHashTest; |
| |
| private: |
| const char* const source_data_; |
| const size_t source_size_; |
| |
| // The size of this array is determined using CalcTableSize(). It has at |
| // least one element for each kBlockSize-byte block in the source data. |
| // GetHashTableIndex() returns an index into this table for a given hash |
| // value. The value of each element of hash_table_ is the lowest block |
| // number in the source data whose hash value would return the same value from |
| // GetHashTableIndex(), or -1 if there is no matching block. This value can |
| // then be used as an index into next_block_table_ to retrieve the entire set |
| // of matching block numbers. |
| std::vector<int> hash_table_; |
| |
| // An array containing one element for each source block. Each element is |
| // either -1 (== not found) or the index of the next block whose hash value |
| // would produce a matching result from GetHashTableIndex(). |
| std::vector<int> next_block_table_; |
| |
| // This vector has the same size as next_block_table_. For every block number |
| // B that is referenced in hash_table_, last_block_table_[B] will contain |
| // the maximum block number that has the same GetHashTableIndex() value |
| // as block B. This number may be B itself. For a block number B' that |
| // is not referenced in hash_table_, the value of last_block_table_[B'] is -1. |
| // This table is used only while populating the hash table, not while looking |
| // up hash values in the table. Keeping track of the last block number in the |
| // chain allows us to construct the block chains as FIFO rather than LIFO |
| // lists, so that the match with the lowest index is returned first. This |
| // should result in a more compact encoding because the VCDIFF format favors |
| // smaller index values and repeated index values. |
| std::vector<int> last_block_table_; |
| |
| // Performing a bitwise AND with hash_table_mask_ will produce a value ranging |
| // from 0 to the number of elements in hash_table_. |
| uint32_t hash_table_mask_; |
| |
| // The offset of the first byte of source data (the data at source_data_[0]). |
| // For the purpose of computing offsets, the source data and target data |
| // are considered to be concatenated -- not literally in a single memory |
| // buffer, but conceptually as described in the RFC. |
| // The first byte of the previously encoded target data |
| // has an offset that is equal to dictionary_size, i.e., just after |
| // the last byte of source data. |
| // For a hash of source (dictionary) data, starting_offset_ will be zero; |
| // for a hash of previously encoded target data, starting_offset_ will be |
| // equal to the dictionary size. |
| const int starting_offset_; |
| |
| // The last index added by AddBlock(). This determines the block number |
| // for successive calls to AddBlock(), and is also |
| // used to determine the starting block for AddAllBlocksThroughIndex(). |
| int last_block_added_; |
| |
| // Making these private avoids implicit copy constructor & assignment operator |
| BlockHash(const BlockHash&); // NOLINT |
| void operator=(const BlockHash&); |
| }; |
| |
| } // namespace open_vcdiff |
| |
| #endif // OPEN_VCDIFF_BLOCKHASH_H_ |