// blob: 64cdb86fe20a22c053918782f968f94368bc6e5c [file] [log] [blame]
// Copyright 2007 Google Inc.
// Author: Lincoln Smith
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef OPEN_VCDIFF_VCENCODER_H_
#define OPEN_VCDIFF_VCENCODER_H_
#include <stddef.h> // size_t
#include <vector>
#include "google/output_string.h"
namespace open_vcdiff {
class VCDiffEngine;
class VCDiffStreamingEncoderImpl;
// Bit flags accepted by the VCDiffStreamingEncoder constructor.  Each flag
// switches on an open-vcdiff extension to the delta format; none of these
// extensions belongs to the RFC 3284 draft standard for VCDIFF.
//
// Passing anything other than VCD_STANDARD_FORMAT is therefore only safe when
// the caller is certain that the receiver decodes the delta file with
// open-vcdiff, or can at least interpret these extensions.  When extensions
// are in use, the encoder writes an 'S' as the fourth character of the delta
// file to signal that the output is non-standard.
//
enum VCDiffFormatExtensionFlagValues {
  // Plain output conforming to the RFC draft standard for VCDIFF;
  // no extensions are enabled.
  VCD_STANDARD_FORMAT = 0,

  // Interleaved windows: within each delta file window, instructions and
  // sizes are written interleaved with their corresponding addresses and
  // data instead of being placed in three separate sections.  A receiver can
  // then produce partially decoded results from only a portion of a window
  // (e.g. when HTTP over TCP is used as the transmission protocol.)
  VCD_FORMAT_INTERLEAVED = 1 << 0,

  // Checksummed windows: an Adler32 checksum of the target window data
  // is included in the delta window.
  VCD_FORMAT_CHECKSUM = 1 << 1
};

// Integer type used to pass VCDiffFormatExtensionFlagValues flags around;
// it is the type of the format_extensions constructor argument.
typedef int VCDiffFormatExtensionFlags;
// The form of the dictionary data that VCDiffStreamingEncoder requires: a
// HashedDictionary has to be built from the dictionary before any encoding
// can take place.
//
// When one dictionary serves several encoding operations, build the
// HashedDictionary a single time and cache it for reuse.  The object is
// thread-safe: several threads may use the same const HashedDictionary
// simultaneously, provided each thread has its own VCDiffStreamingEncoder.
//
// The constructor copies dictionary_contents into the HashedDictionary, so
// the caller is free to release that string once the constructor returns.
//
class HashedDictionary {
 public:
  HashedDictionary(const char* dictionary_contents,
                   size_t dictionary_size);

  ~HashedDictionary();

  // Must be called before the HashedDictionary is handed to a
  // VCDiffStreamingEncoder, or used for anything else except destruction.
  // Returns true if initialization succeeded.  Returns false if an error
  // occurred; in that case the caller should destroy the object without
  // using it.
  bool Init();

  // Returns the stored VCDiffEngine pointer.
  const VCDiffEngine* engine() const { return engine_; }

 private:
  // Copy construction and assignment are declared private
  // so that they cannot be used inadvertently.
  HashedDictionary(const HashedDictionary&);  // NOLINT
  void operator=(const HashedDictionary&);

  const VCDiffEngine* engine_;
};
// The standard streaming interface to the VCDIFF (RFC 3284) encoder.
// "Streaming" in this context means that, even though the entire set of
// input data to be encoded may not be available at once, the encoder
// can produce partial output based on what is available. Of course,
// the caller should try to maximize the sizes of the data chunks passed
// to the encoder.
class VCDiffStreamingEncoder {
public:
// The HashedDictionary object passed to the constructor must remain valid,
// without being deleted, for the lifetime of the VCDiffStreamingEncoder
// object.
//
// format_extensions allows certain open-vcdiff extensions to the VCDIFF
// format to be included in the encoded output. These extensions are not
// part of the RFC 3284 draft standard, so specifying any extension flags
// will make the output compatible only with open-vcdiff, or with other
// VCDIFF implementations that accept these extensions. See above for an
// explanation of each possible flag value.
//
// *** look_for_target_matches:
// The VCDIFF format allows COPY instruction addresses to reference data from
// the source (dictionary), or from previously encoded target data.
//
// If look_for_target_matches is false, then the encoder will only
// produce COPY instructions that reference source data from the dictionary,
// never from previously encoded target data. This will speed up the encoding
// process, but the encoded data will not be as compact.
//
// If this value is true, then the encoder will produce COPY instructions
// that reference either source data or target data. A COPY instruction from
// the previously encoded target data may even extend into the range of the
// data being produced by that same COPY instruction; for example, if the
// previously encoded target data is "LA", then a single COPY instruction of
// length 10 can produce the additional target data "LALALALALA".
//
// There is a third type of COPY instruction that starts within
// the source data and extends from the end of the source data
// into the beginning of the target data. This VCDIFF encoder will never
// produce a COPY instruction of this third type (regardless of the value of
// look_for_target_matches) because the cost of checking for matches
// across the source-target boundary would not justify its benefits.
//
VCDiffStreamingEncoder(const HashedDictionary* dictionary,
VCDiffFormatExtensionFlags format_extensions,
bool look_for_target_matches);
~VCDiffStreamingEncoder();
// The client should use these routines as follows:
// HashedDictionary hd(dictionary, dictionary_size);
// if (!hd.Init()) {
// HandleError();
// return;
// }
// string output_string;
// VCDiffStreamingEncoder v(hd, false, false);
// if (!v.StartEncoding(&output_string)) {
// HandleError();
// return; // No need to call FinishEncoding()
// }
// Process(output_string.data(), output_string.size());
// output_string.clear();
// while (get data_buf) {
// if (!v.EncodeChunk(data_buf, data_len, &output_string)) {
// HandleError();
// return; // No need to call FinishEncoding()
// }
// // The encoding is appended to output_string at each call,
// // so clear output_string once its contents have been processed.
// Process(output_string.data(), output_string.size());
// output_string.clear();
// }
// if (!v.FinishEncoding(&output_string)) {
// HandleError();
// return;
// }
// Process(output_string.data(), output_string.size());
// output_string.clear();
//
// I.e., the allowed pattern of calls is
// StartEncoding EncodeChunk* FinishEncoding
//
// The size of the encoded output depends on the sizes of the chunks
// passed in (i.e. the chunking boundary affects compression).
// However the decoded output is independent of chunk boundaries.
// Sets up the data structures for encoding.
// Writes a VCDIFF delta file header (as defined in RFC section 4.1)
// to *output_string.
//
// Note: we *append*, so the old contents of *output_string stick around.
// This convention differs from the non-streaming Encode/Decode
// interfaces in VCDiffEncoder.
//
// If an error occurs, this function returns false; otherwise it returns true.
// If this function returns false, the caller does not need to call
// FinishEncoding or to do any cleanup except destroying the
// VCDiffStreamingEncoder object.
template<class OutputType>
bool StartEncoding(OutputType* output) {
OutputString<OutputType> output_string(output);
return StartEncodingToInterface(&output_string);
}
bool StartEncodingToInterface(OutputStringInterface* output_string);
// Appends compressed encoding for "data" (one complete VCDIFF delta window)
// to *output_string.
// If an error occurs (for example, if StartEncoding was not called
// earlier or StartEncoding returned false), this function returns false;
// otherwise it returns true. The caller does not need to call FinishEncoding
// or do any cleanup except destroying the VCDiffStreamingEncoder
// if this function returns false.
template<class OutputType>
bool EncodeChunk(const char* data, size_t len, OutputType* output) {
OutputString<OutputType> output_string(output);
return EncodeChunkToInterface(data, len, &output_string);
}
bool EncodeChunkToInterface(const char* data, size_t len,
OutputStringInterface* output_string);
// Finishes encoding and appends any leftover encoded data to *output_string.
// If an error occurs (for example, if StartEncoding was not called
// earlier or StartEncoding returned false), this function returns false;
// otherwise it returns true. The caller does not need to
// do any cleanup except destroying the VCDiffStreamingEncoder
// if this function returns false.
template<class OutputType>
bool FinishEncoding(OutputType* output) {
OutputString<OutputType> output_string(output);
return FinishEncodingToInterface(&output_string);
}
bool FinishEncodingToInterface(OutputStringInterface* output_string);
// Replaces the contents of match_counts with a vector of integers,
// one for each possible match length. The value of match_counts[n]
// is equal to the number of matches of length n found so far
// for this VCDiffStreamingEncoder object.
void GetMatchCounts(std::vector<int>* match_counts) const;
private:
VCDiffStreamingEncoderImpl* const impl_;
// Make the copy constructor and assignment operator private
// so that they don't inadvertently get used.
VCDiffStreamingEncoder(const VCDiffStreamingEncoder&); // NOLINT
void operator=(const VCDiffStreamingEncoder&);
};
// A simpler (non-streaming) interface to the VCDIFF encoder that can be used
// if the entire target data string is available.
//
class VCDiffEncoder {
public:
VCDiffEncoder(const char* dictionary_contents, size_t dictionary_size)
: dictionary_(dictionary_contents, dictionary_size),
encoder_(NULL),
flags_(VCD_STANDARD_FORMAT),
look_for_target_matches_(true) { }
~VCDiffEncoder() {
delete encoder_;
}
// By default, VCDiffEncoder uses standard VCDIFF format. This function
// can be used before calling Encode(), to specify that interleaved format
// and/or checksum format should be used.
void SetFormatFlags(VCDiffFormatExtensionFlags flags) { flags_ = flags; }
// By default, VCDiffEncoder looks for matches in the dictionary and also in
// the previously encoded target data. This function can be used before
// calling Encode(), to specify whether or not target matching should be
// enabled.
void SetTargetMatching(bool look_for_target_matches) {
look_for_target_matches_ = look_for_target_matches;
}
// Replaces old contents of output_string with the encoded form of
// target_data.
template<class OutputType>
bool Encode(const char* target_data,
size_t target_len,
OutputType* output) {
OutputString<OutputType> output_string(output);
return EncodeToInterface(target_data, target_len, &output_string);
}
private:
bool EncodeToInterface(const char* target_data,
size_t target_len,
OutputStringInterface* output_string);
HashedDictionary dictionary_;
VCDiffStreamingEncoder* encoder_;
VCDiffFormatExtensionFlags flags_;
bool look_for_target_matches_;
// Make the copy constructor and assignment operator private
// so that they don't inadvertently get used.
VCDiffEncoder(const VCDiffEncoder&); // NOLINT
void operator=(const VCDiffEncoder&);
};
} // namespace open_vcdiff
#endif // OPEN_VCDIFF_VCENCODER_H_